xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision 8a06b3d6467c15646e663c05086378f16288af85)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/errno.h>
31 #include <sys/dlpi.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/zone.h>
40 #include <sys/ethernet.h>
41 #include <sys/sdt.h>
42 #include <sys/mac.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/sctp_ip.h>
65 #include <inet/ip_arp.h>
66 #include <inet/ip2mac_impl.h>
67 
/*
 * Select the IPv6 (NDP unsolicit) or IPv4 (ARP publish) interval tunable.
 * NOTE: this macro expands to a reference to a variable named `ipst', which
 * must be in scope wherever the macro is used.  The argument is
 * parenthesized so that compound expressions select the intended arm.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Select the IPv6 (NDP) or IPv4 (ARP) defend interval tunable.  Like
 * ANNOUNCE_INTERVAL, this relies on an in-scope `ipst'.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
78 
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at `ptr' are 169 and 254.  The argument is
 * parenthesized before the cast so that pointer expressions such as `p + 1'
 * are evaluated in the caller's pointer type before being viewed as bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)
85 
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags bits that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK		\
	(NCE_F_MYADDR |			\
	NCE_F_ISROUTER |		\
	NCE_F_NONUD |			\
	NCE_F_ANYCAST |			\
	NCE_F_UNSOL_ADV |		\
	NCE_F_BCAST |			\
	NCE_F_MCAST |			\
	NCE_F_AUTHORITY |		\
	NCE_F_PUBLISH |			\
	NCE_F_STATIC)
98 
99 /*
100  * Lock ordering:
101  *
102  *	ndp_g_lock -> ill_lock -> ncec_lock
103  *
104  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
106  * ncec_refcnt).
107  */
108 
109 static	void	nce_cleanup_list(ncec_t *ncec);
110 static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
111 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
112     ncec_t *);
113 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
114 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
115     uint16_t ncec_flags, nce_t **newnce);
116 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
117     uint16_t ncec_flags, nce_t **newnce);
118 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
119     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
120     const in6_addr_t *target, int flag);
121 static void	ncec_refhold_locked(ncec_t *);
122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
123 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
124 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
125     uint16_t, uint16_t, nce_t **);
126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
127 static nce_t *nce_add(ill_t *, ncec_t *);
128 static void nce_inactive(nce_t *);
129 extern nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
131 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
132     uint16_t, uint16_t, nce_t **);
133 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
134     uint16_t, uint16_t, nce_t **);
135 static int  nce_add_v6_postprocess(nce_t *);
136 static int  nce_add_v4_postprocess(nce_t *);
137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
139 static void nce_resolv_ipmp_ok(ncec_t *);
140 static void nce_walk_common(ill_t *, pfi_t, void *);
141 static void nce_start_timer(ncec_t *, uint_t);
142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
143 static void nce_fastpath_trigger(nce_t *);
144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
145 
146 #ifdef DEBUG
147 static void	ncec_trace_cleanup(const ncec_t *);
148 #endif
149 
/*
 * Address of the IPv4 / IPv6 hash bucket head that `addr' hashes to in the
 * given stack's nce_hash_tbl.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&((ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
156 
157 extern kmem_cache_t *ncec_cache;
158 extern kmem_cache_t *nce_cache;
159 
160 /*
161  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162  * If src_ill is not null, the ncec_addr is bound to src_ill. The
163  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165  * IPMP cast_ill (in the IPMP case).
166  *
167  * Note that the probe interval is based on the src_ill for IPv6, and
168  * the ncec_xmit_interval for IPv4.
169  */
170 static void
171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
172 {
173 	boolean_t dropped;
174 	uint32_t probe_interval;
175 
176 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 	if (ncec->ncec_ipversion == IPV6_VERSION) {
179 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 	} else {
184 		/* IPv4 DAD delay the initial probe. */
185 		if (send_probe)
186 			dropped = arp_probe(ncec);
187 		else
188 			dropped = B_TRUE;
189 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 		    !send_probe);
191 	}
192 	if (!dropped) {
193 		mutex_enter(&ncec->ncec_lock);
194 		ncec->ncec_pcnt--;
195 		mutex_exit(&ncec->ncec_lock);
196 	}
197 	nce_restart_timer(ncec, probe_interval);
198 }
199 
200 /*
201  * Compute default flags to use for an advertisement of this ncec's address.
202  */
203 static int
204 nce_advert_flags(const ncec_t *ncec)
205 {
206 	int flag = 0;
207 
208 	if (ncec->ncec_flags & NCE_F_ISROUTER)
209 		flag |= NDP_ISROUTER;
210 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 		flag |= NDP_ORIDE;
212 
213 	return (flag);
214 }
215 
216 /*
217  * NDP Cache Entry creation routine.
218  * This routine must always be called with ndp6->ndp_g_lock held.
219  */
220 int
221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
223 {
224 	int		err;
225 	nce_t		*nce;
226 
227 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 	ASSERT(ill != NULL && ill->ill_isv6);
229 
230 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 	    &nce);
232 	if (err != 0)
233 		return (err);
234 	ASSERT(newnce != NULL);
235 	*newnce = nce;
236 	return (err);
237 }
238 
239 /*
240  * Post-processing routine to be executed after nce_add_v6(). This function
241  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242  * and must be called without any locks held.
243  */
244 int
245 nce_add_v6_postprocess(nce_t *nce)
246 {
247 	ncec_t		*ncec = nce->nce_common;
248 	boolean_t	dropped = B_FALSE;
249 	uchar_t		*hw_addr = ncec->ncec_lladdr;
250 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
251 	ill_t		*ill = ncec->ncec_ill;
252 	int		err = 0;
253 	uint16_t	flags = ncec->ncec_flags;
254 	ip_stack_t	*ipst = ill->ill_ipst;
255 	boolean_t	trigger_fastpath = B_TRUE;
256 
257 	/*
258 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 	 * We call nce_fastpath from nce_update if the link layer address of
261 	 * the peer changes from nce_update
262 	 */
263 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 		trigger_fastpath = B_FALSE;
266 
267 	if (trigger_fastpath)
268 		nce_fastpath_trigger(nce);
269 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 		ill_t *hwaddr_ill;
271 		/*
272 		 * Unicast entry that needs DAD.
273 		 */
274 		if (IS_IPMP(ill)) {
275 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 			    hw_addr, hw_addr_len);
277 		} else {
278 			hwaddr_ill = ill;
279 		}
280 		nce_dad(ncec, hwaddr_ill, B_TRUE);
281 		err = EINPROGRESS;
282 	} else if (flags & NCE_F_UNSOL_ADV) {
283 		/*
284 		 * We account for the transmit below by assigning one
285 		 * less than the ndd variable. Subsequent decrements
286 		 * are done in nce_timer.
287 		 */
288 		mutex_enter(&ncec->ncec_lock);
289 		ncec->ncec_unsolicit_count =
290 		    ipst->ips_ip_ndp_unsolicit_count - 1;
291 		mutex_exit(&ncec->ncec_lock);
292 		dropped = ndp_xmit(ill,
293 		    ND_NEIGHBOR_ADVERT,
294 		    hw_addr,
295 		    hw_addr_len,
296 		    &ncec->ncec_addr,	/* Source and target of the adv */
297 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
298 		    nce_advert_flags(ncec));
299 		mutex_enter(&ncec->ncec_lock);
300 		if (dropped)
301 			ncec->ncec_unsolicit_count++;
302 		else
303 			ncec->ncec_last_time_defended = ddi_get_lbolt();
304 		if (ncec->ncec_unsolicit_count != 0) {
305 			nce_start_timer(ncec,
306 			    ipst->ips_ip_ndp_unsolicit_interval);
307 		}
308 		mutex_exit(&ncec->ncec_lock);
309 	}
310 	return (err);
311 }
312 
313 /*
314  * Atomically lookup and add (if needed) Neighbor Cache information for
315  * an address.
316  *
317  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
318  * are always added pointing at the ipmp_ill. Thus, when the ill passed
319  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
320  * entries will be created, both pointing at the same ncec_t. The nce_t
321  * entries will have their nce_ill set to the ipmp_ill and the under_ill
322  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
323  * Local addresses are always created on the ill passed to nce_add_v6.
324  */
325 int
326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
327     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
328 {
329 	int		err = 0;
330 	ip_stack_t	*ipst = ill->ill_ipst;
331 	nce_t		*nce, *upper_nce = NULL;
332 	ill_t		*in_ill = ill;
333 	boolean_t	need_ill_refrele = B_FALSE;
334 
335 	if (flags & NCE_F_MCAST) {
336 		/*
337 		 * hw_addr will be figured out in nce_set_multicast_v6;
338 		 * caller has to select the cast_ill
339 		 */
340 		ASSERT(hw_addr == NULL);
341 		ASSERT(!IS_IPMP(ill));
342 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
343 		return (err);
344 	}
345 	ASSERT(ill->ill_isv6);
346 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
347 		ill = ipmp_ill_hold_ipmp_ill(ill);
348 		if (ill == NULL)
349 			return (ENXIO);
350 		need_ill_refrele = B_TRUE;
351 	}
352 
353 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
354 	nce = nce_lookup_addr(ill, addr);
355 	if (nce == NULL) {
356 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
357 		    &nce);
358 	} else {
359 		err = EEXIST;
360 	}
361 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
362 	if (err == 0)
363 		err = nce_add_v6_postprocess(nce);
364 	if (in_ill != ill && nce != NULL) {
365 		nce_t *under_nce = NULL;
366 
367 		/*
368 		 * in_ill was the under_ill. Try to create the under_nce.
369 		 * Hold the ill_g_lock to prevent changes to group membership
370 		 * until we are done.
371 		 */
372 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
373 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
374 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
375 			    ill_t *, ill);
376 			rw_exit(&ipst->ips_ill_g_lock);
377 			err = ENXIO;
378 			nce_refrele(nce);
379 			nce = NULL;
380 			goto bail;
381 		}
382 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
383 		if (under_nce == NULL) {
384 			rw_exit(&ipst->ips_ill_g_lock);
385 			err = EINVAL;
386 			nce_refrele(nce);
387 			nce = NULL;
388 			goto bail;
389 		}
390 		rw_exit(&ipst->ips_ill_g_lock);
391 		upper_nce = nce;
392 		nce = under_nce; /* will be returned to caller */
393 		if (NCE_ISREACHABLE(nce->nce_common))
394 			nce_fastpath_trigger(under_nce);
395 	}
396 	/* nce_refrele is deferred until the lock is dropped  */
397 	if (nce != NULL) {
398 		if (newnce != NULL)
399 			*newnce = nce;
400 		else
401 			nce_refrele(nce);
402 	}
403 bail:
404 	if (upper_nce != NULL)
405 		nce_refrele(upper_nce);
406 	if (need_ill_refrele)
407 		ill_refrele(ill);
408 	return (err);
409 }
410 
411 /*
412  * Remove all the CONDEMNED nces from the appropriate hash table.
413  * We create a private list of NCEs, these may have ires pointing
414  * to them, so the list will be passed through to clean up dependent
415  * ires and only then we can do ncec_refrele() which can make NCE inactive.
416  */
417 static void
418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
419 {
420 	ncec_t *ncec1;
421 	ncec_t **ptpn;
422 
423 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 	ASSERT(ndp->ndp_g_walker == 0);
425 	for (; ncec; ncec = ncec1) {
426 		ncec1 = ncec->ncec_next;
427 		mutex_enter(&ncec->ncec_lock);
428 		if (NCE_ISCONDEMNED(ncec)) {
429 			ptpn = ncec->ncec_ptpn;
430 			ncec1 = ncec->ncec_next;
431 			if (ncec1 != NULL)
432 				ncec1->ncec_ptpn = ptpn;
433 			*ptpn = ncec1;
434 			ncec->ncec_ptpn = NULL;
435 			ncec->ncec_next = NULL;
436 			ncec->ncec_next = *free_nce_list;
437 			*free_nce_list = ncec;
438 		}
439 		mutex_exit(&ncec->ncec_lock);
440 	}
441 }
442 
443 /*
444  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
445  *    will return this NCE. Also no new timeouts will
446  *    be started (See nce_restart_timer).
447  * 2. Cancel any currently running timeouts.
448  * 3. If there is an ndp walker, return. The walker will do the cleanup.
449  *    This ensures that walkers see a consistent list of NCEs while walking.
450  * 4. Otherwise remove the NCE from the list of NCEs
451  */
452 void
453 ncec_delete(ncec_t *ncec)
454 {
455 	ncec_t	**ptpn;
456 	ncec_t	*ncec1;
457 	int	ipversion = ncec->ncec_ipversion;
458 	ndp_g_t *ndp;
459 	ip_stack_t	*ipst = ncec->ncec_ipst;
460 
461 	if (ipversion == IPV4_VERSION)
462 		ndp = ipst->ips_ndp4;
463 	else
464 		ndp = ipst->ips_ndp6;
465 
466 	/* Serialize deletes */
467 	mutex_enter(&ncec->ncec_lock);
468 	if (NCE_ISCONDEMNED(ncec)) {
469 		/* Some other thread is doing the delete */
470 		mutex_exit(&ncec->ncec_lock);
471 		return;
472 	}
473 	/*
474 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
475 	 * refcnt has to be >= 2
476 	 */
477 	ASSERT(ncec->ncec_refcnt >= 2);
478 	ncec->ncec_flags |= NCE_F_CONDEMNED;
479 	mutex_exit(&ncec->ncec_lock);
480 
481 	/* Count how many condemned ires for kmem_cache callback */
482 	atomic_inc_32(&ipst->ips_num_nce_condemned);
483 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
484 
485 	/* Complete any waiting callbacks */
486 	ncec_cb_dispatch(ncec);
487 
488 	/*
489 	 * Cancel any running timer. Timeout can't be restarted
490 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
491 	 * Passing invalid timeout id is fine.
492 	 */
493 	if (ncec->ncec_timeout_id != 0) {
494 		(void) untimeout(ncec->ncec_timeout_id);
495 		ncec->ncec_timeout_id = 0;
496 	}
497 
498 	mutex_enter(&ndp->ndp_g_lock);
499 	if (ncec->ncec_ptpn == NULL) {
500 		/*
501 		 * The last ndp walker has already removed this ncec from
502 		 * the list after we marked the ncec CONDEMNED and before
503 		 * we grabbed the global lock.
504 		 */
505 		mutex_exit(&ndp->ndp_g_lock);
506 		return;
507 	}
508 	if (ndp->ndp_g_walker > 0) {
509 		/*
510 		 * Can't unlink. The walker will clean up
511 		 */
512 		ndp->ndp_g_walker_cleanup = B_TRUE;
513 		mutex_exit(&ndp->ndp_g_lock);
514 		return;
515 	}
516 
517 	/*
518 	 * Now remove the ncec from the list. nce_restart_timer won't restart
519 	 * the timer since it is marked CONDEMNED.
520 	 */
521 	ptpn = ncec->ncec_ptpn;
522 	ncec1 = ncec->ncec_next;
523 	if (ncec1 != NULL)
524 		ncec1->ncec_ptpn = ptpn;
525 	*ptpn = ncec1;
526 	ncec->ncec_ptpn = NULL;
527 	ncec->ncec_next = NULL;
528 	mutex_exit(&ndp->ndp_g_lock);
529 
530 	/* Removed from ncec_ptpn/ncec_next list */
531 	ncec_refrele_notr(ncec);
532 }
533 
534 void
535 ncec_inactive(ncec_t *ncec)
536 {
537 	mblk_t		**mpp;
538 	ill_t		*ill = ncec->ncec_ill;
539 	ip_stack_t	*ipst = ncec->ncec_ipst;
540 
541 	ASSERT(ncec->ncec_refcnt == 0);
542 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
543 
544 	/* Count how many condemned nces for kmem_cache callback */
545 	if (NCE_ISCONDEMNED(ncec))
546 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
547 
548 	/* Free all allocated messages */
549 	mpp = &ncec->ncec_qd_mp;
550 	while (*mpp != NULL) {
551 		mblk_t  *mp;
552 
553 		mp = *mpp;
554 		*mpp = mp->b_next;
555 
556 		inet_freemsg(mp);
557 	}
558 	/*
559 	 * must have been cleaned up in ncec_delete
560 	 */
561 	ASSERT(list_is_empty(&ncec->ncec_cb));
562 	list_destroy(&ncec->ncec_cb);
563 	/*
564 	 * free the ncec_lladdr if one was allocated in nce_add_common()
565 	 */
566 	if (ncec->ncec_lladdr_length > 0)
567 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
568 
569 #ifdef DEBUG
570 	ncec_trace_cleanup(ncec);
571 #endif
572 
573 	mutex_enter(&ill->ill_lock);
574 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 	    (char *), "ncec", (void *), ncec);
576 	ill->ill_ncec_cnt--;
577 	ncec->ncec_ill = NULL;
578 	/*
579 	 * If the number of ncec's associated with this ill have dropped
580 	 * to zero, check whether we need to restart any operation that
581 	 * is waiting for this to happen.
582 	 */
583 	if (ILL_DOWN_OK(ill)) {
584 		/* ipif_ill_refrele_tail drops the ill_lock */
585 		ipif_ill_refrele_tail(ill);
586 	} else {
587 		mutex_exit(&ill->ill_lock);
588 	}
589 
590 	mutex_destroy(&ncec->ncec_lock);
591 	kmem_cache_free(ncec_cache, ncec);
592 }
593 
594 /*
595  * ncec_walk routine.  Delete the ncec if it is associated with the ill
596  * that is going away.  Always called as a writer.
597  */
598 void
599 ncec_delete_per_ill(ncec_t *ncec, void *arg)
600 {
601 	if ((ncec != NULL) && ncec->ncec_ill == arg) {
602 		ncec_delete(ncec);
603 	}
604 }
605 
606 /*
607  * Neighbor Cache cleanup logic for a list of ncec_t entries.
608  */
609 static void
610 nce_cleanup_list(ncec_t *ncec)
611 {
612 	ncec_t *ncec_next;
613 
614 	ASSERT(ncec != NULL);
615 	while (ncec != NULL) {
616 		ncec_next = ncec->ncec_next;
617 		ncec->ncec_next = NULL;
618 
619 		/*
620 		 * It is possible for the last ndp walker (this thread)
621 		 * to come here after ncec_delete has marked the ncec CONDEMNED
622 		 * and before it has removed the ncec from the fastpath list
623 		 * or called untimeout. So we need to do it here. It is safe
624 		 * for both ncec_delete and this thread to do it twice or
625 		 * even simultaneously since each of the threads has a
626 		 * reference on the ncec.
627 		 */
628 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
629 		/*
630 		 * Cancel any running timer. Timeout can't be restarted
631 		 * since CONDEMNED is set. The ncec_lock can't be
632 		 * held across untimeout though passing invalid timeout
633 		 * id is fine.
634 		 */
635 		if (ncec->ncec_timeout_id != 0) {
636 			(void) untimeout(ncec->ncec_timeout_id);
637 			ncec->ncec_timeout_id = 0;
638 		}
639 		/* Removed from ncec_ptpn/ncec_next list */
640 		ncec_refrele_notr(ncec);
641 		ncec = ncec_next;
642 	}
643 }
644 
645 /*
646  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
647  */
648 boolean_t
649 nce_restart_dad(ncec_t *ncec)
650 {
651 	boolean_t started;
652 	ill_t *ill, *hwaddr_ill;
653 
654 	if (ncec == NULL)
655 		return (B_FALSE);
656 	ill = ncec->ncec_ill;
657 	mutex_enter(&ncec->ncec_lock);
658 	if (ncec->ncec_state == ND_PROBE) {
659 		mutex_exit(&ncec->ncec_lock);
660 		started = B_TRUE;
661 	} else if (ncec->ncec_state == ND_REACHABLE) {
662 		ASSERT(ncec->ncec_lladdr != NULL);
663 		ncec->ncec_state = ND_PROBE;
664 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
665 		/*
666 		 * Slight cheat here: we don't use the initial probe delay
667 		 * for IPv4 in this obscure case.
668 		 */
669 		mutex_exit(&ncec->ncec_lock);
670 		if (IS_IPMP(ill)) {
671 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 		} else {
674 			hwaddr_ill = ill;
675 		}
676 		nce_dad(ncec, hwaddr_ill, B_TRUE);
677 		started = B_TRUE;
678 	} else {
679 		mutex_exit(&ncec->ncec_lock);
680 		started = B_FALSE;
681 	}
682 	return (started);
683 }
684 
685 /*
686  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
687  * If one is found, the refcnt on the ncec will be incremented.
688  */
689 ncec_t *
690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
691 {
692 	ncec_t		*ncec;
693 	ip_stack_t	*ipst = ill->ill_ipst;
694 
695 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
697 
698 	/* Get head of v6 hash table */
699 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 	rw_exit(&ipst->ips_ill_g_lock);
703 	return (ncec);
704 }
705 /*
706  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
707  * If one is found, the refcnt on the ncec will be incremented.
708  */
709 ncec_t *
710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
711 {
712 	ncec_t	*ncec = NULL;
713 	in6_addr_t addr6;
714 	ip_stack_t *ipst = ill->ill_ipst;
715 
716 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
718 
719 	/* Get head of v4 hash table */
720 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 	rw_exit(&ipst->ips_ill_g_lock);
725 	return (ncec);
726 }
727 
728 /*
729  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
730  * If an ncec is found, increment the hold count on that ncec.
731  * The caller passes in the start of the appropriate hash table, and must
732  * be holding the appropriate global lock (ndp_g_lock). In addition, since
733  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734  * must be held as reader.
735  *
736  * This function always matches across the ipmp group.
737  */
738 ncec_t *
739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
740 {
741 	ndp_g_t		*ndp;
742 	ip_stack_t	*ipst = ill->ill_ipst;
743 
744 	if (ill->ill_isv6)
745 		ndp = ipst->ips_ndp6;
746 	else
747 		ndp = ipst->ips_ndp4;
748 
749 	ASSERT(ill != NULL);
750 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 		return (NULL);
753 	for (; ncec != NULL; ncec = ncec->ncec_next) {
754 		if (ncec->ncec_ill == ill ||
755 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 				mutex_enter(&ncec->ncec_lock);
758 				if (!NCE_ISCONDEMNED(ncec)) {
759 					ncec_refhold_locked(ncec);
760 					mutex_exit(&ncec->ncec_lock);
761 					break;
762 				}
763 				mutex_exit(&ncec->ncec_lock);
764 			}
765 		}
766 	}
767 	return (ncec);
768 }
769 
770 /*
771  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772  * entries for ill only, i.e., when ill is part of an ipmp group,
773  * nce_lookup_v4 will never try to match across the group.
774  */
775 nce_t *
776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
777 {
778 	nce_t *nce;
779 	in6_addr_t addr6;
780 	ip_stack_t *ipst = ill->ill_ipst;
781 
782 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 	nce = nce_lookup_addr(ill, &addr6);
785 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 	return (nce);
787 }
788 
789 /*
790  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791  * entries for ill only, i.e., when ill is part of an ipmp group,
792  * nce_lookup_v6 will never try to match across the group.
793  */
794 nce_t *
795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
796 {
797 	nce_t *nce;
798 	ip_stack_t *ipst = ill->ill_ipst;
799 
800 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 	nce = nce_lookup_addr(ill, addr6);
802 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 	return (nce);
804 }
805 
806 static nce_t *
807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
808 {
809 	nce_t *nce;
810 
811 	ASSERT(ill != NULL);
812 #ifdef DEBUG
813 	if (ill->ill_isv6)
814 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 	else
816 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 #endif
818 	mutex_enter(&ill->ill_lock);
819 	nce = nce_lookup(ill, addr);
820 	mutex_exit(&ill->ill_lock);
821 	return (nce);
822 }
823 
824 
825 /*
826  * Router turned to host.  We need to make sure that cached copies of the ncec
827  * are not used for forwarding packets if they were derived from the default
828  * route, and that the default route itself is removed, as  required by
829  * section 7.2.5 of RFC 2461.
830  *
831  * Note that the ncec itself probably has valid link-layer information for the
832  * nexthop, so that there is no reason to delete the ncec, as long as the
833  * ISROUTER flag is turned off.
834  */
835 static void
836 ncec_router_to_host(ncec_t *ncec)
837 {
838 	ire_t		*ire;
839 	ip_stack_t	*ipst = ncec->ncec_ipst;
840 
841 	mutex_enter(&ncec->ncec_lock);
842 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 	mutex_exit(&ncec->ncec_lock);
844 
845 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
847 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 	if (ire != NULL) {
849 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 		ire_delete(ire);
851 		ire_refrele(ire);
852 	}
853 }
854 
855 /*
856  * Process passed in parameters either from an incoming packet or via
857  * user ioctl.
858  */
859 void
860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
861 {
862 	ill_t	*ill = ncec->ncec_ill;
863 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 	boolean_t ll_updated = B_FALSE;
865 	boolean_t ll_changed;
866 	nce_t	*nce;
867 
868 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
869 	/*
870 	 * No updates of link layer address or the neighbor state is
871 	 * allowed, when the cache is in NONUD state.  This still
872 	 * allows for responding to reachability solicitation.
873 	 */
874 	mutex_enter(&ncec->ncec_lock);
875 	if (ncec->ncec_state == ND_INCOMPLETE) {
876 		if (hw_addr == NULL) {
877 			mutex_exit(&ncec->ncec_lock);
878 			return;
879 		}
880 		nce_set_ll(ncec, hw_addr);
881 		/*
882 		 * Update ncec state and send the queued packets
883 		 * back to ip this time ire will be added.
884 		 */
885 		if (flag & ND_NA_FLAG_SOLICITED) {
886 			nce_update(ncec, ND_REACHABLE, NULL);
887 		} else {
888 			nce_update(ncec, ND_STALE, NULL);
889 		}
890 		mutex_exit(&ncec->ncec_lock);
891 		nce = nce_fastpath(ncec, B_TRUE, NULL);
892 		nce_resolv_ok(ncec);
893 		if (nce != NULL)
894 			nce_refrele(nce);
895 		return;
896 	}
897 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 	if (!is_adv) {
899 		/* If this is a SOLICITATION request only */
900 		if (ll_changed)
901 			nce_update(ncec, ND_STALE, hw_addr);
902 		mutex_exit(&ncec->ncec_lock);
903 		ncec_cb_dispatch(ncec);
904 		return;
905 	}
906 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 		/* If in any other state than REACHABLE, ignore */
908 		if (ncec->ncec_state == ND_REACHABLE) {
909 			nce_update(ncec, ND_STALE, NULL);
910 		}
911 		mutex_exit(&ncec->ncec_lock);
912 		ncec_cb_dispatch(ncec);
913 		return;
914 	} else {
915 		if (ll_changed) {
916 			nce_update(ncec, ND_UNCHANGED, hw_addr);
917 			ll_updated = B_TRUE;
918 		}
919 		if (flag & ND_NA_FLAG_SOLICITED) {
920 			nce_update(ncec, ND_REACHABLE, NULL);
921 		} else {
922 			if (ll_updated) {
923 				nce_update(ncec, ND_STALE, NULL);
924 			}
925 		}
926 		mutex_exit(&ncec->ncec_lock);
927 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 		    NCE_F_ISROUTER)) {
929 			ncec_router_to_host(ncec);
930 		} else {
931 			ncec_cb_dispatch(ncec);
932 		}
933 	}
934 }
935 
936 /*
937  * Pass arg1 to the cbf supplied, along with each ncec in existence.
938  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
939  * walking the hash list.
940  */
941 void
942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
943     void *arg1, boolean_t trace)
944 {
945 	ncec_t	*ncec;
946 	ncec_t	*ncec1;
947 	ncec_t	**ncep;
948 	ncec_t	*free_nce_list = NULL;
949 
950 	mutex_enter(&ndp->ndp_g_lock);
951 	/* Prevent ncec_delete from unlink and free of NCE */
952 	ndp->ndp_g_walker++;
953 	mutex_exit(&ndp->ndp_g_lock);
954 	for (ncep = ndp->nce_hash_tbl;
955 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
956 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
957 			ncec1 = ncec->ncec_next;
958 			if (ill == NULL || ncec->ncec_ill == ill) {
959 				if (trace) {
960 					ncec_refhold(ncec);
961 					(*cbf)(ncec, arg1);
962 					ncec_refrele(ncec);
963 				} else {
964 					ncec_refhold_notr(ncec);
965 					(*cbf)(ncec, arg1);
966 					ncec_refrele_notr(ncec);
967 				}
968 			}
969 		}
970 	}
971 	mutex_enter(&ndp->ndp_g_lock);
972 	ndp->ndp_g_walker--;
973 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 		/* Time to delete condemned entries */
975 		for (ncep = ndp->nce_hash_tbl;
976 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 			ncec = *ncep;
978 			if (ncec != NULL) {
979 				nce_remove(ndp, ncec, &free_nce_list);
980 			}
981 		}
982 		ndp->ndp_g_walker_cleanup = B_FALSE;
983 	}
984 
985 	mutex_exit(&ndp->ndp_g_lock);
986 
987 	if (free_nce_list != NULL) {
988 		nce_cleanup_list(free_nce_list);
989 	}
990 }
991 
992 /*
993  * Walk everything.
994  * Note that ill can be NULL hence can't derive the ipst from it.
995  */
996 void
997 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
998 {
999 	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1000 	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1001 }
1002 
1003 /*
1004  * For each interface an entry is added for the unspecified multicast group.
1005  * Here that mapping is used to form the multicast cache entry for a particular
1006  * multicast destination.
1007  */
1008 static int
1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010     uint16_t flags, nce_t **newnce)
1011 {
1012 	uchar_t		*hw_addr;
1013 	int		err = 0;
1014 	ip_stack_t	*ipst = ill->ill_ipst;
1015 	nce_t		*nce;
1016 
1017 	ASSERT(ill != NULL);
1018 	ASSERT(ill->ill_isv6);
1019 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020 
1021 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 	nce = nce_lookup_addr(ill, dst);
1023 	if (nce != NULL) {
1024 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 		goto done;
1026 	}
1027 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028 		/*
1029 		 * For IRE_IF_RESOLVER a hardware mapping can be
1030 		 * generated.
1031 		 */
1032 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 		if (hw_addr == NULL) {
1034 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 			return (ENOMEM);
1036 		}
1037 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 	} else {
1039 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 		hw_addr = NULL;
1041 	}
1042 	ASSERT((flags & NCE_F_MCAST) != 0);
1043 	ASSERT((flags & NCE_F_NONUD) != 0);
1044 	/* nce_state will be computed by nce_add_common() */
1045 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 	    ND_UNCHANGED, &nce);
1047 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 	if (err == 0)
1049 		err = nce_add_v6_postprocess(nce);
1050 	if (hw_addr != NULL)
1051 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 	if (err != 0) {
1053 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 		return (err);
1055 	}
1056 done:
1057 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 	if (newnce != NULL)
1059 		*newnce = nce;
1060 	else
1061 		nce_refrele(nce);
1062 	return (0);
1063 }
1064 
1065 /*
1066  * Return the link layer address, and any flags of a ncec.
1067  */
1068 int
1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 {
1071 	ncec_t		*ncec;
1072 	in6_addr_t	*addr;
1073 	sin6_t		*sin6;
1074 
1075 	ASSERT(ill != NULL && ill->ill_isv6);
1076 	sin6 = (sin6_t *)&lnr->lnr_addr;
1077 	addr =  &sin6->sin6_addr;
1078 
1079 	/*
1080 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1082 	 * addresses for the data addresses on an IPMP interface even though
1083 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084 	 */
1085 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 	if (ncec == NULL)
1087 		return (ESRCH);
1088 	/* If no link layer address is available yet, return ESRCH */
1089 	if (!NCE_ISREACHABLE(ncec)) {
1090 		ncec_refrele(ncec);
1091 		return (ESRCH);
1092 	}
1093 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 	    lnr->lnr_hdw_len);
1096 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 		lnr->lnr_flags = NDF_ISROUTER_ON;
1098 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 	if (ncec->ncec_flags & NCE_F_STATIC)
1101 		lnr->lnr_flags |= NDF_STATIC;
1102 	ncec_refrele(ncec);
1103 	return (0);
1104 }
1105 
1106 /*
1107  * Finish setting up the Enable/Disable multicast for the driver.
1108  */
1109 mblk_t *
1110 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1111     uint32_t hw_addr_offset, mblk_t *mp)
1112 {
1113 	uchar_t		*hw_addr;
1114 	ipaddr_t	v4group;
1115 	uchar_t		*addr;
1116 
1117 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1118 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1119 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1120 
1121 		ASSERT(CLASSD(v4group));
1122 		ASSERT(!(ill->ill_isv6));
1123 
1124 		addr = (uchar_t *)&v4group;
1125 	} else {
1126 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1127 		ASSERT(ill->ill_isv6);
1128 
1129 		addr = (uchar_t *)v6group;
1130 	}
1131 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1132 	if (hw_addr == NULL) {
1133 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1134 		freemsg(mp);
1135 		return (NULL);
1136 	}
1137 
1138 	ip_mcast_mapping(ill, addr, hw_addr);
1139 	return (mp);
1140 }
1141 
1142 void
1143 ip_ndp_resolve(ncec_t *ncec)
1144 {
1145 	in_addr_t	sender4 = INADDR_ANY;
1146 	in6_addr_t	sender6 = ipv6_all_zeros;
1147 	ill_t		*src_ill;
1148 	uint32_t	ms;
1149 
1150 	src_ill = nce_resolve_src(ncec, &sender6);
1151 	if (src_ill == NULL) {
1152 		/* Make sure we try again later */
1153 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1154 		nce_restart_timer(ncec, (clock_t)ms);
1155 		return;
1156 	}
1157 	if (ncec->ncec_ipversion == IPV4_VERSION)
1158 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1159 	mutex_enter(&ncec->ncec_lock);
1160 	if (ncec->ncec_ipversion == IPV6_VERSION)
1161 		ms = ndp_solicit(ncec, sender6, src_ill);
1162 	else
1163 		ms = arp_request(ncec, sender4, src_ill);
1164 	mutex_exit(&ncec->ncec_lock);
1165 	if (ms == 0) {
1166 		if (ncec->ncec_state != ND_REACHABLE) {
1167 			if (ncec->ncec_ipversion == IPV6_VERSION)
1168 				ndp_resolv_failed(ncec);
1169 			else
1170 				arp_resolv_failed(ncec);
1171 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1172 			nce_make_unreachable(ncec);
1173 			ncec_delete(ncec);
1174 		}
1175 	} else {
1176 		nce_restart_timer(ncec, (clock_t)ms);
1177 	}
1178 done:
1179 	ill_refrele(src_ill);
1180 }
1181 
1182 /*
1183  * Send an IPv6 neighbor solicitation.
1184  * Returns number of milliseconds after which we should either rexmit or abort.
1185  * Return of zero means we should abort.
1186  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1187  * The optional source address is used as a hint to ndp_solicit for
1188  * which source to use in the packet.
1189  *
1190  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1191  * the packet.
1192  */
1193 uint32_t
1194 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1195 {
1196 	in6_addr_t	dst;
1197 	boolean_t	dropped = B_FALSE;
1198 
1199 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1200 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1201 
1202 	if (ncec->ncec_rcnt == 0)
1203 		return (0);
1204 
1205 	dst = ncec->ncec_addr;
1206 	ncec->ncec_rcnt--;
1207 	mutex_exit(&ncec->ncec_lock);
1208 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1209 	    ill->ill_phys_addr_length, &src, &dst, 0);
1210 	mutex_enter(&ncec->ncec_lock);
1211 	if (dropped)
1212 		ncec->ncec_rcnt++;
1213 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1214 }
1215 
1216 /*
1217  * Attempt to recover an address on an interface that's been marked as a
1218  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1219  * no easy way to just probe the address and have the right thing happen if
1220  * it's no longer in use.  Instead, we just bring it up normally and allow the
1221  * regular interface start-up logic to probe for a remaining duplicate and take
1222  * us back down if necessary.
1223  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1224  * ip_ndp_excl.
1225  */
1226 /* ARGSUSED */
1227 void
1228 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1229 {
1230 	ill_t	*ill = rq->q_ptr;
1231 	ipif_t	*ipif;
1232 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1233 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1234 	boolean_t addr_equal;
1235 
1236 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1237 		/*
1238 		 * We do not support recovery of proxy ARP'd interfaces,
1239 		 * because the system lacks a complete proxy ARP mechanism.
1240 		 */
1241 		if (ill->ill_isv6) {
1242 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1243 			    addr6);
1244 		} else {
1245 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1246 		}
1247 
1248 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1249 			continue;
1250 
1251 		/*
1252 		 * If we have already recovered or if the interface is going
1253 		 * away, then ignore.
1254 		 */
1255 		mutex_enter(&ill->ill_lock);
1256 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1257 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1258 			mutex_exit(&ill->ill_lock);
1259 			continue;
1260 		}
1261 
1262 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1263 		ill->ill_ipif_dup_count--;
1264 		mutex_exit(&ill->ill_lock);
1265 		ipif->ipif_was_dup = B_TRUE;
1266 
1267 		if (ill->ill_isv6) {
1268 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1269 			(void) ipif_up_done_v6(ipif);
1270 		} else {
1271 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1272 			    EINPROGRESS);
1273 			(void) ipif_up_done(ipif);
1274 		}
1275 	}
1276 	freeb(mp);
1277 }
1278 
1279 /*
1280  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1281  * As long as someone else holds the address, the interface will stay down.
1282  * When that conflict goes away, the interface is brought back up.  This is
1283  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1284  * server will recover from a failure.
1285  *
1286  * For DHCP and temporary addresses, recovery is not done in the kernel.
1287  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1288  *
1289  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1290  */
1291 void
1292 ipif_dup_recovery(void *arg)
1293 {
1294 	ipif_t *ipif = arg;
1295 
1296 	ipif->ipif_recovery_id = 0;
1297 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1298 		return;
1299 
1300 	/*
1301 	 * No lock, because this is just an optimization.
1302 	 */
1303 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1304 		return;
1305 
1306 	/* If the link is down, we'll retry this later */
1307 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1308 		return;
1309 
1310 	ipif_do_recovery(ipif);
1311 }
1312 
1313 /*
1314  * Perform interface recovery by forcing the duplicate interfaces up and
1315  * allowing the system to determine which ones should stay up.
1316  *
1317  * Called both by recovery timer expiry and link-up notification.
1318  */
1319 void
1320 ipif_do_recovery(ipif_t *ipif)
1321 {
1322 	ill_t *ill = ipif->ipif_ill;
1323 	mblk_t *mp;
1324 	ip_stack_t *ipst = ill->ill_ipst;
1325 	size_t mp_size;
1326 
1327 	if (ipif->ipif_isv6)
1328 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1329 	else
1330 		mp_size = sizeof (ipif->ipif_lcl_addr);
1331 	mp = allocb(mp_size, BPRI_MED);
1332 	if (mp == NULL) {
1333 		mutex_enter(&ill->ill_lock);
1334 		if (ipst->ips_ip_dup_recovery > 0 &&
1335 		    ipif->ipif_recovery_id == 0 &&
1336 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1337 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1338 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1339 		}
1340 		mutex_exit(&ill->ill_lock);
1341 	} else {
1342 		/*
1343 		 * A recovery timer may still be running if we got here from
1344 		 * ill_restart_dad(); cancel that timer.
1345 		 */
1346 		if (ipif->ipif_recovery_id != 0)
1347 			(void) untimeout(ipif->ipif_recovery_id);
1348 		ipif->ipif_recovery_id = 0;
1349 
1350 		if (ipif->ipif_isv6) {
1351 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1352 			    sizeof (ipif->ipif_v6lcl_addr));
1353 		} else  {
1354 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1355 			    sizeof (ipif->ipif_lcl_addr));
1356 		}
1357 		ill_refhold(ill);
1358 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1359 		    B_FALSE);
1360 	}
1361 }
1362 
1363 /*
1364  * Find the MAC and IP addresses in an NA/NS message.
1365  */
1366 static void
1367 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1368     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1369 {
1370 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1371 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1372 	uchar_t *addr;
1373 	int alen;
1374 
1375 	/* icmp_inbound_v6 ensures this */
1376 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1377 
1378 	addr = ira->ira_l2src;
1379 	alen = ill->ill_phys_addr_length;
1380 	if (alen > 0) {
1381 		*haddr = addr;
1382 		*haddrlenp = alen;
1383 	} else {
1384 		*haddr = NULL;
1385 		*haddrlenp = 0;
1386 	}
1387 
1388 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1389 	*targp = ns->nd_ns_target;
1390 }
1391 
1392 /*
1393  * This is for exclusive changes due to NDP duplicate address detection
1394  * failure.
1395  */
1396 /* ARGSUSED */
1397 static void
1398 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1399 {
1400 	ill_t	*ill = rq->q_ptr;
1401 	ipif_t	*ipif;
1402 	uchar_t	*haddr;
1403 	uint_t	haddrlen;
1404 	ip_stack_t *ipst = ill->ill_ipst;
1405 	in6_addr_t targ;
1406 	ip_recv_attr_t iras;
1407 	mblk_t	*attrmp;
1408 
1409 	attrmp = mp;
1410 	mp = mp->b_cont;
1411 	attrmp->b_cont = NULL;
1412 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1413 		/* The ill or ip_stack_t disappeared on us */
1414 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1415 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1416 		freemsg(mp);
1417 		ira_cleanup(&iras, B_TRUE);
1418 		return;
1419 	}
1420 
1421 	ASSERT(ill == iras.ira_rill);
1422 
1423 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1424 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1425 		/*
1426 		 * Ignore conflicts generated by misbehaving switches that
1427 		 * just reflect our own messages back to us.  For IPMP, we may
1428 		 * see reflections across any ill in the illgrp.
1429 		 *
1430 		 * RFC2462 and revisions tried to detect both the case
1431 		 * when a statically configured IPv6 address is a duplicate,
1432 		 * and the case when the L2 address itself is a duplicate. The
1433 		 * later is important because, with stateles address autoconf,
1434 		 * if the L2 address is a duplicate, the resulting IPv6
1435 		 * address(es) would also be duplicates. We rely on DAD of the
1436 		 * IPv6 address itself to detect the latter case.
1437 		 */
1438 		/* For an under ill_grp can change under lock */
1439 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1440 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1441 		    IS_UNDER_IPMP(ill) &&
1442 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1443 		    haddrlen) != NULL) {
1444 			rw_exit(&ipst->ips_ill_g_lock);
1445 			goto ignore_conflict;
1446 		}
1447 		rw_exit(&ipst->ips_ill_g_lock);
1448 	}
1449 
1450 	/*
1451 	 * Look up the appropriate ipif.
1452 	 */
1453 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1454 	if (ipif == NULL)
1455 		goto ignore_conflict;
1456 
1457 	/* Reload the ill to match the ipif */
1458 	ill = ipif->ipif_ill;
1459 
1460 	/* If it's already duplicate or ineligible, then don't do anything. */
1461 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1462 		ipif_refrele(ipif);
1463 		goto ignore_conflict;
1464 	}
1465 
1466 	/*
1467 	 * If this is a failure during duplicate recovery, then don't
1468 	 * complain.  It may take a long time to recover.
1469 	 */
1470 	if (!ipif->ipif_was_dup) {
1471 		char ibuf[LIFNAMSIZ];
1472 		char hbuf[MAC_STR_LEN];
1473 		char sbuf[INET6_ADDRSTRLEN];
1474 
1475 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1476 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1477 		    " disabled", ibuf,
1478 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1479 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1480 	}
1481 	mutex_enter(&ill->ill_lock);
1482 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1483 	ipif->ipif_flags |= IPIF_DUPLICATE;
1484 	ill->ill_ipif_dup_count++;
1485 	mutex_exit(&ill->ill_lock);
1486 	(void) ipif_down(ipif, NULL, NULL);
1487 	(void) ipif_down_tail(ipif);
1488 	mutex_enter(&ill->ill_lock);
1489 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1490 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1491 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1492 	    ipst->ips_ip_dup_recovery > 0) {
1493 		ASSERT(ipif->ipif_recovery_id == 0);
1494 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1495 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1496 	}
1497 	mutex_exit(&ill->ill_lock);
1498 	ipif_refrele(ipif);
1499 
1500 ignore_conflict:
1501 	freemsg(mp);
1502 	ira_cleanup(&iras, B_TRUE);
1503 }
1504 
1505 /*
1506  * Handle failure by tearing down the ipifs with the specified address.  Note
1507  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1508  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1509  * we start a timer on the ipif.
1510  * Caller has to free mp;
1511  */
1512 static void
1513 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1514 {
1515 	const uchar_t	*haddr;
1516 	ill_t		*ill = ira->ira_rill;
1517 
1518 	/*
1519 	 * Ignore conflicts generated by misbehaving switches that just
1520 	 * reflect our own messages back to us.
1521 	 */
1522 
1523 	/* icmp_inbound_v6 ensures this */
1524 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1525 	haddr = ira->ira_l2src;
1526 	if (haddr != NULL &&
1527 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1528 		return;
1529 	}
1530 
1531 	if ((mp = copymsg(mp)) != NULL) {
1532 		mblk_t	*attrmp;
1533 
1534 		attrmp = ip_recv_attr_to_mblk(ira);
1535 		if (attrmp == NULL) {
1536 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1537 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1538 			freemsg(mp);
1539 		} else {
1540 			ASSERT(attrmp->b_cont == NULL);
1541 			attrmp->b_cont = mp;
1542 			mp = attrmp;
1543 			ill_refhold(ill);
1544 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1545 			    B_FALSE);
1546 		}
1547 	}
1548 }
1549 
1550 /*
1551  * Handle a discovered conflict: some other system is advertising that it owns
1552  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1553  * interface.
1554  *
1555  * Handles both IPv4 and IPv6
1556  */
1557 boolean_t
1558 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1559 {
1560 	ipif_t		*ipif;
1561 	clock_t		now;
1562 	uint_t		maxdefense;
1563 	uint_t		defs;
1564 	ill_t		*ill = ira->ira_ill;
1565 	ip_stack_t	*ipst = ill->ill_ipst;
1566 	uint32_t	elapsed;
1567 	boolean_t	isv6 = ill->ill_isv6;
1568 	ipaddr_t	ncec_addr;
1569 
1570 	if (isv6) {
1571 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1572 		    ipst);
1573 	} else {
1574 		if (arp_no_defense) {
1575 			/*
1576 			 * Yes, there is a conflict, but no, we do not
1577 			 * defend ourself.
1578 			 */
1579 			return (B_TRUE);
1580 		}
1581 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1582 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1583 		    ipst);
1584 	}
1585 	if (ipif == NULL)
1586 		return (B_FALSE);
1587 
1588 	/*
1589 	 * First, figure out if this address is disposable.
1590 	 */
1591 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1592 		maxdefense = ipst->ips_ip_max_temp_defend;
1593 	else
1594 		maxdefense = ipst->ips_ip_max_defend;
1595 
1596 	/*
1597 	 * Now figure out how many times we've defended ourselves.  Ignore
1598 	 * defenses that happened long in the past.
1599 	 */
1600 	now = ddi_get_lbolt();
1601 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1602 	mutex_enter(&ncec->ncec_lock);
1603 	if ((defs = ncec->ncec_defense_count) > 0 &&
1604 	    elapsed > ipst->ips_ip_defend_interval) {
1605 		/*
1606 		 * ip_defend_interval has elapsed.
1607 		 * reset the defense count.
1608 		 */
1609 		ncec->ncec_defense_count = defs = 0;
1610 	}
1611 	ncec->ncec_defense_count++;
1612 	ncec->ncec_last_time_defended = now;
1613 	mutex_exit(&ncec->ncec_lock);
1614 	ipif_refrele(ipif);
1615 
1616 	/*
1617 	 * If we've defended ourselves too many times already, then give up and
1618 	 * tear down the interface(s) using this address.
1619 	 * Otherwise, caller has to defend by sending out an announce.
1620 	 */
1621 	if (defs >= maxdefense) {
1622 		if (isv6)
1623 			ndp_failure(mp, ira);
1624 		else
1625 			arp_failure(mp, ira);
1626 	} else {
1627 		return (B_TRUE); /* caller must defend this address */
1628 	}
1629 	return (B_FALSE);
1630 }
1631 
1632 /*
1633  * Handle reception of Neighbor Solicitation messages.
1634  */
1635 static void
1636 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1637 {
1638 	ill_t		*ill = ira->ira_ill, *under_ill;
1639 	nd_neighbor_solicit_t *ns;
1640 	uint32_t	hlen = ill->ill_phys_addr_length;
1641 	uchar_t		*haddr = NULL;
1642 	icmp6_t		*icmp_nd;
1643 	ip6_t		*ip6h;
1644 	ncec_t		*our_ncec = NULL;
1645 	in6_addr_t	target;
1646 	in6_addr_t	src;
1647 	int		len;
1648 	int		flag = 0;
1649 	nd_opt_hdr_t	*opt = NULL;
1650 	boolean_t	bad_solicit = B_FALSE;
1651 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1652 	boolean_t	need_ill_refrele = B_FALSE;
1653 
1654 	ip6h = (ip6_t *)mp->b_rptr;
1655 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1656 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1657 	src = ip6h->ip6_src;
1658 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1659 	target = ns->nd_ns_target;
1660 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1661 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1662 		if (ip_debug > 2) {
1663 			/* ip1dbg */
1664 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1665 			    AF_INET6, &target);
1666 		}
1667 		bad_solicit = B_TRUE;
1668 		goto done;
1669 	}
1670 	if (len > sizeof (nd_neighbor_solicit_t)) {
1671 		/* Options present */
1672 		opt = (nd_opt_hdr_t *)&ns[1];
1673 		len -= sizeof (nd_neighbor_solicit_t);
1674 		if (!ndp_verify_optlen(opt, len)) {
1675 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1676 			bad_solicit = B_TRUE;
1677 			goto done;
1678 		}
1679 	}
1680 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1681 		/* Check to see if this is a valid DAD solicitation */
1682 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1683 			if (ip_debug > 2) {
1684 				/* ip1dbg */
1685 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1686 				    "Destination is not solicited node "
1687 				    "multicast %s\n", AF_INET6,
1688 				    &ip6h->ip6_dst);
1689 			}
1690 			bad_solicit = B_TRUE;
1691 			goto done;
1692 		}
1693 	}
1694 
1695 	/*
1696 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1697 	 * received this packet if it's multicast) is not the ill tied to
1698 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1699 	 * to ensure we find the associated NCE.
1700 	 */
1701 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1702 	/*
1703 	 * If this is a valid Solicitation for an address we are publishing,
1704 	 * then a PUBLISH entry should exist in the cache
1705 	 */
1706 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1707 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1708 		    "ifname=%s ", ill->ill_name));
1709 		if (ip_debug > 2) {
1710 			/* ip1dbg */
1711 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1712 		}
1713 		if (our_ncec == NULL)
1714 			bad_solicit = B_TRUE;
1715 		goto done;
1716 	}
1717 
1718 	/* At this point we should have a verified NS per spec */
1719 	if (opt != NULL) {
1720 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1721 		if (opt != NULL) {
1722 			haddr = (uchar_t *)&opt[1];
1723 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1724 			    hlen == 0) {
1725 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1726 				bad_solicit = B_TRUE;
1727 				goto done;
1728 			}
1729 		}
1730 	}
1731 
1732 	/* If sending directly to peer, set the unicast flag */
1733 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1734 		flag |= NDP_UNICAST;
1735 
1736 	/*
1737 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1738 	 * or respond to outstanding queries, don't if
1739 	 * the source is unspecified address.
1740 	 */
1741 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1742 		int	err;
1743 		nce_t	*nnce;
1744 
1745 		ASSERT(ill->ill_isv6);
1746 		/*
1747 		 * Regular solicitations *must* include the Source Link-Layer
1748 		 * Address option.  Ignore messages that do not.
1749 		 */
1750 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1751 			ip1dbg(("ndp_input_solicit: source link-layer address "
1752 			    "option missing with a specified source.\n"));
1753 			bad_solicit = B_TRUE;
1754 			goto done;
1755 		}
1756 
1757 		/*
1758 		 * This is a regular solicitation.  If we're still in the
1759 		 * process of verifying the address, then don't respond at all
1760 		 * and don't keep track of the sender.
1761 		 */
1762 		if (our_ncec->ncec_state == ND_PROBE)
1763 			goto done;
1764 
1765 		/*
1766 		 * If the solicitation doesn't have sender hardware address
1767 		 * (legal for unicast solicitation), then process without
1768 		 * installing the return NCE.  Either we already know it, or
1769 		 * we'll be forced to look it up when (and if) we reply to the
1770 		 * packet.
1771 		 */
1772 		if (haddr == NULL)
1773 			goto no_source;
1774 
1775 		under_ill = ill;
1776 		if (IS_UNDER_IPMP(under_ill)) {
1777 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1778 			if (ill == NULL)
1779 				ill = under_ill;
1780 			else
1781 				need_ill_refrele = B_TRUE;
1782 		}
1783 		err = nce_lookup_then_add_v6(ill,
1784 		    haddr, hlen,
1785 		    &src,	/* Soliciting nodes address */
1786 		    0,
1787 		    ND_STALE,
1788 		    &nnce);
1789 
1790 		if (need_ill_refrele) {
1791 			ill_refrele(ill);
1792 			ill = under_ill;
1793 			need_ill_refrele =  B_FALSE;
1794 		}
1795 		switch (err) {
1796 		case 0:
1797 			/* done with this entry */
1798 			nce_refrele(nnce);
1799 			break;
1800 		case EEXIST:
1801 			/*
1802 			 * B_FALSE indicates this is not an an advertisement.
1803 			 */
1804 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1805 			nce_refrele(nnce);
1806 			break;
1807 		default:
1808 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1809 			    err));
1810 			goto done;
1811 		}
1812 no_source:
1813 		flag |= NDP_SOLICITED;
1814 	} else {
1815 		/*
1816 		 * No source link layer address option should be present in a
1817 		 * valid DAD request.
1818 		 */
1819 		if (haddr != NULL) {
1820 			ip1dbg(("ndp_input_solicit: source link-layer address "
1821 			    "option present with an unspecified source.\n"));
1822 			bad_solicit = B_TRUE;
1823 			goto done;
1824 		}
1825 		if (our_ncec->ncec_state == ND_PROBE) {
1826 			/*
1827 			 * Internally looped-back probes will have
1828 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1829 			 * transmissions.
1830 			 */
1831 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1832 				/*
1833 				 * If someone else is probing our address, then
1834 				 * we've crossed wires.  Declare failure.
1835 				 */
1836 				ndp_failure(mp, ira);
1837 			}
1838 			goto done;
1839 		}
1840 		/*
1841 		 * This is a DAD probe.  Multicast the advertisement to the
1842 		 * all-nodes address.
1843 		 */
1844 		src = ipv6_all_hosts_mcast;
1845 	}
1846 	flag |= nce_advert_flags(our_ncec);
1847 	(void) ndp_xmit(ill,
1848 	    ND_NEIGHBOR_ADVERT,
1849 	    our_ncec->ncec_lladdr,
1850 	    our_ncec->ncec_lladdr_length,
1851 	    &target,	/* Source and target of the advertisement pkt */
1852 	    &src,	/* IP Destination (source of original pkt) */
1853 	    flag);
1854 done:
1855 	if (bad_solicit)
1856 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1857 	if (our_ncec != NULL)
1858 		ncec_refrele(our_ncec);
1859 }
1860 
1861 /*
1862  * Handle reception of Neighbor Solicitation messages
1863  */
1864 void
1865 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1866 {
1867 	ill_t		*ill = ira->ira_ill;
1868 	nd_neighbor_advert_t *na;
1869 	uint32_t	hlen = ill->ill_phys_addr_length;
1870 	uchar_t		*haddr = NULL;
1871 	icmp6_t		*icmp_nd;
1872 	ip6_t		*ip6h;
1873 	ncec_t		*dst_ncec = NULL;
1874 	in6_addr_t	target;
1875 	nd_opt_hdr_t	*opt = NULL;
1876 	int		len;
1877 	ip_stack_t	*ipst = ill->ill_ipst;
1878 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1879 
1880 	ip6h = (ip6_t *)mp->b_rptr;
1881 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1882 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1883 	na = (nd_neighbor_advert_t *)icmp_nd;
1884 
1885 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1886 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1887 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1888 		    "solicited flag is not zero\n"));
1889 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1890 		return;
1891 	}
1892 	target = na->nd_na_target;
1893 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1894 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1895 		if (ip_debug > 2) {
1896 			/* ip1dbg */
1897 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1898 			    AF_INET6, &target);
1899 		}
1900 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1901 		return;
1902 	}
1903 	if (len > sizeof (nd_neighbor_advert_t)) {
1904 		opt = (nd_opt_hdr_t *)&na[1];
1905 		if (!ndp_verify_optlen(opt,
1906 		    len - sizeof (nd_neighbor_advert_t))) {
1907 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1908 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1909 			return;
1910 		}
1911 		/* At this point we have a verified NA per spec */
1912 		len -= sizeof (nd_neighbor_advert_t);
1913 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1914 		if (opt != NULL) {
1915 			haddr = (uchar_t *)&opt[1];
1916 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1917 			    hlen == 0) {
1918 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1919 				BUMP_MIB(mib,
1920 				    ipv6IfIcmpInBadNeighborAdvertisements);
1921 				return;
1922 			}
1923 		}
1924 	}
1925 
1926 	/*
1927 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1928 	 * our local addresses, and those are spread across all the active
1929 	 * ills in the group.
1930 	 */
1931 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1932 		return;
1933 
1934 	if (NCE_PUBLISH(dst_ncec)) {
1935 		/*
1936 		 * Someone just advertised an addresses that we publish. First,
1937 		 * check it it was us -- if so, we can safely ignore it.
1938 		 * We don't get the haddr from the ira_l2src because, in the
1939 		 * case that the packet originated from us, on an IPMP group,
1940 		 * the ira_l2src may would be the link-layer address of the
1941 		 * cast_ill used to send the packet, which may not be the same
1942 		 * as the dst_ncec->ncec_lladdr of the address.
1943 		 */
1944 		if (haddr != NULL) {
1945 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1946 				goto out;
1947 
1948 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1949 				goto out;   /* from us -- no conflict */
1950 
1951 			/*
1952 			 * If we're in an IPMP group, check if this is an echo
1953 			 * from another ill in the group.  Use the double-
1954 			 * checked locking pattern to avoid grabbing
1955 			 * ill_g_lock in the non-IPMP case.
1956 			 */
1957 			if (IS_UNDER_IPMP(ill)) {
1958 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1959 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1960 				    ill->ill_grp, haddr, hlen) != NULL) {
1961 					rw_exit(&ipst->ips_ill_g_lock);
1962 					goto out;
1963 				}
1964 				rw_exit(&ipst->ips_ill_g_lock);
1965 			}
1966 		}
1967 
1968 		/*
1969 		 * This appears to be a real conflict.  If we're trying to
1970 		 * configure this NCE (ND_PROBE), then shut it down.
1971 		 * Otherwise, handle the discovered conflict.
1972 		 */
1973 		if (dst_ncec->ncec_state == ND_PROBE) {
1974 			ndp_failure(mp, ira);
1975 		} else {
1976 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1977 				char hbuf[MAC_STR_LEN];
1978 				char sbuf[INET6_ADDRSTRLEN];
1979 
1980 				cmn_err(CE_WARN,
1981 				    "node '%s' is using %s on %s",
1982 				    inet_ntop(AF_INET6, &target, sbuf,
1983 				    sizeof (sbuf)),
1984 				    haddr == NULL ? "<none>" :
1985 				    mac_colon_addr(haddr, hlen, hbuf,
1986 				    sizeof (hbuf)), ill->ill_name);
1987 				/*
1988 				 * RFC 4862, Section 5.4.4 does not mandate
1989 				 * any specific behavior when an NA matches
1990 				 * a non-tentative address assigned to the
1991 				 * receiver. We make the choice of defending
1992 				 * our address, based on the assumption that
1993 				 * the sender has not detected the Duplicate.
1994 				 *
1995 				 * ncec_last_time_defended has been adjusted
1996 				 * in ip_nce_conflict()
1997 				 */
1998 				(void) ndp_announce(dst_ncec);
1999 			}
2000 		}
2001 	} else {
2002 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2003 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2004 
2005 		/* B_TRUE indicates this an advertisement */
2006 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2007 	}
2008 out:
2009 	ncec_refrele(dst_ncec);
2010 }
2011 
2012 /*
2013  * Process NDP neighbor solicitation/advertisement messages.
2014  * The checksum has already checked o.k before reaching here.
2015  * Information about the datalink header is contained in ira_l2src, but
2016  * that should be ignored for loopback packets.
2017  */
2018 void
2019 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2020 {
2021 	ill_t		*ill = ira->ira_rill;
2022 	icmp6_t		*icmp_nd;
2023 	ip6_t		*ip6h;
2024 	int		len;
2025 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2026 	ill_t		*orig_ill = NULL;
2027 
2028 	/*
2029 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2030 	 * and make it be the IPMP upper so avoid being confused by a packet
2031 	 * addressed to a unicast address on a different ill.
2032 	 */
2033 	if (IS_UNDER_IPMP(ill)) {
2034 		orig_ill = ill;
2035 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2036 		if (ill == NULL) {
2037 			ill = orig_ill;
2038 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2039 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2040 			    mp, ill);
2041 			freemsg(mp);
2042 			return;
2043 		}
2044 		ASSERT(ill != orig_ill);
2045 		orig_ill = ira->ira_ill;
2046 		ira->ira_ill = ill;
2047 		mib = ill->ill_icmp6_mib;
2048 	}
2049 	if (!pullupmsg(mp, -1)) {
2050 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2051 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2052 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2053 		goto done;
2054 	}
2055 	ip6h = (ip6_t *)mp->b_rptr;
2056 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2057 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2058 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2059 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2060 		goto done;
2061 	}
2062 	/*
2063 	 * NDP does not accept any extension headers between the
2064 	 * IP header and the ICMP header since e.g. a routing
2065 	 * header could be dangerous.
2066 	 * This assumes that any AH or ESP headers are removed
2067 	 * by ip prior to passing the packet to ndp_input.
2068 	 */
2069 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2070 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2071 		    ip6h->ip6_nxt));
2072 		ip_drop_input("Wrong next header", mp, ill);
2073 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2074 		goto done;
2075 	}
2076 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2077 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2078 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2079 	if (icmp_nd->icmp6_code != 0) {
2080 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2081 		ip_drop_input("code non-zero", mp, ill);
2082 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2083 		goto done;
2084 	}
2085 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2086 	/*
2087 	 * Make sure packet length is large enough for either
2088 	 * a NS or a NA icmp packet.
2089 	 */
2090 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2091 		ip1dbg(("ndp_input: packet too short\n"));
2092 		ip_drop_input("packet too short", mp, ill);
2093 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2094 		goto done;
2095 	}
2096 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2097 		ndp_input_solicit(mp, ira);
2098 	} else {
2099 		ndp_input_advert(mp, ira);
2100 	}
2101 done:
2102 	freemsg(mp);
2103 	if (orig_ill != NULL) {
2104 		ill_refrele(ill);
2105 		ira->ira_ill = orig_ill;
2106 	}
2107 }
2108 
2109 /*
2110  * ndp_xmit is called to form and transmit a ND solicitation or
2111  * advertisement ICMP packet.
2112  *
2113  * If the source address is unspecified and this isn't a probe (used for
2114  * duplicate address detection), an appropriate source address and link layer
2115  * address will be chosen here.  The link layer address option is included if
2116  * the source is specified (i.e., all non-probe packets), and omitted (per the
2117  * specification) otherwise.
2118  *
2119  * It returns B_FALSE only if it does a successful put() to the
2120  * corresponding ill's ill_wq otherwise returns B_TRUE.
2121  */
2122 static boolean_t
2123 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2124     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2125 {
2126 	uint32_t	len;
2127 	icmp6_t		*icmp6;
2128 	mblk_t		*mp;
2129 	ip6_t		*ip6h;
2130 	nd_opt_hdr_t	*opt;
2131 	uint_t		plen;
2132 	zoneid_t	zoneid = GLOBAL_ZONEID;
2133 	ill_t		*hwaddr_ill = ill;
2134 	ip_xmit_attr_t	ixas;
2135 	ip_stack_t	*ipst = ill->ill_ipst;
2136 	boolean_t	need_refrele = B_FALSE;
2137 	boolean_t	probe = B_FALSE;
2138 
2139 	if (IS_UNDER_IPMP(ill)) {
2140 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2141 		/*
2142 		 * We send non-probe packets on the upper IPMP interface.
2143 		 * ip_output_simple() will use cast_ill for sending any
2144 		 * multicast packets. Note that we can't follow the same
2145 		 * logic for probe packets because all interfaces in the ipmp
2146 		 * group may have failed, so that we really want to only try
2147 		 * to send the ND packet on the ill corresponding to the src
2148 		 * address.
2149 		 */
2150 		if (!probe) {
2151 			ill = ipmp_ill_hold_ipmp_ill(ill);
2152 			if (ill != NULL)
2153 				need_refrele = B_TRUE;
2154 			else
2155 				ill = hwaddr_ill;
2156 		}
2157 	}
2158 
2159 	/*
2160 	 * If we have a unspecified source(sender) address, select a
2161 	 * proper source address for the solicitation here itself so
2162 	 * that we can initialize the h/w address correctly.
2163 	 *
2164 	 * If the sender is specified then we use this address in order
2165 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2166 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2167 	 * by IP (we cannot guarantee that the global zone has an interface
2168 	 * route to the destination).
2169 	 *
2170 	 * Note that the NA never comes here with the unspecified source
2171 	 * address.
2172 	 */
2173 
2174 	/*
2175 	 * Probes will have unspec src at this point.
2176 	 */
2177 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2178 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2179 		/*
2180 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2181 		 * ALL_ZONES if it cannot find a matching ipif for the address
2182 		 * we are trying to use. In this case we err on the side of
2183 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2184 		 */
2185 		if (zoneid == ALL_ZONES)
2186 			zoneid = GLOBAL_ZONEID;
2187 	}
2188 
2189 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2190 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2191 	mp = allocb(len,  BPRI_LO);
2192 	if (mp == NULL) {
2193 		if (need_refrele)
2194 			ill_refrele(ill);
2195 		return (B_TRUE);
2196 	}
2197 
2198 	bzero((char *)mp->b_rptr, len);
2199 	mp->b_wptr = mp->b_rptr + len;
2200 
2201 	bzero(&ixas, sizeof (ixas));
2202 	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2203 
2204 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2205 	ixas.ixa_ipst = ipst;
2206 	ixas.ixa_cred = kcred;
2207 	ixas.ixa_cpid = NOPID;
2208 	ixas.ixa_tsl = NULL;
2209 	ixas.ixa_zoneid = zoneid;
2210 
2211 	ip6h = (ip6_t *)mp->b_rptr;
2212 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2213 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2214 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2215 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2216 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2217 	ip6h->ip6_dst = *target;
2218 	icmp6 = (icmp6_t *)&ip6h[1];
2219 
2220 	if (hw_addr_len != 0) {
2221 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2222 		    sizeof (nd_neighbor_advert_t));
2223 	} else {
2224 		opt = NULL;
2225 	}
2226 	if (operation == ND_NEIGHBOR_SOLICIT) {
2227 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2228 
2229 		if (opt != NULL && !(flag & NDP_PROBE)) {
2230 			/*
2231 			 * Note that we don't send out SLLA for ND probes
2232 			 * per RFC 4862, even though we do send out the src
2233 			 * haddr for IPv4 DAD probes, even though both IPv4
2234 			 * and IPv6 go out with the unspecified/INADDR_ANY
2235 			 * src IP addr.
2236 			 */
2237 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2238 		}
2239 		ip6h->ip6_src = *sender;
2240 		ns->nd_ns_target = *target;
2241 		if (!(flag & NDP_UNICAST)) {
2242 			/* Form multicast address of the target */
2243 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2244 			ip6h->ip6_dst.s6_addr32[3] |=
2245 			    ns->nd_ns_target.s6_addr32[3];
2246 		}
2247 	} else {
2248 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2249 
2250 		ASSERT(!(flag & NDP_PROBE));
2251 		if (opt != NULL)
2252 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2253 		ip6h->ip6_src = *sender;
2254 		na->nd_na_target = *sender;
2255 		if (flag & NDP_ISROUTER)
2256 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2257 		if (flag & NDP_SOLICITED)
2258 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2259 		if (flag & NDP_ORIDE)
2260 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2261 	}
2262 
2263 	if (!(flag & NDP_PROBE)) {
2264 		if (hw_addr != NULL && opt != NULL) {
2265 			/* Fill in link layer address and option len */
2266 			opt->nd_opt_len = (uint8_t)plen;
2267 			bcopy(hw_addr, &opt[1], hw_addr_len);
2268 		}
2269 	}
2270 	if (opt != NULL && opt->nd_opt_type == 0) {
2271 		/* If there's no link layer address option, then strip it. */
2272 		len -= plen * 8;
2273 		mp->b_wptr = mp->b_rptr + len;
2274 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2275 	}
2276 
2277 	icmp6->icmp6_type = (uint8_t)operation;
2278 	icmp6->icmp6_code = 0;
2279 	/*
2280 	 * Prepare for checksum by putting icmp length in the icmp
2281 	 * checksum field. The checksum is calculated in ip_output.c.
2282 	 */
2283 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2284 
2285 	(void) ip_output_simple(mp, &ixas);
2286 	ixa_cleanup(&ixas);
2287 	if (need_refrele)
2288 		ill_refrele(ill);
2289 	return (B_FALSE);
2290 }
2291 
2292 /*
2293  * Used to set ND_UNREACHABLE before ncec_delete sets NCE_F_CONDEMNED.
2294  * The datapath uses this as an indication that there
2295  * is a problem (as opposed to an NCE that was just
2296  * reclaimed due to lack of memory).
2297  * Note that static ARP entries never become unreachable.
 *
 * The state change is made under ncec_lock.  This function does not test
 * NCE_F_STATIC itself.
 * NOTE(review): presumably callers skip static entries - confirm.
2298  */
2299 void
2300 nce_make_unreachable(ncec_t *ncec)
2301 {
2302 	mutex_enter(&ncec->ncec_lock);
2303 	ncec->ncec_state = ND_UNREACHABLE;
2304 	mutex_exit(&ncec->ncec_lock);
2305 }
2306 
2307 /*
2308  * NCE retransmit timer. Common to IPv4 and IPv6.
2309  * This timer goes off when:
2310  * a. It is time to retransmit a resolution for resolver.
2311  * b. It is time to send reachability probes.
 *
 * `arg' is the ncec_t whose timer fired.  The handler takes its own hold on
 * the ncec, clears ncec_timeout_id, picks a source ill/address with
 * nce_resolve_src() and then acts on ncec_state:
 *
 *	ND_DELAY	move to ND_PROBE and send the first unicast probe.
 *	ND_PROBE	retransmit a probe, give up and delete the entry, or
 *			(for an address we publish) finish DAD and start
 *			announcing/defending the address.
 *	ND_INCOMPLETE	toss queued IPMP probe packets and retry resolution.
 *	ND_REACHABLE	send unsolicited announcements / periodic defenses.
 *
 * ncec_lock is dropped before every call to ndp_xmit(), nce_dad(),
 * ndp_announce(), arp_announce(), ip_ndp_resolve() and nce_restart_timer();
 * arp_request() is the one transmit routine called with the lock held.
2312  */
2313 void
2314 nce_timer(void *arg)
2315 {
2316 	ncec_t		*ncec = arg;
2317 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2318 	char		addrbuf[INET6_ADDRSTRLEN];
	/* B_TRUE when a transmit attempt did not go out; see uses below. */
2319 	boolean_t	dropped = B_FALSE;
2320 	ip_stack_t	*ipst = ncec->ncec_ipst;
2321 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2322 	in_addr_t	sender4 = INADDR_ANY;
2323 	in6_addr_t	sender6 = ipv6_all_zeros;
2324 
2325 	/*
2326 	 * The timer has to be cancelled by ncec_delete before doing the final
2327 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2328 	 * until it clears the timeout_id. Before clearing the timeout_id
2329 	 * bump up the refcnt so that we can continue to use the ncec
2330 	 */
	/*
	 * NOTE(review): ncec has already been dereferenced in the
	 * initializers above, so this ASSERT cannot catch a NULL arg.
	 */
2331 	ASSERT(ncec != NULL);
2332 	mutex_enter(&ncec->ncec_lock);
2333 	ncec_refhold_locked(ncec);
2334 	ncec->ncec_timeout_id = 0;
2335 	mutex_exit(&ncec->ncec_lock);
2336 
2337 	src_ill = nce_resolve_src(ncec, &sender6);
2338 	/* if we could not find a sender address, return */
2339 	if (src_ill == NULL) {
2340 		if (!isv6) {
2341 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2342 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2343 			    &sender4, addrbuf, sizeof (addrbuf))));
2344 		} else {
2345 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2346 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2347 		}
		/* Try again later; only the ncec hold needs to be released. */
2348 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2349 		ncec_refrele(ncec);
2350 		return;
2351 	}
	/* For IPv4 the chosen source comes back V4-mapped; extract it. */
2352 	if (!isv6)
2353 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2354 
2355 	mutex_enter(&ncec->ncec_lock);
2356 	/*
2357 	 * Check the reachability state.
2358 	 */
2359 	switch (ncec->ncec_state) {
2360 	case ND_DELAY:
2361 		ASSERT(ncec->ncec_lladdr != NULL);
2362 		ncec->ncec_state = ND_PROBE;
2363 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2364 		if (isv6) {
2365 			mutex_exit(&ncec->ncec_lock);
2366 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2367 			    src_ill->ill_phys_addr,
2368 			    src_ill->ill_phys_addr_length,
2369 			    &sender6, &ncec->ncec_addr,
2370 			    NDP_UNICAST);
2371 		} else {
			/* A zero return from arp_request() counts as dropped. */
2372 			dropped = (arp_request(ncec, sender4, src_ill) == 0);
2373 			mutex_exit(&ncec->ncec_lock);
2374 		}
		/* Only a probe that actually went out uses up a retry. */
2375 		if (!dropped) {
2376 			mutex_enter(&ncec->ncec_lock);
2377 			ncec->ncec_pcnt--;
2378 			mutex_exit(&ncec->ncec_lock);
2379 		}
2380 		if (ip_debug > 3) {
2381 			/* ip2dbg */
			/* NOTE(review): AF_INET6 is used here even when !isv6. */
2382 			pr_addr_dbg("nce_timer: state for %s changed "
2383 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2384 		}
2385 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2386 		break;
2387 	case ND_PROBE:
2388 		/* must be retransmit timer */
2389 		ASSERT(ncec->ncec_pcnt >= -1);
2390 		if (ncec->ncec_pcnt > 0) {
2391 			/*
2392 			 * As per RFC2461, the ncec gets deleted after
2393 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2394 			 * Note that the first unicast solicitation is sent
2395 			 * during the DELAY state.
2396 			 */
			/*
			 * NOTE(review): for IPv4 this passes the in6_addr_t
			 * ncec_addr to inet_ntop() with AF_INET - confirm the
			 * debug output is what is intended.
			 */
2397 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2398 			    ncec->ncec_pcnt,
2399 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2400 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2401 			if (NCE_PUBLISH(ncec)) {
2402 				mutex_exit(&ncec->ncec_lock);
2403 				/*
2404 				 * send out a probe; note that src_ill
2405 				 * is ignored by nce_dad() for all
2406 				 * DAD message types other than IPv6
2407 				 * unicast probes
2408 				 */
2409 				nce_dad(ncec, src_ill, B_TRUE);
2410 			} else {
2411 				ASSERT(src_ill != NULL);
2412 				if (isv6) {
2413 					mutex_exit(&ncec->ncec_lock);
2414 					dropped = ndp_xmit(src_ill,
2415 					    ND_NEIGHBOR_SOLICIT,
2416 					    src_ill->ill_phys_addr,
2417 					    src_ill->ill_phys_addr_length,
2418 					    &sender6, &ncec->ncec_addr,
2419 					    NDP_UNICAST);
2420 				} else {
2421 					/*
2422 					 * since the nce is REACHABLE,
2423 					 * the ARP request will be sent out
2424 					 * as a link-layer unicast.
2425 					 */
2426 					dropped = (arp_request(ncec, sender4,
2427 					    src_ill) == 0);
2428 					mutex_exit(&ncec->ncec_lock);
2429 				}
2430 				if (!dropped) {
2431 					mutex_enter(&ncec->ncec_lock);
2432 					ncec->ncec_pcnt--;
2433 					mutex_exit(&ncec->ncec_lock);
2434 				}
2435 				nce_restart_timer(ncec,
2436 				    ill->ill_reachable_retrans_time);
2437 			}
2438 		} else if (ncec->ncec_pcnt < 0) {
2439 			/* No hope, delete the ncec */
2440 			/* Tell datapath it went bad */
2441 			ncec->ncec_state = ND_UNREACHABLE;
2442 			mutex_exit(&ncec->ncec_lock);
2443 			if (ip_debug > 2) {
2444 				/* ip1dbg */
2445 				pr_addr_dbg("nce_timer: Delete NCE for"
2446 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2447 				    &ncec->ncec_addr);
2448 			}
2449 			/* if static ARP can't delete. */
2450 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2451 				ncec_delete(ncec);
2452 
2453 		} else if (!NCE_PUBLISH(ncec)) {
2454 			/*
2455 			 * Probe count is 0 for a dynamic entry (one that we
2456 			 * ourselves are not publishing). We should never get
2457 			 * here if NONUD was requested, hence the ASSERT below.
2458 			 */
2459 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2460 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2461 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2462 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/* 0 -> -1, so the next expiry takes the delete branch. */
2463 			ncec->ncec_pcnt--;
2464 			mutex_exit(&ncec->ncec_lock);
2465 			/* Wait one interval before killing */
2466 			nce_restart_timer(ncec,
2467 			    ill->ill_reachable_retrans_time);
2468 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2469 			ipif_t *ipif;
2470 			ipaddr_t ncec_addr;
2471 
2472 			/*
2473 			 * We're done probing, and we can now declare this
2474 			 * address to be usable.  Let IP know that it's ok to
2475 			 * use.
2476 			 */
2477 			ncec->ncec_state = ND_REACHABLE;
2478 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2479 			mutex_exit(&ncec->ncec_lock);
2480 			if (isv6) {
2481 				ipif = ipif_lookup_addr_exact_v6(
2482 				    &ncec->ncec_addr, ill, ipst);
2483 			} else {
2484 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2485 				    ncec_addr);
2486 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2487 				    ipst);
2488 			}
2489 			if (ipif != NULL) {
				/* Log recovery of a previously duplicate address. */
2490 				if (ipif->ipif_was_dup) {
2491 					char ibuf[LIFNAMSIZ];
2492 					char sbuf[INET6_ADDRSTRLEN];
2493 
2494 					ipif->ipif_was_dup = B_FALSE;
2495 					(void) inet_ntop(AF_INET6,
2496 					    &ipif->ipif_v6lcl_addr,
2497 					    sbuf, sizeof (sbuf));
2498 					ipif_get_name(ipif, ibuf,
2499 					    sizeof (ibuf));
2500 					cmn_err(CE_NOTE, "recovered address "
2501 					    "%s on %s", sbuf, ibuf);
2502 				}
2503 				if ((ipif->ipif_flags & IPIF_UP) &&
2504 				    !ipif->ipif_addr_ready)
2505 					ipif_up_notify(ipif);
2506 				ipif->ipif_addr_ready = 1;
2507 				ipif_refrele(ipif);
2508 			}
2509 			if (!isv6 && arp_no_defense)
2510 				break;
2511 			/* Begin defending our new address */
			/*
			 * NOTE(review): ncec_unsolicit_count and
			 * ncec_last_time_defended are updated below without
			 * ncec_lock, whereas the ND_REACHABLE case re-takes
			 * the lock for the same updates - confirm this is
			 * intentional.
			 */
2512 			if (ncec->ncec_unsolicit_count > 0) {
2513 				ncec->ncec_unsolicit_count--;
2514 				if (isv6) {
2515 					dropped = ndp_announce(ncec);
2516 				} else {
2517 					dropped = arp_announce(ncec);
2518 				}
2519 
				/* A dropped announcement is not counted. */
2520 				if (dropped)
2521 					ncec->ncec_unsolicit_count++;
2522 				else
2523 					ncec->ncec_last_time_defended =
2524 					    ddi_get_lbolt();
2525 			}
2526 			if (ncec->ncec_unsolicit_count > 0) {
2527 				nce_restart_timer(ncec,
2528 				    ANNOUNCE_INTERVAL(isv6));
2529 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2530 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2531 			}
2532 		} else {
2533 			/*
2534 			 * This is an address we're probing to be our own, but
2535 			 * the ill is down.  Wait until it comes back before
2536 			 * doing anything, but switch to reachable state so
2537 			 * that the restart will work.
2538 			 */
2539 			ncec->ncec_state = ND_REACHABLE;
2540 			mutex_exit(&ncec->ncec_lock);
2541 		}
2542 		break;
2543 	case ND_INCOMPLETE: {
2544 		mblk_t	*mp, *nextmp;
2545 		mblk_t	**prevmpp;
2546 
2547 		/*
2548 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2549 		 * for any IPMP probe packets, and toss them.  IPMP probe
2550 		 * packets will always be at the head of ncec_qd_mp, so that
2551 		 * we can stop at the first queued ND packet that is
2552 		 * not a probe packet.
2553 		 */
		/*
		 * prevmpp tracks the link that points at the current packet
		 * so a freed packet can be unlinked in place.  The loop as
		 * written walks the whole queue rather than stopping early.
		 */
2554 		prevmpp = &ncec->ncec_qd_mp;
2555 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2556 			nextmp = mp->b_next;
2557 
2558 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2559 				inet_freemsg(mp);
2560 				ncec->ncec_nprobes--;
2561 				*prevmpp = nextmp;
2562 			} else {
2563 				prevmpp = &mp->b_next;
2564 			}
2565 		}
2566 
2567 		/*
2568 		 * Must be resolver's retransmit timer.
2569 		 */
2570 		mutex_exit(&ncec->ncec_lock);
2571 		ip_ndp_resolve(ncec);
2572 		break;
2573 	}
2574 	case ND_REACHABLE:
		/*
		 * Act only if unsolicited advertisements are still owed, or
		 * this is an address we publish and periodic defense is
		 * enabled (non-zero DEFENSE_INTERVAL).
		 */
2575 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2576 		    ncec->ncec_unsolicit_count != 0) ||
2577 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2578 			if (ncec->ncec_unsolicit_count > 0) {
2579 				ncec->ncec_unsolicit_count--;
2580 				mutex_exit(&ncec->ncec_lock);
2581 				/*
2582 				 * When we get to zero announcements left,
2583 				 * switch to address defense
2584 				 */
2585 			} else {
2586 				boolean_t rate_limit;
2587 
2588 				mutex_exit(&ncec->ncec_lock);
2589 				rate_limit = ill_defend_rate_limit(ill, ncec);
				/* Rate limited: skip this defense, retry later. */
2590 				if (rate_limit) {
2591 					nce_restart_timer(ncec,
2592 					    DEFENSE_INTERVAL(isv6));
2593 					break;
2594 				}
2595 			}
2596 			if (isv6) {
2597 				dropped = ndp_announce(ncec);
2598 			} else {
2599 				dropped = arp_announce(ncec);
2600 			}
2601 			mutex_enter(&ncec->ncec_lock);
			/*
			 * NOTE(review): the count is incremented on a drop
			 * even when it was not decremented above (the defense
			 * path) - confirm that is intended.
			 */
2602 			if (dropped) {
2603 				ncec->ncec_unsolicit_count++;
2604 			} else {
2605 				ncec->ncec_last_time_defended =
2606 				    ddi_get_lbolt();
2607 			}
2608 			mutex_exit(&ncec->ncec_lock);
			/* ncec_unsolicit_count is read here without ncec_lock. */
2609 			if (ncec->ncec_unsolicit_count != 0) {
2610 				nce_restart_timer(ncec,
2611 				    ANNOUNCE_INTERVAL(isv6));
2612 			} else {
2613 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2614 			}
2615 		} else {
2616 			mutex_exit(&ncec->ncec_lock);
2617 		}
2618 		break;
2619 	default:
2620 		mutex_exit(&ncec->ncec_lock);
2621 		break;
2622 	}
	/*
	 * Common exit: drop the hold taken at entry and the src_ill
	 * reference obtained from nce_resolve_src().  No goto in this
	 * function targets the label below.
	 */
2623 done:
2624 	ncec_refrele(ncec);
2625 	ill_refrele(src_ill);
2626 }
2627 
2628 /*
2629  * Set a link layer address from the ll_addr passed in.
2630  * Copy SAP from ill.
2631  */
2632 static void
2633 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2634 {
2635 	ill_t	*ill = ncec->ncec_ill;
2636 
2637 	ASSERT(ll_addr != NULL);
2638 	if (ill->ill_phys_addr_length > 0) {
2639 		/*
2640 		 * The bcopy() below used to be called for the physical address
2641 		 * length rather than the link layer address length. For
2642 		 * ethernet and many other media, the phys_addr and lla are
2643 		 * identical.
2644 		 *
2645 		 * The phys_addr and lla may not be the same for devices that
2646 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2647 		 * no known instances of these.
2648 		 *
2649 		 * For PPP or other interfaces with a zero length
2650 		 * physical address, don't do anything here.
2651 		 * The bcopy() with a zero phys_addr length was previously
2652 		 * a no-op for interfaces with a zero-length physical address.
2653 		 * Using the lla for them would change the way they operate.
2654 		 * Doing nothing in such cases preserves expected behavior.
2655 		 */
2656 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2657 	}
2658 }
2659 
2660 boolean_t
2661 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2662     uint32_t ll_addr_len)
2663 {
2664 	ASSERT(ncec->ncec_lladdr != NULL);
2665 	if (ll_addr == NULL)
2666 		return (B_FALSE);
2667 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2668 		return (B_TRUE);
2669 	return (B_FALSE);
2670 }
2671 
2672 /*
2673  * Updates the link layer address or the reachability state of
2674  * a cache entry.  Reset probe counter if needed.
2675  */
2676 void
2677 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2678 {
2679 	ill_t	*ill = ncec->ncec_ill;
2680 	boolean_t need_stop_timer = B_FALSE;
2681 	boolean_t need_fastpath_update = B_FALSE;
2682 	nce_t	*nce = NULL;
2683 	timeout_id_t tid;
2684 
2685 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2686 	/*
2687 	 * If this interface does not do NUD, there is no point
2688 	 * in allowing an update to the cache entry.  Although
2689 	 * we will respond to NS.
2690 	 * The only time we accept an update for a resolver when
2691 	 * NUD is turned off is when it has just been created.
2692 	 * Non-Resolvers will always be created as REACHABLE.
2693 	 */
2694 	if (new_state != ND_UNCHANGED) {
2695 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2696 		    (ncec->ncec_state != ND_INCOMPLETE))
2697 			return;
2698 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2699 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2700 		need_stop_timer = B_TRUE;
2701 		if (new_state == ND_REACHABLE)
2702 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2703 		else {
2704 			/* We force NUD in this case */
2705 			ncec->ncec_last = 0;
2706 		}
2707 		ncec->ncec_state = new_state;
2708 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2709 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2710 		    new_state == ND_INCOMPLETE);
2711 	}
2712 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2713 		tid = ncec->ncec_timeout_id;
2714 		ncec->ncec_timeout_id = 0;
2715 	}
2716 	/*
2717 	 * Re-trigger fastpath probe and
2718 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2719 	 * whatever packets that happens to be transmitting at the time.
2720 	 */
2721 	if (new_ll_addr != NULL) {
2722 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2723 		    ill->ill_phys_addr_length);
2724 		need_fastpath_update = B_TRUE;
2725 	}
2726 	mutex_exit(&ncec->ncec_lock);
2727 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2728 		if (tid != 0)
2729 			(void) untimeout(tid);
2730 	}
2731 	if (need_fastpath_update) {
2732 		/*
2733 		 * Delete any existing existing dlur_mp and fp_mp information.
2734 		 * For IPMP interfaces, all underlying ill's must be checked
2735 		 * and purged.
2736 		 */
2737 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2738 		/*
2739 		 * add the new dlur_mp and fp_mp
2740 		 */
2741 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2742 		if (nce != NULL)
2743 			nce_refrele(nce);
2744 	}
2745 	mutex_enter(&ncec->ncec_lock);
2746 }
2747 
2748 static void
2749 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2750 {
2751 	uint_t	count = 0;
2752 	mblk_t  **mpp, *tmp;
2753 
2754 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2755 
2756 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2757 		if (++count > ncec->ncec_ill->ill_max_buf) {
2758 			tmp = ncec->ncec_qd_mp->b_next;
2759 			ncec->ncec_qd_mp->b_next = NULL;
2760 			/*
2761 			 * if we never create data addrs on the under_ill
2762 			 * does this matter?
2763 			 */
2764 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2765 			    ipIfStatsOutDiscards);
2766 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2767 			    ncec->ncec_ill);
2768 			freemsg(ncec->ncec_qd_mp);
2769 			ncec->ncec_qd_mp = tmp;
2770 		}
2771 	}
2772 
2773 	if (head_insert) {
2774 		ncec->ncec_nprobes++;
2775 		mp->b_next = ncec->ncec_qd_mp;
2776 		ncec->ncec_qd_mp = mp;
2777 	} else {
2778 		*mpp = mp;
2779 	}
2780 }
2781 
2782 /*
2783  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2784  * queued at the head or tail of the queue based on the input argument
2785  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2786  * packet is an IPMP probe packet, in which case the following happens:
2787  *
2788  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2789  *	(non-ipmp_probe) load-spreading case where the source address of the ND
2790  *	packet is not tied to ncec_ill. If the ill bound to the source address
2791  *	cannot receive, the response to the ND packet will not be received.
2792  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
2793  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2794  *	erroneously conclude that ncec_ill has also failed.
2795  *
2796  *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed on
2797  *	the first attempt.  This ensures that ND problems do not manifest as
2798  *	probe RTT spikes.
2799  *
2800  * We achieve this by inserting ipmp_probe() packets at the head of the
2801  * nce_queue.
2802  *
2803  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2804  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 *
 * The caller must hold ncec_lock.  All of the queueing work is done by
 * nce_queue_mp_common().
2805  */
2806 void
2807 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2808 {
2809 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2810 	nce_queue_mp_common(ncec, mp, head_insert);
2811 }
2812 
2813 /*
2814  * Called when address resolution failed due to a timeout.
2815  * Send an ICMP unreachable in response to all queued packets.
2816  */
2817 void
2818 ndp_resolv_failed(ncec_t *ncec)
2819 {
2820 	mblk_t	*mp, *nxt_mp;
2821 	char	buf[INET6_ADDRSTRLEN];
2822 	ill_t *ill = ncec->ncec_ill;
2823 	ip_recv_attr_t	iras;
2824 
2825 	bzero(&iras, sizeof (iras));
2826 	iras.ira_flags = 0;
2827 	/*
2828 	 * we are setting the ira_rill to the ipmp_ill (instead of
2829 	 * the actual ill on which the packet was received), but this
2830 	 * is ok because we don't actually need the real ira_rill.
2831 	 * to send the icmp unreachable to the sender.
2832 	 */
2833 	iras.ira_ill = iras.ira_rill = ill;
2834 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2835 	iras.ira_rifindex = iras.ira_ruifindex;
2836 
2837 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2838 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2839 	mutex_enter(&ncec->ncec_lock);
2840 	mp = ncec->ncec_qd_mp;
2841 	ncec->ncec_qd_mp = NULL;
2842 	ncec->ncec_nprobes = 0;
2843 	mutex_exit(&ncec->ncec_lock);
2844 	while (mp != NULL) {
2845 		nxt_mp = mp->b_next;
2846 		mp->b_next = NULL;
2847 
2848 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2849 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2850 		    mp, ill);
2851 		icmp_unreachable_v6(mp,
2852 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2853 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2854 		mp = nxt_mp;
2855 	}
2856 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2857 }
2858 
2859 /*
2860  * Handle the completion of NDP and ARP resolution.
2861  */
2862 void
2863 nce_resolv_ok(ncec_t *ncec)
2864 {
2865 	mblk_t *mp;
2866 	uint_t pkt_len;
2867 	iaflags_t ixaflags = IXAF_NO_TRACE;
2868 	nce_t *nce;
2869 	ill_t	*ill = ncec->ncec_ill;
2870 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2871 	ip_stack_t *ipst = ill->ill_ipst;
2872 
2873 	if (IS_IPMP(ncec->ncec_ill)) {
2874 		nce_resolv_ipmp_ok(ncec);
2875 		return;
2876 	}
2877 	/* non IPMP case */
2878 
2879 	mutex_enter(&ncec->ncec_lock);
2880 	ASSERT(ncec->ncec_nprobes == 0);
2881 	mp = ncec->ncec_qd_mp;
2882 	ncec->ncec_qd_mp = NULL;
2883 	mutex_exit(&ncec->ncec_lock);
2884 
2885 	while (mp != NULL) {
2886 		mblk_t *nxt_mp;
2887 
2888 		if (ill->ill_isv6) {
2889 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2890 
2891 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2892 		} else {
2893 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2894 
2895 			ixaflags |= IXAF_IS_IPV4;
2896 			pkt_len = ntohs(ipha->ipha_length);
2897 		}
2898 		nxt_mp = mp->b_next;
2899 		mp->b_next = NULL;
2900 		/*
2901 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2902 		 * longer available, but it's ok to drop this flag because TCP
2903 		 * has its own flow-control in effect, so TCP packets
2904 		 * are not likely to get here when flow-control is in effect.
2905 		 */
2906 		mutex_enter(&ill->ill_lock);
2907 		nce = nce_lookup(ill, &ncec->ncec_addr);
2908 		mutex_exit(&ill->ill_lock);
2909 
2910 		if (nce == NULL) {
2911 			if (isv6) {
2912 				BUMP_MIB(&ipst->ips_ip6_mib,
2913 				    ipIfStatsOutDiscards);
2914 			} else {
2915 				BUMP_MIB(&ipst->ips_ip_mib,
2916 				    ipIfStatsOutDiscards);
2917 			}
2918 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2919 			    mp, NULL);
2920 			freemsg(mp);
2921 		} else {
2922 			/*
2923 			 * We don't know the zoneid, but
2924 			 * ip_xmit does not care since IXAF_NO_TRACE
2925 			 * is set. (We traced the packet the first
2926 			 * time through ip_xmit.)
2927 			 */
2928 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2929 			    ALL_ZONES, 0, NULL);
2930 			nce_refrele(nce);
2931 		}
2932 		mp = nxt_mp;
2933 	}
2934 
2935 	ncec_cb_dispatch(ncec); /* complete callbacks */
2936 }
2937 
2938 /*
2939  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2940  * and the corresponding attributes.
2941  * Disallow states other than ND_REACHABLE or ND_STALE.
2942  */
2943 int
2944 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2945 {
2946 	sin6_t		*sin6;
2947 	in6_addr_t	*addr;
2948 	ncec_t		*ncec;
2949 	nce_t		*nce;
2950 	int		err = 0;
2951 	uint16_t	new_flags = 0;
2952 	uint16_t	old_flags = 0;
2953 	int		inflags = lnr->lnr_flags;
2954 	ip_stack_t	*ipst = ill->ill_ipst;
2955 	boolean_t	do_postprocess = B_FALSE;
2956 
2957 	ASSERT(ill->ill_isv6);
2958 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2959 	    (lnr->lnr_state_create != ND_STALE))
2960 		return (EINVAL);
2961 
2962 	sin6 = (sin6_t *)&lnr->lnr_addr;
2963 	addr = &sin6->sin6_addr;
2964 
2965 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2966 	ASSERT(!IS_UNDER_IPMP(ill));
2967 	nce = nce_lookup_addr(ill, addr);
2968 	if (nce != NULL)
2969 		new_flags = nce->nce_common->ncec_flags;
2970 
2971 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2972 	case NDF_ISROUTER_ON:
2973 		new_flags |= NCE_F_ISROUTER;
2974 		break;
2975 	case NDF_ISROUTER_OFF:
2976 		new_flags &= ~NCE_F_ISROUTER;
2977 		break;
2978 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2979 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2980 		if (nce != NULL)
2981 			nce_refrele(nce);
2982 		return (EINVAL);
2983 	}
2984 	if (inflags & NDF_STATIC)
2985 		new_flags |= NCE_F_STATIC;
2986 
2987 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2988 	case NDF_ANYCAST_ON:
2989 		new_flags |= NCE_F_ANYCAST;
2990 		break;
2991 	case NDF_ANYCAST_OFF:
2992 		new_flags &= ~NCE_F_ANYCAST;
2993 		break;
2994 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2995 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2996 		if (nce != NULL)
2997 			nce_refrele(nce);
2998 		return (EINVAL);
2999 	}
3000 
3001 	if (nce == NULL) {
3002 		err = nce_add_v6(ill,
3003 		    (uchar_t *)lnr->lnr_hdw_addr,
3004 		    ill->ill_phys_addr_length,
3005 		    addr,
3006 		    new_flags,
3007 		    lnr->lnr_state_create,
3008 		    &nce);
3009 		if (err != 0) {
3010 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3011 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3012 			return (err);
3013 		} else {
3014 			do_postprocess = B_TRUE;
3015 		}
3016 	}
3017 	ncec = nce->nce_common;
3018 	old_flags = ncec->ncec_flags;
3019 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3020 		ncec_router_to_host(ncec);
3021 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3022 		if (do_postprocess)
3023 			err = nce_add_v6_postprocess(nce);
3024 		nce_refrele(nce);
3025 		return (0);
3026 	}
3027 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3028 
3029 	if (do_postprocess)
3030 		err = nce_add_v6_postprocess(nce);
3031 	/*
3032 	 * err cannot be anything other than 0 because we don't support
3033 	 * proxy arp of static addresses.
3034 	 */
3035 	ASSERT(err == 0);
3036 
3037 	mutex_enter(&ncec->ncec_lock);
3038 	ncec->ncec_flags = new_flags;
3039 	mutex_exit(&ncec->ncec_lock);
3040 	/*
3041 	 * Note that we ignore the state at this point, which
3042 	 * should be either STALE or REACHABLE.  Instead we let
3043 	 * the link layer address passed in to determine the state
3044 	 * much like incoming packets.
3045 	 */
3046 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3047 	nce_refrele(nce);
3048 	return (0);
3049 }
3050 
3051 /*
3052  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3053  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3054  * be held to ensure that they are in the same group.
3055  */
3056 static nce_t *
3057 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3058 {
3059 
3060 	nce_t *nce;
3061 
3062 	nce = nce_ill_lookup_then_add(ill, ncec);
3063 
3064 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3065 		return (nce);
3066 
3067 	/*
3068 	 * hold the ncec_lock to synchronize with nce_update() so that,
3069 	 * at the end of this function, the contents of nce_dlur_mp are
3070 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3071 	 * packet may have been sent out with a mangled address, which would
3072 	 * only be a transient condition.
3073 	 */
3074 	mutex_enter(&ncec->ncec_lock);
3075 	if (ncec->ncec_lladdr != NULL) {
3076 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3077 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3078 	} else {
3079 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3080 		    ill->ill_sap_length);
3081 	}
3082 	mutex_exit(&ncec->ncec_lock);
3083 	return (nce);
3084 }
3085 
3086 /*
3087  * we make nce_fp_mp to have an M_DATA prepend.
3088  * The caller ensures there is hold on ncec for this function.
3089  * Note that since ill_fastpath_probe() copies the mblk there is
3090  * no need to hold the nce or ncec beyond this function.
3091  *
3092  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3093  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3094  * and will be returned back by this function, so that no extra nce_refrele
3095  * is required for the caller. The calls from nce_add_common() use this
3096  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3097  * nce_refrele of the returned nce (when it is non-null).
3098  */
3099 nce_t *
3100 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3101 {
3102 	nce_t *nce;
3103 	ill_t *ill = ncec->ncec_ill;
3104 
3105 	ASSERT(ill != NULL);
3106 
3107 	if (IS_IPMP(ill) && trigger_fp_req) {
3108 		trigger_fp_req = B_FALSE;
3109 		ipmp_ncec_refresh_nce(ncec);
3110 	}
3111 
3112 	/*
3113 	 * If the caller already has the nce corresponding to the ill, use
3114 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3115 	 * nce_add_common() fall in the former category, and have just done
3116 	 * the nce lookup/add that can be reused.
3117 	 */
3118 	if (ncec_nce == NULL)
3119 		nce = nce_fastpath_create(ill, ncec);
3120 	else
3121 		nce = ncec_nce;
3122 
3123 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3124 		return (nce);
3125 
3126 	if (trigger_fp_req)
3127 		nce_fastpath_trigger(nce);
3128 	return (nce);
3129 }
3130 
3131 /*
3132  * Trigger fastpath on nce. No locks may be held.
3133  */
3134 static void
3135 nce_fastpath_trigger(nce_t *nce)
3136 {
3137 	int res;
3138 	ill_t *ill = nce->nce_ill;
3139 	ncec_t *ncec = nce->nce_common;
3140 
3141 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3142 	/*
3143 	 * EAGAIN is an indication of a transient error
3144 	 * i.e. allocation failure etc. leave the ncec in the list it
3145 	 * will be updated when another probe happens for another ire
3146 	 * if not it will be taken out of the list when the ire is
3147 	 * deleted.
3148 	 */
3149 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3150 		nce_fastpath_list_delete(ill, ncec, NULL);
3151 }
3152 
3153 /*
3154  * Add ncec to the nce fastpath list on ill.
3155  */
3156 static nce_t *
3157 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3158 {
3159 	nce_t *nce = NULL;
3160 
3161 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3162 	/*
3163 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3164 	 * down, before adding the NCE.
3165 	 */
3166 	if (ill->ill_state_flags & ILL_CONDEMNED)
3167 		return (NULL);
3168 	mutex_enter(&ncec->ncec_lock);
3169 	/*
3170 	 * if ncec has not been deleted and
3171 	 * is not already in the list add it.
3172 	 */
3173 	if (!NCE_ISCONDEMNED(ncec)) {
3174 		nce = nce_lookup(ill, &ncec->ncec_addr);
3175 		if (nce != NULL)
3176 			goto done;
3177 		nce = nce_add(ill, ncec);
3178 	}
3179 done:
3180 	mutex_exit(&ncec->ncec_lock);
3181 	return (nce);
3182 }
3183 
3184 nce_t *
3185 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3186 {
3187 	nce_t *nce;
3188 
3189 	mutex_enter(&ill->ill_lock);
3190 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3191 	mutex_exit(&ill->ill_lock);
3192 	return (nce);
3193 }
3194 
3195 
3196 /*
3197  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3198  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3199  * entry after all locks have been dropped.
3200  */
3201 void
3202 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3203 {
3204 	nce_t *nce;
3205 
3206 	ASSERT(ill != NULL);
3207 
3208 	/* delete any nces referencing the ncec from underlying ills */
3209 	if (IS_IPMP(ill))
3210 		ipmp_ncec_delete_nce(ncec);
3211 
3212 	/* now the ill itself */
3213 	mutex_enter(&ill->ill_lock);
3214 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3215 	    nce = list_next(&ill->ill_nce, nce)) {
3216 		if (nce->nce_common == ncec) {
3217 			nce_refhold(nce);
3218 			nce_delete(nce);
3219 			break;
3220 		}
3221 	}
3222 	mutex_exit(&ill->ill_lock);
3223 	if (nce != NULL) {
3224 		if (dead == NULL)
3225 			nce_refrele(nce);
3226 		else
3227 			list_insert_tail(dead, nce);
3228 	}
3229 }
3230 
3231 /*
3232  * when the fastpath response does not fit in the datab
3233  * associated with the existing nce_fp_mp, we delete and
3234  * add the nce to retrigger fastpath based on the information
3235  * in the ncec_t.
3236  */
3237 static nce_t *
3238 nce_delete_then_add(nce_t *nce)
3239 {
3240 	ill_t		*ill = nce->nce_ill;
3241 	nce_t		*newnce = NULL;
3242 
3243 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3244 	    (void *)nce, ill->ill_name));
3245 	mutex_enter(&ill->ill_lock);
3246 	mutex_enter(&nce->nce_common->ncec_lock);
3247 	nce_delete(nce);
3248 	/*
3249 	 * Make sure that ncec is not condemned before adding. We hold the
3250 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3251 	 * ipmp_ncec_delete_nce()
3252 	 */
3253 	if (!NCE_ISCONDEMNED(nce->nce_common))
3254 		newnce = nce_add(ill, nce->nce_common);
3255 	mutex_exit(&nce->nce_common->ncec_lock);
3256 	mutex_exit(&ill->ill_lock);
3257 	nce_refrele(nce);
3258 	return (newnce); /* could be null if nomem */
3259 }
3260 
/*
 * Argument block passed through nce_walk() to nce_fastpath_match_dlur()
 * by nce_fastpath_update().
 */
typedef struct nce_fp_match_s {
	nce_t	*nce_fp_match_res;	/* out: matching nce (held), or NULL */
	mblk_t	*nce_fp_match_ack_mp;	/* in: mblk of the fastpath ack */
} nce_fp_match_t;
3265 
3266 /* ARGSUSED */
3267 static int
3268 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3269 {
3270 	nce_fp_match_t	*nce_fp_marg = arg;
3271 	ncec_t		*ncec = nce->nce_common;
3272 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3273 	uchar_t	*mp_rptr, *ud_mp_rptr;
3274 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3275 	ptrdiff_t	cmplen;
3276 
3277 	/*
3278 	 * mp is the mp associated with the fastpath ack.
3279 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3280 	 * under consideration. If the contents match, then the
3281 	 * fastpath ack is used to update the nce.
3282 	 */
3283 	if (ud_mp == NULL)
3284 		return (0);
3285 	mp_rptr = mp->b_rptr;
3286 	cmplen = mp->b_wptr - mp_rptr;
3287 	ASSERT(cmplen >= 0);
3288 
3289 	ud_mp_rptr = ud_mp->b_rptr;
3290 	/*
3291 	 * The ncec is locked here to prevent any other threads from accessing
3292 	 * and changing nce_dlur_mp when the address becomes resolved to an
3293 	 * lla while we're in the middle of looking at and comparing the
3294 	 * hardware address (lla). It is also locked to prevent multiple
3295 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3296 	 * time.
3297 	 */
3298 	mutex_enter(&ncec->ncec_lock);
3299 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3300 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3301 		nce_fp_marg->nce_fp_match_res = nce;
3302 		mutex_exit(&ncec->ncec_lock);
3303 		nce_refhold(nce);
3304 		return (1);
3305 	}
3306 	mutex_exit(&ncec->ncec_lock);
3307 	return (0);
3308 }
3309 
3310 /*
3311  * Update all NCE's that are not in fastpath mode and
3312  * have an nce_fp_mp that matches mp. mp->b_cont contains
3313  * the fastpath header.
3314  *
3315  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3316  */
3317 void
3318 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3319 {
3320 	nce_fp_match_t nce_fp_marg;
3321 	nce_t *nce;
3322 	mblk_t *nce_fp_mp, *fp_mp;
3323 
3324 	nce_fp_marg.nce_fp_match_res = NULL;
3325 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3326 
3327 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3328 
3329 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3330 		return;
3331 
3332 	mutex_enter(&nce->nce_lock);
3333 	nce_fp_mp = nce->nce_fp_mp;
3334 
3335 	if (nce_fp_mp != NULL) {
3336 		fp_mp = mp->b_cont;
3337 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3338 		    nce_fp_mp->b_datap->db_lim) {
3339 			mutex_exit(&nce->nce_lock);
3340 			nce = nce_delete_then_add(nce);
3341 			if (nce == NULL) {
3342 				return;
3343 			}
3344 			mutex_enter(&nce->nce_lock);
3345 			nce_fp_mp = nce->nce_fp_mp;
3346 		}
3347 	}
3348 
3349 	/* Matched - install mp as the fastpath mp */
3350 	if (nce_fp_mp == NULL) {
3351 		fp_mp = dupb(mp->b_cont);
3352 		nce->nce_fp_mp = fp_mp;
3353 	} else {
3354 		fp_mp = mp->b_cont;
3355 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3356 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3357 		    + MBLKL(fp_mp);
3358 	}
3359 	mutex_exit(&nce->nce_lock);
3360 	nce_refrele(nce);
3361 }
3362 
3363 /*
3364  * Return a pointer to a given option in the packet.
3365  * Assumes that option part of the packet have already been validated.
3366  */
3367 nd_opt_hdr_t *
3368 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3369 {
3370 	while (optlen > 0) {
3371 		if (opt->nd_opt_type == opt_type)
3372 			return (opt);
3373 		optlen -= 8 * opt->nd_opt_len;
3374 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3375 	}
3376 	return (NULL);
3377 }
3378 
3379 /*
3380  * Verify all option lengths present are > 0, also check to see
3381  * if the option lengths and packet length are consistent.
3382  */
3383 boolean_t
3384 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3385 {
3386 	ASSERT(opt != NULL);
3387 	while (optlen > 0) {
3388 		if (opt->nd_opt_len == 0)
3389 			return (B_FALSE);
3390 		optlen -= 8 * opt->nd_opt_len;
3391 		if (optlen < 0)
3392 			return (B_FALSE);
3393 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3394 	}
3395 	return (B_TRUE);
3396 }
3397 
3398 /*
3399  * ncec_walk function.
3400  * Free a fraction of the NCE cache entries.
3401  *
3402  * A possible optimization here would be to use ncec_last where possible, and
3403  * delete the least-frequently used entry, which would require more complex
3404  * computation as we walk through the ncec's (e.g., track ncec entries by
3405  * order of ncec_last and/or maintain state)
3406  */
3407 static void
3408 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3409 {
3410 	ip_stack_t	*ipst = ncec->ncec_ipst;
3411 	uint_t		fraction = *(uint_t *)arg;
3412 	uint_t		rand;
3413 
3414 	if ((ncec->ncec_flags &
3415 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3416 		return;
3417 	}
3418 
3419 	rand = (uint_t)ddi_get_lbolt() +
3420 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3421 	if ((rand/fraction)*fraction == rand) {
3422 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3423 		ncec_delete(ncec);
3424 	}
3425 }
3426 
3427 /*
3428  * kmem_cache callback to free up memory.
3429  *
3430  * For now we just delete a fixed fraction.
3431  */
3432 static void
3433 ip_nce_reclaim_stack(ip_stack_t *ipst)
3434 {
3435 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3436 
3437 	IP_STAT(ipst, ip_nce_reclaim_calls);
3438 
3439 	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3440 
3441 	/*
3442 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3443 	 * Get them to update any stale references to drop any refholds they
3444 	 * have.
3445 	 */
3446 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3447 }
3448 
3449 /*
3450  * Called by the memory allocator subsystem directly, when the system
3451  * is running low on memory.
3452  */
3453 /* ARGSUSED */
3454 void
3455 ip_nce_reclaim(void *args)
3456 {
3457 	netstack_handle_t nh;
3458 	netstack_t *ns;
3459 	ip_stack_t *ipst;
3460 
3461 	netstack_next_init(&nh);
3462 	while ((ns = netstack_next(&nh)) != NULL) {
3463 		/*
3464 		 * netstack_next() can return a netstack_t with a NULL
3465 		 * netstack_ip at boot time.
3466 		 */
3467 		if ((ipst = ns->netstack_ip) == NULL) {
3468 			netstack_rele(ns);
3469 			continue;
3470 		}
3471 		ip_nce_reclaim_stack(ipst);
3472 		netstack_rele(ns);
3473 	}
3474 	netstack_next_fini(&nh);
3475 }
3476 
#ifdef DEBUG
/*
 * Record a reference on ncec with the th_trace facility.  The caller must
 * hold ncec_lock.  If th_trace_ref() fails, tracing is disabled for this
 * ncec and the trace state is cleaned up.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable &&
	    !th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/*
 * Remove a traced reference on ncec, unless tracing has been disabled for
 * it.  The caller must hold ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;
	th_trace_unref(ncec);
}

/*
 * Hand the ncec's trace state to th_trace_cleanup().
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
3507 
3508 /*
3509  * Called when address resolution fails due to a timeout.
3510  * Send an ICMP unreachable in response to all queued packets.
3511  */
3512 void
3513 arp_resolv_failed(ncec_t *ncec)
3514 {
3515 	mblk_t	*mp, *nxt_mp;
3516 	char	buf[INET6_ADDRSTRLEN];
3517 	struct in_addr ipv4addr;
3518 	ill_t *ill = ncec->ncec_ill;
3519 	ip_stack_t *ipst = ncec->ncec_ipst;
3520 	ip_recv_attr_t	iras;
3521 
3522 	bzero(&iras, sizeof (iras));
3523 	iras.ira_flags = IRAF_IS_IPV4;
3524 	/*
3525 	 * we are setting the ira_rill to the ipmp_ill (instead of
3526 	 * the actual ill on which the packet was received), but this
3527 	 * is ok because we don't actually need the real ira_rill.
3528 	 * to send the icmp unreachable to the sender.
3529 	 */
3530 	iras.ira_ill = iras.ira_rill = ill;
3531 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3532 	iras.ira_rifindex = iras.ira_ruifindex;
3533 
3534 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3535 	ip3dbg(("arp_resolv_failed: dst %s\n",
3536 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3537 	mutex_enter(&ncec->ncec_lock);
3538 	mp = ncec->ncec_qd_mp;
3539 	ncec->ncec_qd_mp = NULL;
3540 	ncec->ncec_nprobes = 0;
3541 	mutex_exit(&ncec->ncec_lock);
3542 	while (mp != NULL) {
3543 		nxt_mp = mp->b_next;
3544 		mp->b_next = NULL;
3545 
3546 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3547 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3548 		    mp, ill);
3549 		if (ipst->ips_ip_arp_icmp_error) {
3550 			ip3dbg(("arp_resolv_failed: "
3551 			    "Calling icmp_unreachable\n"));
3552 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3553 		} else {
3554 			freemsg(mp);
3555 		}
3556 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3557 		mp = nxt_mp;
3558 	}
3559 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3560 }
3561 
3562 /*
3563  * if ill is an under_ill, translate it to the ipmp_ill and add the
3564  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3565  * one on the underlying in_ill) will be created for the
3566  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3567  */
3568 int
3569 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3570     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3571 {
3572 	int	err;
3573 	in6_addr_t addr6;
3574 	ip_stack_t *ipst = ill->ill_ipst;
3575 	nce_t	*nce, *upper_nce = NULL;
3576 	ill_t	*in_ill = ill, *under = NULL;
3577 	boolean_t need_ill_refrele = B_FALSE;
3578 
3579 	if (flags & NCE_F_MCAST) {
3580 		/*
3581 		 * hw_addr will be figured out in nce_set_multicast_v4;
3582 		 * caller needs to pass in the cast_ill for ipmp
3583 		 */
3584 		ASSERT(hw_addr == NULL);
3585 		ASSERT(!IS_IPMP(ill));
3586 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3587 		return (err);
3588 	}
3589 
3590 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3591 		ill = ipmp_ill_hold_ipmp_ill(ill);
3592 		if (ill == NULL)
3593 			return (ENXIO);
3594 		need_ill_refrele = B_TRUE;
3595 	}
3596 	if ((flags & NCE_F_BCAST) != 0) {
3597 		/*
3598 		 * IPv4 broadcast ncec: compute the hwaddr.
3599 		 */
3600 		if (IS_IPMP(ill)) {
3601 			under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3602 			if (under == NULL)  {
3603 				if (need_ill_refrele)
3604 					ill_refrele(ill);
3605 				return (ENETDOWN);
3606 			}
3607 			hw_addr = under->ill_bcast_mp->b_rptr +
3608 			    NCE_LL_ADDR_OFFSET(under);
3609 			hw_addr_len = under->ill_phys_addr_length;
3610 		} else {
3611 			hw_addr = ill->ill_bcast_mp->b_rptr +
3612 			    NCE_LL_ADDR_OFFSET(ill),
3613 			    hw_addr_len = ill->ill_phys_addr_length;
3614 		}
3615 	}
3616 
3617 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3618 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3619 	nce = nce_lookup_addr(ill, &addr6);
3620 	if (nce == NULL) {
3621 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3622 		    state, &nce);
3623 	} else {
3624 		err = EEXIST;
3625 	}
3626 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3627 	if (err == 0)
3628 		err = nce_add_v4_postprocess(nce);
3629 
3630 	if (in_ill != ill && nce != NULL) {
3631 		nce_t *under_nce = NULL;
3632 
3633 		/*
3634 		 * in_ill was the under_ill. Try to create the under_nce.
3635 		 * Hold the ill_g_lock to prevent changes to group membership
3636 		 * until we are done.
3637 		 */
3638 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3639 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3640 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3641 			    ill_t *, ill);
3642 			rw_exit(&ipst->ips_ill_g_lock);
3643 			err = ENXIO;
3644 			nce_refrele(nce);
3645 			nce = NULL;
3646 			goto bail;
3647 		}
3648 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3649 		if (under_nce == NULL) {
3650 			rw_exit(&ipst->ips_ill_g_lock);
3651 			err = EINVAL;
3652 			nce_refrele(nce);
3653 			nce = NULL;
3654 			goto bail;
3655 		}
3656 		rw_exit(&ipst->ips_ill_g_lock);
3657 		upper_nce = nce;
3658 		nce = under_nce; /* will be returned to caller */
3659 		if (NCE_ISREACHABLE(nce->nce_common))
3660 			nce_fastpath_trigger(under_nce);
3661 	}
3662 	if (nce != NULL) {
3663 		if (newnce != NULL)
3664 			*newnce = nce;
3665 		else
3666 			nce_refrele(nce);
3667 	}
3668 bail:
3669 	if (under != NULL)
3670 		ill_refrele(under);
3671 	if (upper_nce != NULL)
3672 		nce_refrele(upper_nce);
3673 	if (need_ill_refrele)
3674 		ill_refrele(ill);
3675 
3676 	return (err);
3677 }
3678 
3679 /*
3680  * NDP Cache Entry creation routine for IPv4.
3681  * This routine must always be called with ndp4->ndp_g_lock held.
3682  * Prior to return, ncec_refcnt is incremented.
3683  *
3684  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3685  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3686  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3687  * entries will be created, both pointing at the same ncec_t. The nce_t
3688  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3689  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3690  * Local addresses are always created on the ill passed to nce_add_v4.
3691  */
3692 int
3693 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3694     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3695 {
3696 	int		err;
3697 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3698 	struct in6_addr	addr6;
3699 	nce_t		*nce;
3700 
3701 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3702 	ASSERT(!ill->ill_isv6);
3703 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3704 
3705 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3706 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3707 	    &nce);
3708 	ASSERT(newnce != NULL);
3709 	*newnce = nce;
3710 	return (err);
3711 }
3712 
3713 /*
3714  * Post-processing routine to be executed after nce_add_v4(). This function
3715  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3716  * and must be called without any locks held.
3717  *
3718  * Always returns 0, but we return an int to keep this symmetric with the
3719  * IPv6 counter-part.
3720  */
3721 int
3722 nce_add_v4_postprocess(nce_t *nce)
3723 {
3724 	ncec_t		*ncec = nce->nce_common;
3725 	uint16_t	flags = ncec->ncec_flags;
3726 	boolean_t	ndp_need_dad = B_FALSE;
3727 	boolean_t	dropped;
3728 	clock_t		delay;
3729 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3730 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3731 	boolean_t	trigger_fastpath = B_TRUE;
3732 
3733 	/*
3734 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3735 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3736 	 * We call nce_fastpath from nce_update if the link layer address of
3737 	 * the peer changes from nce_update
3738 	 */
3739 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3740 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3741 		trigger_fastpath = B_FALSE;
3742 
3743 	if (trigger_fastpath)
3744 		nce_fastpath_trigger(nce);
3745 
3746 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3747 		/*
3748 		 * Either the caller (by passing in ND_PROBE)
3749 		 * or nce_add_common() (by the internally computed state
3750 		 * based on ncec_addr and ill_net_type) has determined
3751 		 * that this unicast entry needs DAD. Trigger DAD.
3752 		 */
3753 		ndp_need_dad = B_TRUE;
3754 	} else if (flags & NCE_F_UNSOL_ADV) {
3755 		/*
3756 		 * We account for the transmit below by assigning one
3757 		 * less than the ndd variable. Subsequent decrements
3758 		 * are done in nce_timer.
3759 		 */
3760 		mutex_enter(&ncec->ncec_lock);
3761 		ncec->ncec_unsolicit_count =
3762 		    ipst->ips_ip_arp_publish_count - 1;
3763 		mutex_exit(&ncec->ncec_lock);
3764 		dropped = arp_announce(ncec);
3765 		mutex_enter(&ncec->ncec_lock);
3766 		if (dropped)
3767 			ncec->ncec_unsolicit_count++;
3768 		else
3769 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3770 		if (ncec->ncec_unsolicit_count != 0) {
3771 			nce_start_timer(ncec,
3772 			    ipst->ips_ip_arp_publish_interval);
3773 		}
3774 		mutex_exit(&ncec->ncec_lock);
3775 	}
3776 
3777 	/*
3778 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3779 	 * probe right away.  Do so, and set up for the subsequent probes.
3780 	 */
3781 	if (ndp_need_dad) {
3782 		mutex_enter(&ncec->ncec_lock);
3783 		if (ncec->ncec_pcnt == 0) {
3784 			/*
3785 			 * DAD probes and announce can be
3786 			 * administratively disabled by setting the
3787 			 * probe_count to zero. Restart the timer in
3788 			 * this case to mark the ipif as ready.
3789 			 */
3790 			ncec->ncec_unsolicit_count = 0;
3791 			mutex_exit(&ncec->ncec_lock);
3792 			nce_restart_timer(ncec, 0);
3793 		} else {
3794 			mutex_exit(&ncec->ncec_lock);
3795 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3796 			    ipst->ips_arp_probe_delay :
3797 			    ipst->ips_arp_fastprobe_delay);
3798 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3799 		}
3800 	}
3801 	return (0);
3802 }
3803 
3804 /*
3805  * ncec_walk routine to update all entries that have a given destination or
3806  * gateway address and cached link layer (MAC) address.  This is used when ARP
3807  * informs us that a network-to-link-layer mapping may have changed.
3808  */
3809 void
3810 nce_update_hw_changed(ncec_t *ncec, void *arg)
3811 {
3812 	nce_hw_map_t *hwm = arg;
3813 	ipaddr_t ncec_addr;
3814 
3815 	if (ncec->ncec_state != ND_REACHABLE)
3816 		return;
3817 
3818 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3819 	if (ncec_addr != hwm->hwm_addr)
3820 		return;
3821 
3822 	mutex_enter(&ncec->ncec_lock);
3823 	if (hwm->hwm_flags != 0)
3824 		ncec->ncec_flags = hwm->hwm_flags;
3825 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3826 	mutex_exit(&ncec->ncec_lock);
3827 }
3828 
3829 void
3830 ncec_refhold(ncec_t *ncec)
3831 {
3832 	mutex_enter(&(ncec)->ncec_lock);
3833 	(ncec)->ncec_refcnt++;
3834 	ASSERT((ncec)->ncec_refcnt != 0);
3835 #ifdef DEBUG
3836 	ncec_trace_ref(ncec);
3837 #endif
3838 	mutex_exit(&(ncec)->ncec_lock);
3839 }
3840 
3841 void
3842 ncec_refhold_notr(ncec_t *ncec)
3843 {
3844 	mutex_enter(&(ncec)->ncec_lock);
3845 	(ncec)->ncec_refcnt++;
3846 	ASSERT((ncec)->ncec_refcnt != 0);
3847 	mutex_exit(&(ncec)->ncec_lock);
3848 }
3849 
3850 static void
3851 ncec_refhold_locked(ncec_t *ncec)
3852 {
3853 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3854 	(ncec)->ncec_refcnt++;
3855 #ifdef DEBUG
3856 	ncec_trace_ref(ncec);
3857 #endif
3858 }
3859 
3860 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3861 void
3862 ncec_refrele(ncec_t *ncec)
3863 {
3864 	mutex_enter(&(ncec)->ncec_lock);
3865 #ifdef DEBUG
3866 	ncec_untrace_ref(ncec);
3867 #endif
3868 	ASSERT((ncec)->ncec_refcnt != 0);
3869 	if (--(ncec)->ncec_refcnt == 0) {
3870 		ncec_inactive(ncec);
3871 	} else {
3872 		mutex_exit(&(ncec)->ncec_lock);
3873 	}
3874 }
3875 
3876 void
3877 ncec_refrele_notr(ncec_t *ncec)
3878 {
3879 	mutex_enter(&(ncec)->ncec_lock);
3880 	ASSERT((ncec)->ncec_refcnt != 0);
3881 	if (--(ncec)->ncec_refcnt == 0) {
3882 		ncec_inactive(ncec);
3883 	} else {
3884 		mutex_exit(&(ncec)->ncec_lock);
3885 	}
3886 }
3887 
3888 /*
3889  * Common to IPv4 and IPv6.
3890  */
3891 void
3892 nce_restart_timer(ncec_t *ncec, uint_t ms)
3893 {
3894 	timeout_id_t tid;
3895 
3896 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3897 
3898 	/* First cancel any running timer */
3899 	mutex_enter(&ncec->ncec_lock);
3900 	tid = ncec->ncec_timeout_id;
3901 	ncec->ncec_timeout_id = 0;
3902 	if (tid != 0) {
3903 		mutex_exit(&ncec->ncec_lock);
3904 		(void) untimeout(tid);
3905 		mutex_enter(&ncec->ncec_lock);
3906 	}
3907 
3908 	/* Restart timer */
3909 	nce_start_timer(ncec, ms);
3910 	mutex_exit(&ncec->ncec_lock);
3911 }
3912 
3913 static void
3914 nce_start_timer(ncec_t *ncec, uint_t ms)
3915 {
3916 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3917 	/*
3918 	 * Don't start the timer if the ncec has been deleted, or if the timer
3919 	 * is already running
3920 	 */
3921 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3922 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3923 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3924 	}
3925 }
3926 
3927 int
3928 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3929     uint16_t flags, nce_t **newnce)
3930 {
3931 	uchar_t		*hw_addr;
3932 	int		err = 0;
3933 	ip_stack_t	*ipst = ill->ill_ipst;
3934 	in6_addr_t	dst6;
3935 	nce_t		*nce;
3936 
3937 	ASSERT(!ill->ill_isv6);
3938 
3939 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3940 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3941 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3942 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3943 		goto done;
3944 	}
3945 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3946 		/*
3947 		 * For IRE_IF_RESOLVER a hardware mapping can be
3948 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
3949 		 * in the ill is copied in nce_add_v4().
3950 		 */
3951 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3952 		if (hw_addr == NULL) {
3953 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3954 			return (ENOMEM);
3955 		}
3956 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3957 	} else {
3958 		/*
3959 		 * IRE_IF_NORESOLVER type simply copies the resolution
3960 		 * cookie passed in.  So no hw_addr is needed.
3961 		 */
3962 		hw_addr = NULL;
3963 	}
3964 	ASSERT(flags & NCE_F_MCAST);
3965 	ASSERT(flags & NCE_F_NONUD);
3966 	/* nce_state will be computed by nce_add_common() */
3967 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3968 	    ND_UNCHANGED, &nce);
3969 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3970 	if (err == 0)
3971 		err = nce_add_v4_postprocess(nce);
3972 	if (hw_addr != NULL)
3973 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3974 	if (err != 0) {
3975 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3976 		return (err);
3977 	}
3978 done:
3979 	if (newnce != NULL)
3980 		*newnce = nce;
3981 	else
3982 		nce_refrele(nce);
3983 	return (0);
3984 }
3985 
3986 /*
3987  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3988  * don't want to have to walk the list for every single one, so we gather up
3989  * batches at a time.
3990  */
3991 #define	NCE_RESCHED_LIST_LEN	8
3992 
3993 typedef struct {
3994 	ill_t	*ncert_ill;
3995 	uint_t	ncert_num;
3996 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
3997 } nce_resched_t;
3998 
3999 /*
4000  * Pick the longest waiting NCEs for defense.
4001  */
4002 /* ARGSUSED */
4003 static int
4004 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4005 {
4006 	nce_resched_t *ncert = arg;
4007 	ncec_t **ncecs;
4008 	ncec_t **ncec_max;
4009 	ncec_t *ncec_temp;
4010 	ncec_t *ncec = nce->nce_common;
4011 
4012 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4013 	/*
4014 	 * Only reachable entries that are ready for announcement are eligible.
4015 	 */
4016 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4017 		return (0);
4018 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4019 		ncec_refhold(ncec);
4020 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4021 	} else {
4022 		ncecs = ncert->ncert_nces;
4023 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4024 		ncec_refhold(ncec);
4025 		for (; ncecs < ncec_max; ncecs++) {
4026 			ASSERT(ncec != NULL);
4027 			if ((*ncecs)->ncec_last_time_defended >
4028 			    ncec->ncec_last_time_defended) {
4029 				ncec_temp = *ncecs;
4030 				*ncecs = ncec;
4031 				ncec = ncec_temp;
4032 			}
4033 		}
4034 		ncec_refrele(ncec);
4035 	}
4036 	return (0);
4037 }
4038 
4039 /*
4040  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4041  * doesn't happen very often (if at all), and thus it needn't be highly
4042  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4043  * outer loop is bounded by a constant rather than by the length of the list.)
4044  */
4045 static void
4046 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4047 {
4048 	ncec_t		*ncec;
4049 	ip_stack_t	*ipst = ill->ill_ipst;
4050 	uint_t		i, defend_rate;
4051 
4052 	i = ill->ill_defend_count;
4053 	ill->ill_defend_count = 0;
4054 	if (ill->ill_isv6)
4055 		defend_rate = ipst->ips_ndp_defend_rate;
4056 	else
4057 		defend_rate = ipst->ips_arp_defend_rate;
4058 	/* If none could be sitting around, then don't reschedule */
4059 	if (i < defend_rate) {
4060 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4061 		return;
4062 	}
4063 	ncert->ncert_ill = ill;
4064 	while (ill->ill_defend_count < defend_rate) {
4065 		nce_walk_common(ill, ncec_reschedule, ncert);
4066 		for (i = 0; i < ncert->ncert_num; i++) {
4067 
4068 			ncec = ncert->ncert_nces[i];
4069 			mutex_enter(&ncec->ncec_lock);
4070 			ncec->ncec_flags |= NCE_F_DELAYED;
4071 			mutex_exit(&ncec->ncec_lock);
4072 			/*
4073 			 * we plan to schedule this ncec, so incr the
4074 			 * defend_count in anticipation.
4075 			 */
4076 			if (++ill->ill_defend_count >= defend_rate)
4077 				break;
4078 		}
4079 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4080 			break;
4081 	}
4082 }
4083 
4084 /*
4085  * Check if the current rate-limiting parameters permit the sending
4086  * of another address defense announcement for both IPv4 and IPv6.
4087  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4088  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4089  * determines how many address defense announcements are permitted
4090  * in any `defense_perio' interval.
4091  */
4092 static boolean_t
4093 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4094 {
4095 	clock_t		now = ddi_get_lbolt();
4096 	ip_stack_t	*ipst = ill->ill_ipst;
4097 	clock_t		start = ill->ill_defend_start;
4098 	uint32_t	elapsed, defend_period, defend_rate;
4099 	nce_resched_t	ncert;
4100 	boolean_t	ret;
4101 	int		i;
4102 
4103 	if (ill->ill_isv6) {
4104 		defend_period = ipst->ips_ndp_defend_period;
4105 		defend_rate = ipst->ips_ndp_defend_rate;
4106 	} else {
4107 		defend_period = ipst->ips_arp_defend_period;
4108 		defend_rate = ipst->ips_arp_defend_rate;
4109 	}
4110 	if (defend_rate == 0)
4111 		return (B_TRUE);
4112 	bzero(&ncert, sizeof (ncert));
4113 	mutex_enter(&ill->ill_lock);
4114 	if (start > 0) {
4115 		elapsed = now - start;
4116 		if (elapsed > SEC_TO_TICK(defend_period)) {
4117 			ill->ill_defend_start = now;
4118 			/*
4119 			 * nce_ill_reschedule will attempt to
4120 			 * prevent starvation by reschduling the
4121 			 * oldest entries, which are marked with
4122 			 * the NCE_F_DELAYED flag.
4123 			 */
4124 			nce_ill_reschedule(ill, &ncert);
4125 		}
4126 	} else {
4127 		ill->ill_defend_start = now;
4128 	}
4129 	ASSERT(ill->ill_defend_count <= defend_rate);
4130 	mutex_enter(&ncec->ncec_lock);
4131 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4132 		/*
4133 		 * This ncec was rescheduled as one of the really old
4134 		 * entries needing on-going defense. The
4135 		 * ill_defend_count was already incremented in
4136 		 * nce_ill_reschedule. Go ahead and send the announce.
4137 		 */
4138 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4139 		mutex_exit(&ncec->ncec_lock);
4140 		ret = B_FALSE;
4141 		goto done;
4142 	}
4143 	mutex_exit(&ncec->ncec_lock);
4144 	if (ill->ill_defend_count < defend_rate)
4145 		ill->ill_defend_count++;
4146 	if (ill->ill_defend_count == defend_rate) {
4147 		/*
4148 		 * we are no longer allowed to send unbidden defense
4149 		 * messages. Wait for rescheduling.
4150 		 */
4151 		ret = B_TRUE;
4152 	} else {
4153 		ret = B_FALSE;
4154 	}
4155 done:
4156 	mutex_exit(&ill->ill_lock);
4157 	/*
4158 	 * After all the locks have been dropped we can restart nce timer,
4159 	 * and refrele the delayed ncecs
4160 	 */
4161 	for (i = 0; i < ncert.ncert_num; i++) {
4162 		clock_t	xmit_interval;
4163 		ncec_t	*tmp;
4164 
4165 		tmp = ncert.ncert_nces[i];
4166 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4167 		    B_FALSE);
4168 		nce_restart_timer(tmp, xmit_interval);
4169 		ncec_refrele(tmp);
4170 	}
4171 	return (ret);
4172 }
4173 
4174 boolean_t
4175 ndp_announce(ncec_t *ncec)
4176 {
4177 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4178 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4179 	    nce_advert_flags(ncec)));
4180 }
4181 
4182 ill_t *
4183 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4184 {
4185 	mblk_t		*mp;
4186 	in6_addr_t	src6;
4187 	ipaddr_t	src4;
4188 	ill_t		*ill = ncec->ncec_ill;
4189 	ill_t		*src_ill = NULL;
4190 	ipif_t		*ipif = NULL;
4191 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4192 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4193 
4194 	ASSERT(src != NULL);
4195 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4196 	src6 = *src;
4197 	if (is_myaddr) {
4198 		src6 = ncec->ncec_addr;
4199 		if (!isv6)
4200 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4201 	} else {
4202 		/*
4203 		 * try to find one from the outgoing packet.
4204 		 */
4205 		mutex_enter(&ncec->ncec_lock);
4206 		mp = ncec->ncec_qd_mp;
4207 		if (mp != NULL) {
4208 			if (isv6) {
4209 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4210 
4211 				src6 = ip6h->ip6_src;
4212 			} else {
4213 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4214 
4215 				src4 = ipha->ipha_src;
4216 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4217 			}
4218 		}
4219 		mutex_exit(&ncec->ncec_lock);
4220 	}
4221 
4222 	/*
4223 	 * For outgoing packets, if the src of outgoing packet is one
4224 	 * of the assigned interface addresses use it, otherwise we
4225 	 * will pick the source address below.
4226 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4227 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4228 	 * (non-IPMP) ncec_ill for these message types. The only case
4229 	 * of unicast DAD messages are for IPv6 ND probes, for which
4230 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4231 	 */
4232 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4233 		if (isv6) {
4234 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4235 			    ill->ill_ipst);
4236 		} else {
4237 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4238 			    ill->ill_ipst);
4239 		}
4240 
4241 		/*
4242 		 * If no relevant ipif can be found, then it's not one of our
4243 		 * addresses.  Reset to :: and try to find a src for the NS or
4244 		 * ARP request using ipif_select_source_v[4,6]  below.
4245 		 * If an ipif can be found, but it's not yet done with
4246 		 * DAD verification, and we are not being invoked for
4247 		 * DAD (i.e., !is_myaddr), then just postpone this
4248 		 * transmission until later.
4249 		 */
4250 		if (ipif == NULL) {
4251 			src6 = ipv6_all_zeros;
4252 			src4 = INADDR_ANY;
4253 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4254 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4255 			    ncec_t *, ncec, ipif_t *, ipif);
4256 			ipif_refrele(ipif);
4257 			return (NULL);
4258 		}
4259 	}
4260 
4261 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4262 		/*
4263 		 * Pick a source address for this solicitation, but
4264 		 * restrict the selection to addresses assigned to the
4265 		 * output interface.  We do this because the destination will
4266 		 * create a neighbor cache entry for the source address of
4267 		 * this packet, so the source address had better be a valid
4268 		 * neighbor.
4269 		 */
4270 		if (isv6) {
4271 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4272 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4273 			    B_FALSE, NULL);
4274 		} else {
4275 			ipaddr_t nce_addr;
4276 
4277 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4278 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4279 			    B_FALSE, NULL);
4280 		}
4281 		if (ipif == NULL && IS_IPMP(ill)) {
4282 			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4283 
4284 			if (send_ill != NULL) {
4285 				if (isv6) {
4286 					ipif = ipif_select_source_v6(send_ill,
4287 					    &ncec->ncec_addr, B_TRUE,
4288 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4289 					    B_FALSE, NULL);
4290 				} else {
4291 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4292 					    src4);
4293 					ipif = ipif_select_source_v4(send_ill,
4294 					    src4, ALL_ZONES, B_TRUE, NULL);
4295 				}
4296 				ill_refrele(send_ill);
4297 			}
4298 		}
4299 
4300 		if (ipif == NULL) {
4301 			char buf[INET6_ADDRSTRLEN];
4302 
4303 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4304 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4305 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4306 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4307 			return (NULL);
4308 		}
4309 		src6 = ipif->ipif_v6lcl_addr;
4310 	}
4311 	*src = src6;
4312 	if (ipif != NULL) {
4313 		src_ill = ipif->ipif_ill;
4314 		if (IS_IPMP(src_ill))
4315 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4316 		else
4317 			ill_refhold(src_ill);
4318 		ipif_refrele(ipif);
4319 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4320 		    ill_t *, src_ill);
4321 	}
4322 	return (src_ill);
4323 }
4324 
4325 void
4326 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4327     uchar_t *hwaddr, int hwaddr_len, int flags)
4328 {
4329 	ill_t	*ill;
4330 	ncec_t	*ncec;
4331 	nce_t	*nce;
4332 	uint16_t new_state;
4333 
4334 	ill = (ipif ? ipif->ipif_ill : NULL);
4335 	if (ill != NULL) {
4336 		/*
4337 		 * only one ncec is possible
4338 		 */
4339 		nce = nce_lookup_v4(ill, addr);
4340 		if (nce != NULL) {
4341 			ncec = nce->nce_common;
4342 			mutex_enter(&ncec->ncec_lock);
4343 			if (NCE_ISREACHABLE(ncec))
4344 				new_state = ND_UNCHANGED;
4345 			else
4346 				new_state = ND_STALE;
4347 			ncec->ncec_flags = flags;
4348 			nce_update(ncec, new_state, hwaddr);
4349 			mutex_exit(&ncec->ncec_lock);
4350 			nce_refrele(nce);
4351 			return;
4352 		}
4353 	} else {
4354 		/*
4355 		 * ill is wildcard; clean up all ncec's and ire's
4356 		 * that match on addr.
4357 		 */
4358 		nce_hw_map_t hwm;
4359 
4360 		hwm.hwm_addr = *addr;
4361 		hwm.hwm_hwlen = hwaddr_len;
4362 		hwm.hwm_hwaddr = hwaddr;
4363 		hwm.hwm_flags = flags;
4364 
4365 		ncec_walk_common(ipst->ips_ndp4, NULL,
4366 		    nce_update_hw_changed, &hwm, B_TRUE);
4367 	}
4368 }
4369 
4370 /*
4371  * Common function to add ncec entries.
4372  * we always add the ncec with ncec_ill == ill, and always create
4373  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4374  * ncec is !reachable.
4375  *
4376  * When the caller passes in an nce_state of ND_UNCHANGED,
4377  * nce_add_common() will determine the state of the created nce based
4378  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4379  * be created with state set to the passed in nce_state.
4380  */
4381 static int
4382 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4383     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4384 {
4385 	static	ncec_t		nce_nil;
4386 	uchar_t			*template = NULL;
4387 	int			err;
4388 	ncec_t			*ncec;
4389 	ncec_t			**ncep;
4390 	ip_stack_t		*ipst = ill->ill_ipst;
4391 	uint16_t		state;
4392 	boolean_t		fastprobe = B_FALSE;
4393 	struct ndp_g_s		*ndp;
4394 	nce_t			*nce = NULL;
4395 	mblk_t			*dlur_mp = NULL;
4396 
4397 	if (ill->ill_isv6)
4398 		ndp = ill->ill_ipst->ips_ndp6;
4399 	else
4400 		ndp = ill->ill_ipst->ips_ndp4;
4401 
4402 	*retnce = NULL;
4403 
4404 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4405 
4406 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4407 		ip0dbg(("nce_add_common: no addr\n"));
4408 		return (EINVAL);
4409 	}
4410 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4411 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4412 		return (EINVAL);
4413 	}
4414 
4415 	if (ill->ill_isv6) {
4416 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4417 	} else {
4418 		ipaddr_t v4addr;
4419 
4420 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4421 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4422 	}
4423 
4424 	/*
4425 	 * The caller has ensured that there is no nce on ill, but there could
4426 	 * still be an nce_common_t for the address, so that we find exisiting
4427 	 * ncec_t strucutures first, and atomically add a new nce_t if
4428 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4429 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4430 	 * compare for matches across the illgrp because this function is
4431 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4432 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4433 	 * appropriate.
4434 	 */
4435 	ncec = *ncep;
4436 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4437 		if (ncec->ncec_ill == ill) {
4438 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4439 				/*
4440 				 * We should never find *retnce to be
4441 				 * MYADDR, since the caller may then
4442 				 * incorrectly restart a DAD timer that's
4443 				 * already running.  However, if we are in
4444 				 * forwarding mode, and the interface is
4445 				 * moving in/out of groups, the data
4446 				 * path ire lookup (e.g., ire_revalidate_nce)
4447 				 * may  have determined that some destination
4448 				 * is offlink while the control path is adding
4449 				 * that address as a local address.
4450 				 * Recover from  this case by failing the
4451 				 * lookup
4452 				 */
4453 				if (NCE_MYADDR(ncec))
4454 					return (ENXIO);
4455 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4456 				if (*retnce != NULL)
4457 					break;
4458 			}
4459 		}
4460 	}
4461 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4462 		return (0);
4463 
4464 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4465 	if (ncec == NULL)
4466 		return (ENOMEM);
4467 	*ncec = nce_nil;
4468 	ncec->ncec_ill = ill;
4469 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4470 	ncec->ncec_flags = flags;
4471 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4472 
4473 	if (!ill->ill_isv6) {
4474 		ipaddr_t addr4;
4475 
4476 		/*
4477 		 * DAD probe interval and probe count are set based on
4478 		 * fast/slow probe settings. If the underlying link doesn't
4479 		 * have reliably up/down notifications or if we're working
4480 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4481 		 * don't use the fast timers.  Otherwise, use them.
4482 		 */
4483 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4484 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4485 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4486 			fastprobe = B_TRUE;
4487 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4488 		    !IS_IPV4_LL_SPACE(&addr4)) {
4489 			ill_t *hwaddr_ill;
4490 
4491 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4492 			    hw_addr_len);
4493 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4494 				fastprobe = B_TRUE;
4495 		}
4496 		if (fastprobe) {
4497 			ncec->ncec_xmit_interval =
4498 			    ipst->ips_arp_fastprobe_interval;
4499 			ncec->ncec_pcnt =
4500 			    ipst->ips_arp_fastprobe_count;
4501 			ncec->ncec_flags |= NCE_F_FAST;
4502 		} else {
4503 			ncec->ncec_xmit_interval =
4504 			    ipst->ips_arp_probe_interval;
4505 			ncec->ncec_pcnt =
4506 			    ipst->ips_arp_probe_count;
4507 		}
4508 		if (NCE_PUBLISH(ncec)) {
4509 			ncec->ncec_unsolicit_count =
4510 			    ipst->ips_ip_arp_publish_count;
4511 		}
4512 	} else {
4513 		/*
4514 		 * probe interval is constant: ILL_PROBE_INTERVAL
4515 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4516 		 */
4517 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4518 		if (NCE_PUBLISH(ncec)) {
4519 			ncec->ncec_unsolicit_count =
4520 			    ipst->ips_ip_ndp_unsolicit_count;
4521 		}
4522 	}
4523 	ncec->ncec_rcnt = ill->ill_xmit_count;
4524 	ncec->ncec_addr = *addr;
4525 	ncec->ncec_qd_mp = NULL;
4526 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4527 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4528 	ncec->ncec_trace_disable = B_FALSE;
4529 
4530 	/*
4531 	 * ncec_lladdr holds link layer address
4532 	 */
4533 	if (hw_addr_len > 0) {
4534 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4535 		if (template == NULL) {
4536 			err = ENOMEM;
4537 			goto err_ret;
4538 		}
4539 		ncec->ncec_lladdr = template;
4540 		ncec->ncec_lladdr_length = hw_addr_len;
4541 		bzero(ncec->ncec_lladdr, hw_addr_len);
4542 	}
4543 	if ((flags & NCE_F_BCAST) != 0) {
4544 		state = ND_REACHABLE;
4545 		ASSERT(hw_addr_len > 0);
4546 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4547 		state = ND_INITIAL;
4548 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4549 		/*
4550 		 * NORESOLVER entries are always created in the REACHABLE
4551 		 * state.
4552 		 */
4553 		state = ND_REACHABLE;
4554 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4555 		    ill->ill_mactype != DL_IPV4 &&
4556 		    ill->ill_mactype != DL_6TO4) {
4557 			/*
4558 			 * We create a nce_res_mp with the IP nexthop address
4559 			 * as the destination address if the physical length
4560 			 * is exactly 4 bytes for point-to-multipoint links
4561 			 * that do their own resolution from IP to link-layer
4562 			 * address (e.g. IP over X.25).
4563 			 */
4564 			bcopy((uchar_t *)addr,
4565 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4566 		}
4567 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4568 		    ill->ill_mactype != DL_IPV6) {
4569 			/*
4570 			 * We create a nce_res_mp with the IP nexthop address
4571 			 * as the destination address if the physical legnth
4572 			 * is exactly 16 bytes for point-to-multipoint links
4573 			 * that do their own resolution from IP to link-layer
4574 			 * address.
4575 			 */
4576 			bcopy((uchar_t *)addr,
4577 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4578 		}
4579 		/*
4580 		 * Since NUD is not part of the base IPv4 protocol definition,
4581 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4582 		 * age, and are marked NCE_F_NONUD.
4583 		 */
4584 		if (!ill->ill_isv6)
4585 			ncec->ncec_flags |= NCE_F_NONUD;
4586 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4587 		state = ND_REACHABLE;
4588 	}
4589 
4590 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4591 		/*
4592 		 * We are adding an ncec with a deterministic hw_addr,
4593 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4594 		 *
4595 		 * if we are adding a unicast ncec for the local address
4596 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4597 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4598 		 * addresses are added in PROBE to trigger DAD.
4599 		 */
4600 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4601 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4602 			state = ND_REACHABLE;
4603 		else if (!NCE_PUBLISH(ncec))
4604 			state = ND_STALE;
4605 		else
4606 			state = ND_PROBE;
4607 		if (hw_addr != NULL)
4608 			nce_set_ll(ncec, hw_addr);
4609 	}
4610 	/* caller overrides internally computed state */
4611 	if (nce_state != ND_UNCHANGED)
4612 		state = nce_state;
4613 
4614 	if (state == ND_PROBE)
4615 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4616 
4617 	ncec->ncec_state = state;
4618 
4619 	if (state == ND_REACHABLE) {
4620 		ncec->ncec_last = ncec->ncec_init_time =
4621 		    TICK_TO_MSEC(ddi_get_lbolt64());
4622 	} else {
4623 		ncec->ncec_last = 0;
4624 		if (state == ND_INITIAL)
4625 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4626 	}
4627 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4628 	    offsetof(ncec_cb_t, ncec_cb_node));
4629 	/*
4630 	 * have all the memory allocations out of the way before taking locks
4631 	 * and adding the nce.
4632 	 */
4633 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4634 	if (nce == NULL) {
4635 		err = ENOMEM;
4636 		goto err_ret;
4637 	}
4638 	if (ncec->ncec_lladdr != NULL ||
4639 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4640 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4641 		    ill->ill_phys_addr_length, ill->ill_sap,
4642 		    ill->ill_sap_length);
4643 		if (dlur_mp == NULL) {
4644 			err = ENOMEM;
4645 			goto err_ret;
4646 		}
4647 	}
4648 
4649 	/*
4650 	 * Atomically ensure that the ill is not CONDEMNED, before
4651 	 * adding the NCE.
4652 	 */
4653 	mutex_enter(&ill->ill_lock);
4654 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4655 		mutex_exit(&ill->ill_lock);
4656 		err = EINVAL;
4657 		goto err_ret;
4658 	}
4659 	if (!NCE_MYADDR(ncec) &&
4660 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4661 		mutex_exit(&ill->ill_lock);
4662 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4663 		err = EINVAL;
4664 		goto err_ret;
4665 	}
4666 	/*
4667 	 * Acquire the ncec_lock even before adding the ncec to the list
4668 	 * so that it cannot get deleted after the ncec is added, but
4669 	 * before we add the nce.
4670 	 */
4671 	mutex_enter(&ncec->ncec_lock);
4672 	if ((ncec->ncec_next = *ncep) != NULL)
4673 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4674 	*ncep = ncec;
4675 	ncec->ncec_ptpn = ncep;
4676 
4677 	/* Bump up the number of ncec's referencing this ill */
4678 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4679 	    (char *), "ncec", (void *), ncec);
4680 	ill->ill_ncec_cnt++;
4681 	/*
4682 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4683 	 * condemned, and we can safely add the nce.
4684 	 */
4685 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4686 	mutex_exit(&ncec->ncec_lock);
4687 	mutex_exit(&ill->ill_lock);
4688 
4689 	/* caller must trigger fastpath on *retnce */
4690 	return (0);
4691 
4692 err_ret:
4693 	if (ncec != NULL)
4694 		kmem_cache_free(ncec_cache, ncec);
4695 	if (nce != NULL)
4696 		kmem_cache_free(nce_cache, nce);
4697 	freemsg(dlur_mp);
4698 	if (template != NULL)
4699 		kmem_free(template, ill->ill_phys_addr_length);
4700 	return (err);
4701 }
4702 
4703 /*
4704  * take a ref on the nce
4705  */
4706 void
4707 nce_refhold(nce_t *nce)
4708 {
4709 	mutex_enter(&nce->nce_lock);
4710 	nce->nce_refcnt++;
4711 	ASSERT((nce)->nce_refcnt != 0);
4712 	mutex_exit(&nce->nce_lock);
4713 }
4714 
4715 /*
4716  * release a ref on the nce; In general, this
4717  * cannot be called with locks held because nce_inactive
4718  * may result in nce_inactive which will take the ill_lock,
4719  * do ipif_ill_refrele_tail etc. Thus the one exception
4720  * where this can be called with locks held is when the caller
4721  * is certain that the nce_refcnt is sufficient to prevent
4722  * the invocation of nce_inactive.
4723  */
4724 void
4725 nce_refrele(nce_t *nce)
4726 {
4727 	ASSERT((nce)->nce_refcnt != 0);
4728 	mutex_enter(&nce->nce_lock);
4729 	if (--nce->nce_refcnt == 0)
4730 		nce_inactive(nce); /* destroys the mutex */
4731 	else
4732 		mutex_exit(&nce->nce_lock);
4733 }
4734 
4735 /*
4736  * free the nce after all refs have gone away.
4737  */
4738 static void
4739 nce_inactive(nce_t *nce)
4740 {
4741 	ill_t *ill = nce->nce_ill;
4742 
4743 	ASSERT(nce->nce_refcnt == 0);
4744 
4745 	ncec_refrele_notr(nce->nce_common);
4746 	nce->nce_common = NULL;
4747 	freemsg(nce->nce_fp_mp);
4748 	freemsg(nce->nce_dlur_mp);
4749 
4750 	mutex_enter(&ill->ill_lock);
4751 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4752 	    (char *), "nce", (void *), nce);
4753 	ill->ill_nce_cnt--;
4754 	nce->nce_ill = NULL;
4755 	/*
4756 	 * If the number of ncec's associated with this ill have dropped
4757 	 * to zero, check whether we need to restart any operation that
4758 	 * is waiting for this to happen.
4759 	 */
4760 	if (ILL_DOWN_OK(ill)) {
4761 		/* ipif_ill_refrele_tail drops the ill_lock */
4762 		ipif_ill_refrele_tail(ill);
4763 	} else {
4764 		mutex_exit(&ill->ill_lock);
4765 	}
4766 
4767 	mutex_destroy(&nce->nce_lock);
4768 	kmem_cache_free(nce_cache, nce);
4769 }
4770 
4771 /*
4772  * Add an nce to the ill_nce list.
4773  */
4774 static nce_t *
4775 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4776 {
4777 	bzero(nce, sizeof (*nce));
4778 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4779 	nce->nce_common = ncec;
4780 	nce->nce_addr = ncec->ncec_addr;
4781 	nce->nce_ill = ill;
4782 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4783 	    (char *), "nce", (void *), nce);
4784 	ill->ill_nce_cnt++;
4785 
4786 	nce->nce_refcnt = 1; /* for the thread */
4787 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4788 	nce->nce_dlur_mp = dlur_mp;
4789 
4790 	/* add nce to the ill's fastpath list.  */
4791 	nce->nce_refcnt++; /* for the list */
4792 	list_insert_head(&ill->ill_nce, nce);
4793 	return (nce);
4794 }
4795 
4796 static nce_t *
4797 nce_add(ill_t *ill, ncec_t *ncec)
4798 {
4799 	nce_t	*nce;
4800 	mblk_t	*dlur_mp = NULL;
4801 
4802 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4803 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4804 
4805 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4806 	if (nce == NULL)
4807 		return (NULL);
4808 	if (ncec->ncec_lladdr != NULL ||
4809 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4810 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4811 		    ill->ill_phys_addr_length, ill->ill_sap,
4812 		    ill->ill_sap_length);
4813 		if (dlur_mp == NULL) {
4814 			kmem_cache_free(nce_cache, nce);
4815 			return (NULL);
4816 		}
4817 	}
4818 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4819 }
4820 
4821 /*
4822  * remove the nce from the ill_faspath list
4823  */
4824 void
4825 nce_delete(nce_t *nce)
4826 {
4827 	ill_t	*ill = nce->nce_ill;
4828 
4829 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4830 
4831 	mutex_enter(&nce->nce_lock);
4832 	if (nce->nce_is_condemned) {
4833 		/*
4834 		 * some other thread has removed this nce from the ill_nce list
4835 		 */
4836 		mutex_exit(&nce->nce_lock);
4837 		return;
4838 	}
4839 	nce->nce_is_condemned = B_TRUE;
4840 	mutex_exit(&nce->nce_lock);
4841 
4842 	list_remove(&ill->ill_nce, nce);
4843 	/*
4844 	 * even though we are holding the ill_lock, it is ok to
4845 	 * call nce_refrele here because we know that we should have
4846 	 * at least 2 refs on the nce: one for the thread, and one
4847 	 * for the list. The refrele below will release the one for
4848 	 * the list.
4849 	 */
4850 	nce_refrele(nce);
4851 }
4852 
4853 nce_t *
4854 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4855 {
4856 	nce_t *nce = NULL;
4857 
4858 	ASSERT(ill != NULL);
4859 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4860 
4861 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4862 	    nce = list_next(&ill->ill_nce, nce)) {
4863 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4864 			break;
4865 	}
4866 
4867 	/*
4868 	 * if we found the nce on the ill_nce list while holding
4869 	 * the ill_lock, then it cannot be condemned yet.
4870 	 */
4871 	if (nce != NULL) {
4872 		ASSERT(!nce->nce_is_condemned);
4873 		nce_refhold(nce);
4874 	}
4875 	return (nce);
4876 }
4877 
4878 /*
4879  * Walk the ill_nce list on ill. The callback function func() cannot perform
4880  * any destructive actions.
4881  */
4882 static void
4883 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4884 {
4885 	nce_t *nce = NULL, *nce_next;
4886 
4887 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4888 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4889 		nce_next = list_next(&ill->ill_nce, nce);
4890 		if (func(ill, nce, arg) != 0)
4891 			break;
4892 		nce = nce_next;
4893 	}
4894 }
4895 
4896 void
4897 nce_walk(ill_t *ill, pfi_t func, void *arg)
4898 {
4899 	mutex_enter(&ill->ill_lock);
4900 	nce_walk_common(ill, func, arg);
4901 	mutex_exit(&ill->ill_lock);
4902 }
4903 
4904 void
4905 nce_flush(ill_t *ill, boolean_t flushall)
4906 {
4907 	nce_t *nce, *nce_next;
4908 	list_t dead;
4909 
4910 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4911 	mutex_enter(&ill->ill_lock);
4912 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4913 		nce_next = list_next(&ill->ill_nce, nce);
4914 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4915 			nce = nce_next;
4916 			continue;
4917 		}
4918 		/*
4919 		 * nce_delete requires that the caller should either not
4920 		 * be holding locks, or should hold a ref to ensure that
4921 		 * we wont hit ncec_inactive. So take a ref and clean up
4922 		 * after the list is flushed.
4923 		 */
4924 		nce_refhold(nce);
4925 		nce_delete(nce);
4926 		list_insert_tail(&dead, nce);
4927 		nce = nce_next;
4928 	}
4929 	mutex_exit(&ill->ill_lock);
4930 	while ((nce = list_head(&dead)) != NULL) {
4931 		list_remove(&dead, nce);
4932 		nce_refrele(nce);
4933 	}
4934 	ASSERT(list_is_empty(&dead));
4935 	list_destroy(&dead);
4936 }
4937 
4938 /* Return an interval that is anywhere in the [1 .. intv] range */
4939 static clock_t
4940 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4941 {
4942 	clock_t rnd, frac;
4943 
4944 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4945 	/* Note that clock_t is signed; must chop off bits */
4946 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4947 	if (initial_time) {
4948 		if (intv <= 0)
4949 			intv = 1;
4950 		else
4951 			intv = (rnd % intv) + 1;
4952 	} else {
4953 		/* Compute 'frac' as 20% of the configured interval */
4954 		if ((frac = intv / 5) <= 1)
4955 			frac = 2;
4956 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4957 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4958 		intv = 1;
4959 	}
4960 	return (intv);
4961 }
4962 
4963 void
4964 nce_resolv_ipmp_ok(ncec_t *ncec)
4965 {
4966 	mblk_t *mp;
4967 	uint_t pkt_len;
4968 	iaflags_t ixaflags = IXAF_NO_TRACE;
4969 	nce_t *under_nce;
4970 	ill_t	*ill = ncec->ncec_ill;
4971 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4972 	ipif_t *src_ipif = NULL;
4973 	ip_stack_t *ipst = ill->ill_ipst;
4974 	ill_t *send_ill;
4975 	uint_t nprobes;
4976 
4977 	ASSERT(IS_IPMP(ill));
4978 
4979 	mutex_enter(&ncec->ncec_lock);
4980 	nprobes = ncec->ncec_nprobes;
4981 	mp = ncec->ncec_qd_mp;
4982 	ncec->ncec_qd_mp = NULL;
4983 	ncec->ncec_nprobes = 0;
4984 	mutex_exit(&ncec->ncec_lock);
4985 
4986 	while (mp != NULL) {
4987 		mblk_t *nxt_mp;
4988 
4989 		nxt_mp = mp->b_next;
4990 		mp->b_next = NULL;
4991 		if (isv6) {
4992 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4993 
4994 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4995 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4996 			    ill, ALL_ZONES, ipst);
4997 		} else {
4998 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
4999 
5000 			ixaflags |= IXAF_IS_IPV4;
5001 			pkt_len = ntohs(ipha->ipha_length);
5002 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5003 			    ill, ALL_ZONES, ipst);
5004 		}
5005 
5006 		/*
5007 		 * find a new nce based on an under_ill. The first IPMP probe
5008 		 * packet gets queued, so we could still find a src_ipif that
5009 		 * matches an IPMP test address.
5010 		 */
5011 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5012 			/*
5013 			 * if src_ipif is null, this could be either a
5014 			 * forwarded packet or a probe whose src got deleted.
5015 			 * We identify the former case by looking for the
5016 			 * ncec_nprobes: the first ncec_nprobes packets are
5017 			 * probes;
5018 			 */
5019 			if (src_ipif == NULL && nprobes > 0)
5020 				goto drop_pkt;
5021 
5022 			/*
5023 			 * For forwarded packets, we use the ipmp rotor
5024 			 * to find send_ill.
5025 			 */
5026 			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5027 			    B_TRUE);
5028 		} else {
5029 			send_ill = src_ipif->ipif_ill;
5030 			ill_refhold(send_ill);
5031 		}
5032 
5033 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5034 		    (ncec_t *), ncec, (ipif_t *),
5035 		    src_ipif, (ill_t *), send_ill);
5036 
5037 		if (send_ill == NULL) {
5038 			if (src_ipif != NULL)
5039 				ipif_refrele(src_ipif);
5040 			goto drop_pkt;
5041 		}
5042 		/* create an under_nce on send_ill */
5043 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5044 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5045 			under_nce = nce_fastpath_create(send_ill, ncec);
5046 		else
5047 			under_nce = NULL;
5048 		rw_exit(&ipst->ips_ill_g_lock);
5049 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5050 			nce_fastpath_trigger(under_nce);
5051 
5052 		ill_refrele(send_ill);
5053 		if (src_ipif != NULL)
5054 			ipif_refrele(src_ipif);
5055 
5056 		if (under_nce != NULL) {
5057 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5058 			    ALL_ZONES, 0, NULL);
5059 			nce_refrele(under_nce);
5060 			if (nprobes > 0)
5061 				nprobes--;
5062 			mp = nxt_mp;
5063 			continue;
5064 		}
5065 drop_pkt:
5066 		if (isv6) {
5067 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5068 		} else {
5069 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5070 		}
5071 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5072 		freemsg(mp);
5073 		if (nprobes > 0)
5074 			nprobes--;
5075 		mp = nxt_mp;
5076 	}
5077 	ncec_cb_dispatch(ncec); /* complete callbacks */
5078 }
5079