xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision 15c07adc1c7b828006b5e3c4d528b92229d6bd23)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2018, Joyent, Inc.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
47 
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/icmp6.h>
55 
56 #include <inet/common.h>
57 #include <inet/mi.h>
58 #include <inet/mib2.h>
59 #include <inet/nd.h>
60 #include <inet/ip.h>
61 #include <inet/ip_impl.h>
62 #include <inet/ipclassifier.h>
63 #include <inet/ip_if.h>
64 #include <inet/ip_ire.h>
65 #include <inet/ip_rts.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ndp.h>
68 #include <inet/sctp_ip.h>
69 #include <inet/ip_arp.h>
70 #include <inet/ip2mac_impl.h>
71 
72 #define	ANNOUNCE_INTERVAL(isv6) \
73 	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
74 	ipst->ips_ip_arp_publish_interval)
75 
76 #define	DEFENSE_INTERVAL(isv6) \
77 	(isv6 ? ipst->ips_ndp_defend_interval : \
78 	ipst->ips_arp_defend_interval)
79 
80 /* Non-tunable probe interval, based on link capabilities */
81 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
82 
83 /*
84  * The IPv4 Link Local address space is special; we do extra duplicate checking
85  * there, as the entire assignment mechanism rests on random numbers.
86  */
87 #define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
88 				((uchar_t *)ptr)[1] == 254)
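/*
 * For example: IPv4 addresses are handled here in network byte order, so
 * for 169.254.1.2 the first two octets in memory are 169 and 254, and
 * IS_IPV4_LL_SPACE() evaluates to true regardless of host endianness.
 */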
89 
90 /*
91  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
92  * in to the ncec*add* functions.
93  *
94  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
95  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
96  * that we will respond to requests for the protocol address.
97  */
98 #define	NCE_EXTERNAL_FLAGS_MASK \
99 	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
100 	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
101 	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
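/*
 * As an illustrative combination consistent with the flag semantics above,
 * a caller publishing and defending one of its own addresses might pass
 * something like (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY), whereas
 * multicast mappings use (NCE_F_MCAST | NCE_F_NONUD), as asserted in
 * nce_set_multicast_v6() below.
 */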
102 
103 /*
104  * Lock ordering:
105  *
106  *	ndp_g_lock -> ill_lock -> ncec_lock
107  *
108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
110  * ncec_refcnt).
111  */
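/*
 * A minimal sketch of that ordering (illustrative only): a path needing all
 * three locks would acquire and release them as
 *
 *	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);	(or ips_ndp4)
 *	mutex_enter(&ill->ill_lock);
 *	mutex_enter(&ncec->ncec_lock);
 *	...
 *	mutex_exit(&ncec->ncec_lock);
 *	mutex_exit(&ill->ill_lock);
 *	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 */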
112 
113 static	void	nce_cleanup_list(ncec_t *ncec);
114 static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116     ncec_t *);
117 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119     uint16_t ncec_flags, nce_t **newnce);
120 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121     uint16_t ncec_flags, nce_t **newnce);
122 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124     const in6_addr_t *target, int flag);
125 static void	ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129     uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136     uint16_t, uint16_t, nce_t **);
137 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138     uint16_t, uint16_t, nce_t **);
139 static int  nce_add_v6_postprocess(nce_t *);
140 static int  nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149 
150 #ifdef DEBUG
151 static void	ncec_trace_cleanup(const ncec_t *);
152 #endif
153 
154 #define	NCE_HASH_PTR_V4(ipst, addr)					\
155 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156 
157 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
158 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 		NCE_TABLE_SIZE)]))
160 
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163 
164 /*
165  * Send out an IPv6 (unicast) or IPv4 (broadcast) DAD probe.
166  * If src_ill is non-NULL, the ncec_addr is bound to src_ill. The
167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169  * IPMP cast_ill (in the IPMP case).
170  *
171  * Note that the probe interval is based on the src_ill for IPv6, and
172  * the ncec_xmit_interval for IPv4.
173  */
174 static void
175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
176 {
177 	boolean_t dropped;
178 	uint32_t probe_interval;
179 
180 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
181 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
182 	if (ncec->ncec_ipversion == IPV6_VERSION) {
183 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
184 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
185 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
186 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
187 	} else {
188 		/* IPv4 DAD delays the initial probe. */
189 		if (send_probe)
190 			dropped = arp_probe(ncec);
191 		else
192 			dropped = B_TRUE;
193 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
194 		    !send_probe);
195 	}
196 	if (!dropped) {
197 		mutex_enter(&ncec->ncec_lock);
198 		ncec->ncec_pcnt--;
199 		mutex_exit(&ncec->ncec_lock);
200 	}
201 	nce_restart_timer(ncec, probe_interval);
202 }
203 
204 /*
205  * Compute default flags to use for an advertisement of this ncec's address.
206  */
207 static int
208 nce_advert_flags(const ncec_t *ncec)
209 {
210 	int flag = 0;
211 
212 	if (ncec->ncec_flags & NCE_F_ISROUTER)
213 		flag |= NDP_ISROUTER;
214 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 		flag |= NDP_ORIDE;
216 
217 	return (flag);
218 }
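/*
 * Contextual note: a solicited advertisement for an anycast address should
 * not set the Override flag (cf. section 7.2.4 of RFC 2461), which is why
 * NDP_ORIDE is only added when NCE_F_ANYCAST is clear; NDP_ISROUTER simply
 * mirrors the router bit recorded on the ncec.
 */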
219 
220 /*
221  * NDP Cache Entry creation routine.
222  * This routine must always be called with ndp6->ndp_g_lock held.
223  */
224 int
225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 {
228 	int		err;
229 	nce_t		*nce;
230 
231 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 	ASSERT(ill != NULL && ill->ill_isv6);
233 
234 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 	    &nce);
236 	if (err != 0)
237 		return (err);
238 	ASSERT(newnce != NULL);
239 	*newnce = nce;
240 	return (err);
241 }
242 
243 /*
244  * Post-processing routine to be executed after nce_add_v6(). This function
245  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
246  * and must be called without any locks held.
247  */
248 int
249 nce_add_v6_postprocess(nce_t *nce)
250 {
251 	ncec_t		*ncec = nce->nce_common;
252 	boolean_t	dropped = B_FALSE;
253 	uchar_t		*hw_addr = ncec->ncec_lladdr;
254 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
255 	ill_t		*ill = ncec->ncec_ill;
256 	int		err = 0;
257 	uint16_t	flags = ncec->ncec_flags;
258 	ip_stack_t	*ipst = ill->ill_ipst;
259 	boolean_t	trigger_fastpath = B_TRUE;
260 
261 	/*
262 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
263 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
264 	 * We call nce_fastpath from nce_update if the link layer address of
265 	 * the peer changes.
266 	 */
267 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
268 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
269 		trigger_fastpath = B_FALSE;
270 
271 	if (trigger_fastpath)
272 		nce_fastpath_trigger(nce);
273 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
274 		ill_t *hwaddr_ill;
275 		/*
276 		 * Unicast entry that needs DAD.
277 		 */
278 		if (IS_IPMP(ill)) {
279 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
280 			    hw_addr, hw_addr_len);
281 		} else {
282 			hwaddr_ill = ill;
283 		}
284 		nce_dad(ncec, hwaddr_ill, B_TRUE);
285 		err = EINPROGRESS;
286 	} else if (flags & NCE_F_UNSOL_ADV) {
287 		/*
288 		 * We account for the transmit below by assigning one
289 		 * less than the ndd variable. Subsequent decrements
290 		 * are done in nce_timer.
291 		 */
292 		mutex_enter(&ncec->ncec_lock);
293 		ncec->ncec_unsolicit_count =
294 		    ipst->ips_ip_ndp_unsolicit_count - 1;
295 		mutex_exit(&ncec->ncec_lock);
296 		dropped = ndp_xmit(ill,
297 		    ND_NEIGHBOR_ADVERT,
298 		    hw_addr,
299 		    hw_addr_len,
300 		    &ncec->ncec_addr,	/* Source and target of the adv */
301 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
302 		    nce_advert_flags(ncec));
303 		mutex_enter(&ncec->ncec_lock);
304 		if (dropped)
305 			ncec->ncec_unsolicit_count++;
306 		else
307 			ncec->ncec_last_time_defended = ddi_get_lbolt();
308 		if (ncec->ncec_unsolicit_count != 0) {
309 			nce_start_timer(ncec,
310 			    ipst->ips_ip_ndp_unsolicit_interval);
311 		}
312 		mutex_exit(&ncec->ncec_lock);
313 	}
314 	return (err);
315 }
316 
317 /*
318  * Atomically lookup and add (if needed) Neighbor Cache information for
319  * an address.
320  *
321  * IPMP notes: the ncec for a non-local (i.e., !NCE_MYADDR(ncec)) address
322  * is always added pointing at the ipmp_ill. Thus, when the ill passed
323  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
324  * entries will be created, both pointing at the same ncec_t. The nce_t
325  * entries will have their nce_ill set to the ipmp_ill and the under_ill
326  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
327  * Local addresses are always created on the ill passed to nce_add_v6.
328  */
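/*
 * A hypothetical call (sketch only) creating a resolvable neighbor entry
 * from an under ill could look like
 *
 *	nce_t	*nce = NULL;
 *	int	err;
 *
 *	err = nce_lookup_then_add_v6(under_ill, NULL, 0, &v6addr, 0,
 *	    ND_INCOMPLETE, &nce);
 *	if (err == 0 || err == EEXIST)
 *		nce_refrele(nce);	(or keep the held reference)
 *
 * where the nce_t handed back is the under_ill entry and the shared ncec_t
 * hangs off the ipmp_ill, as described above.
 */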
329 int
330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
331     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
332 {
333 	int		err = 0;
334 	ip_stack_t	*ipst = ill->ill_ipst;
335 	nce_t		*nce, *upper_nce = NULL;
336 	ill_t		*in_ill = ill;
337 	boolean_t	need_ill_refrele = B_FALSE;
338 
339 	if (flags & NCE_F_MCAST) {
340 		/*
341 		 * hw_addr will be figured out in nce_set_multicast_v6;
342 		 * caller has to select the cast_ill
343 		 */
344 		ASSERT(hw_addr == NULL);
345 		ASSERT(!IS_IPMP(ill));
346 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
347 		return (err);
348 	}
349 	ASSERT(ill->ill_isv6);
350 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
351 		ill = ipmp_ill_hold_ipmp_ill(ill);
352 		if (ill == NULL)
353 			return (ENXIO);
354 		need_ill_refrele = B_TRUE;
355 	}
356 
357 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
358 	nce = nce_lookup_addr(ill, addr);
359 	if (nce == NULL) {
360 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
361 		    &nce);
362 	} else {
363 		err = EEXIST;
364 	}
365 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
366 	if (err == 0)
367 		err = nce_add_v6_postprocess(nce);
368 	if (in_ill != ill && nce != NULL) {
369 		nce_t *under_nce = NULL;
370 
371 		/*
372 		 * in_ill was the under_ill. Try to create the under_nce.
373 		 * Hold the ill_g_lock to prevent changes to group membership
374 		 * until we are done.
375 		 */
376 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
377 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
378 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
379 			    ill_t *, ill);
380 			rw_exit(&ipst->ips_ill_g_lock);
381 			err = ENXIO;
382 			nce_refrele(nce);
383 			nce = NULL;
384 			goto bail;
385 		}
386 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
387 		if (under_nce == NULL) {
388 			rw_exit(&ipst->ips_ill_g_lock);
389 			err = EINVAL;
390 			nce_refrele(nce);
391 			nce = NULL;
392 			goto bail;
393 		}
394 		rw_exit(&ipst->ips_ill_g_lock);
395 		upper_nce = nce;
396 		nce = under_nce; /* will be returned to caller */
397 		if (NCE_ISREACHABLE(nce->nce_common))
398 			nce_fastpath_trigger(under_nce);
399 	}
400 	/* nce_refrele is deferred until the lock is dropped  */
401 	if (nce != NULL) {
402 		if (newnce != NULL)
403 			*newnce = nce;
404 		else
405 			nce_refrele(nce);
406 	}
407 bail:
408 	if (upper_nce != NULL)
409 		nce_refrele(upper_nce);
410 	if (need_ill_refrele)
411 		ill_refrele(ill);
412 	return (err);
413 }
414 
415 /*
416  * Remove all the CONDEMNED nces from the appropriate hash table.
417  * We create a private list of NCEs; these may have ires pointing
418  * to them, so the list is first passed through to clean up dependent ires,
419  * and only then do we call ncec_refrele(), which can make the NCE inactive.
420  */
421 static void
422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
423 {
424 	ncec_t *ncec1;
425 	ncec_t **ptpn;
426 
427 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
428 	ASSERT(ndp->ndp_g_walker == 0);
429 	for (; ncec; ncec = ncec1) {
430 		ncec1 = ncec->ncec_next;
431 		mutex_enter(&ncec->ncec_lock);
432 		if (NCE_ISCONDEMNED(ncec)) {
433 			ptpn = ncec->ncec_ptpn;
434 			ncec1 = ncec->ncec_next;
435 			if (ncec1 != NULL)
436 				ncec1->ncec_ptpn = ptpn;
437 			*ptpn = ncec1;
438 			ncec->ncec_ptpn = NULL;
439 			ncec->ncec_next = NULL;
440 			ncec->ncec_next = *free_nce_list;
441 			*free_nce_list = ncec;
442 		}
443 		mutex_exit(&ncec->ncec_lock);
444 	}
445 }
446 
447 /*
448  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
449  *    will return this NCE. Also no new timeouts will
450  *    be started (See nce_restart_timer).
451  * 2. Cancel any currently running timeouts.
452  * 3. If there is an ndp walker, return. The walker will do the cleanup.
453  *    This ensures that walkers see a consistent list of NCEs while walking.
454  * 4. Otherwise remove the NCE from the list of NCEs
455  */
456 void
457 ncec_delete(ncec_t *ncec)
458 {
459 	ncec_t	**ptpn;
460 	ncec_t	*ncec1;
461 	int	ipversion = ncec->ncec_ipversion;
462 	ndp_g_t *ndp;
463 	ip_stack_t	*ipst = ncec->ncec_ipst;
464 
465 	if (ipversion == IPV4_VERSION)
466 		ndp = ipst->ips_ndp4;
467 	else
468 		ndp = ipst->ips_ndp6;
469 
470 	/* Serialize deletes */
471 	mutex_enter(&ncec->ncec_lock);
472 	if (NCE_ISCONDEMNED(ncec)) {
473 		/* Some other thread is doing the delete */
474 		mutex_exit(&ncec->ncec_lock);
475 		return;
476 	}
477 	/*
478 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
479 	 * refcnt has to be >= 2
480 	 */
481 	ASSERT(ncec->ncec_refcnt >= 2);
482 	ncec->ncec_flags |= NCE_F_CONDEMNED;
483 	mutex_exit(&ncec->ncec_lock);
484 
485 	/* Count how many condemned ncecs for the kmem_cache callback */
486 	atomic_inc_32(&ipst->ips_num_nce_condemned);
487 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
488 
489 	/* Complete any waiting callbacks */
490 	ncec_cb_dispatch(ncec);
491 
492 	/*
493 	 * Cancel any running timer. Timeout can't be restarted
494 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
495 	 * Passing invalid timeout id is fine.
496 	 */
497 	if (ncec->ncec_timeout_id != 0) {
498 		(void) untimeout(ncec->ncec_timeout_id);
499 		ncec->ncec_timeout_id = 0;
500 	}
501 
502 	mutex_enter(&ndp->ndp_g_lock);
503 	if (ncec->ncec_ptpn == NULL) {
504 		/*
505 		 * The last ndp walker has already removed this ncec from
506 		 * the list after we marked the ncec CONDEMNED and before
507 		 * we grabbed the global lock.
508 		 */
509 		mutex_exit(&ndp->ndp_g_lock);
510 		return;
511 	}
512 	if (ndp->ndp_g_walker > 0) {
513 		/*
514 		 * Can't unlink. The walker will clean up
515 		 */
516 		ndp->ndp_g_walker_cleanup = B_TRUE;
517 		mutex_exit(&ndp->ndp_g_lock);
518 		return;
519 	}
520 
521 	/*
522 	 * Now remove the ncec from the list. nce_restart_timer won't restart
523 	 * the timer since it is marked CONDEMNED.
524 	 */
525 	ptpn = ncec->ncec_ptpn;
526 	ncec1 = ncec->ncec_next;
527 	if (ncec1 != NULL)
528 		ncec1->ncec_ptpn = ptpn;
529 	*ptpn = ncec1;
530 	ncec->ncec_ptpn = NULL;
531 	ncec->ncec_next = NULL;
532 	mutex_exit(&ndp->ndp_g_lock);
533 
534 	/* Removed from ncec_ptpn/ncec_next list */
535 	ncec_refrele_notr(ncec);
536 }
537 
538 void
539 ncec_inactive(ncec_t *ncec)
540 {
541 	mblk_t		**mpp;
542 	ill_t		*ill = ncec->ncec_ill;
543 	ip_stack_t	*ipst = ncec->ncec_ipst;
544 
545 	ASSERT(ncec->ncec_refcnt == 0);
546 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
547 
548 	/* Count how many condemned nces for kmem_cache callback */
549 	if (NCE_ISCONDEMNED(ncec))
550 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
551 
552 	/* Free all allocated messages */
553 	mpp = &ncec->ncec_qd_mp;
554 	while (*mpp != NULL) {
555 		mblk_t  *mp;
556 
557 		mp = *mpp;
558 		*mpp = mp->b_next;
559 
560 		inet_freemsg(mp);
561 	}
562 	/*
563 	 * must have been cleaned up in ncec_delete
564 	 */
565 	ASSERT(list_is_empty(&ncec->ncec_cb));
566 	list_destroy(&ncec->ncec_cb);
567 	/*
568 	 * free the ncec_lladdr if one was allocated in nce_add_common()
569 	 */
570 	if (ncec->ncec_lladdr_length > 0)
571 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
572 
573 #ifdef DEBUG
574 	ncec_trace_cleanup(ncec);
575 #endif
576 
577 	mutex_enter(&ill->ill_lock);
578 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
579 	    (char *), "ncec", (void *), ncec);
580 	ill->ill_ncec_cnt--;
581 	ncec->ncec_ill = NULL;
582 	/*
583 	 * If the number of ncecs associated with this ill has dropped
584 	 * to zero, check whether we need to restart any operation that
585 	 * is waiting for this to happen.
586 	 */
587 	if (ILL_DOWN_OK(ill)) {
588 		/* ipif_ill_refrele_tail drops the ill_lock */
589 		ipif_ill_refrele_tail(ill);
590 	} else {
591 		mutex_exit(&ill->ill_lock);
592 	}
593 
594 	mutex_destroy(&ncec->ncec_lock);
595 	kmem_cache_free(ncec_cache, ncec);
596 }
597 
598 /*
599  * ncec_walk routine.  Delete the ncec if it is associated with the ill
600  * that is going away.  Always called as a writer.
601  */
602 void
603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 {
605 	if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 		ncec_delete(ncec);
607 	}
608 }
609 
610 /*
611  * Neighbor Cache cleanup logic for a list of ncec_t entries.
612  */
613 static void
614 nce_cleanup_list(ncec_t *ncec)
615 {
616 	ncec_t *ncec_next;
617 
618 	ASSERT(ncec != NULL);
619 	while (ncec != NULL) {
620 		ncec_next = ncec->ncec_next;
621 		ncec->ncec_next = NULL;
622 
623 		/*
624 		 * It is possible for the last ndp walker (this thread)
625 		 * to come here after ncec_delete has marked the ncec CONDEMNED
626 		 * and before it has removed the ncec from the fastpath list
627 		 * or called untimeout. So we need to do it here. It is safe
628 		 * for both ncec_delete and this thread to do it twice or
629 		 * even simultaneously since each of the threads has a
630 		 * reference on the ncec.
631 		 */
632 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
633 		/*
634 		 * Cancel any running timer. Timeout can't be restarted
635 		 * since CONDEMNED is set. The ncec_lock can't be
636 		 * held across untimeout though passing invalid timeout
637 		 * id is fine.
638 		 */
639 		if (ncec->ncec_timeout_id != 0) {
640 			(void) untimeout(ncec->ncec_timeout_id);
641 			ncec->ncec_timeout_id = 0;
642 		}
643 		/* Removed from ncec_ptpn/ncec_next list */
644 		ncec_refrele_notr(ncec);
645 		ncec = ncec_next;
646 	}
647 }
648 
649 /*
650  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
651  */
652 boolean_t
653 nce_restart_dad(ncec_t *ncec)
654 {
655 	boolean_t started;
656 	ill_t *ill, *hwaddr_ill;
657 
658 	if (ncec == NULL)
659 		return (B_FALSE);
660 	ill = ncec->ncec_ill;
661 	mutex_enter(&ncec->ncec_lock);
662 	if (ncec->ncec_state == ND_PROBE) {
663 		mutex_exit(&ncec->ncec_lock);
664 		started = B_TRUE;
665 	} else if (ncec->ncec_state == ND_REACHABLE) {
666 		ASSERT(ncec->ncec_lladdr != NULL);
667 		ncec->ncec_state = ND_PROBE;
668 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
669 		/*
670 		 * Slight cheat here: we don't use the initial probe delay
671 		 * for IPv4 in this obscure case.
672 		 */
673 		mutex_exit(&ncec->ncec_lock);
674 		if (IS_IPMP(ill)) {
675 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
676 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
677 		} else {
678 			hwaddr_ill = ill;
679 		}
680 		nce_dad(ncec, hwaddr_ill, B_TRUE);
681 		started = B_TRUE;
682 	} else {
683 		mutex_exit(&ncec->ncec_lock);
684 		started = B_FALSE;
685 	}
686 	return (started);
687 }
688 
689 /*
690  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
691  * If one is found, the refcnt on the ncec will be incremented.
692  */
693 ncec_t *
694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 {
696 	ncec_t		*ncec;
697 	ip_stack_t	*ipst = ill->ill_ipst;
698 
699 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701 
702 	/* Get head of v6 hash table */
703 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 	rw_exit(&ipst->ips_ill_g_lock);
707 	return (ncec);
708 }
709 /*
710  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
711  * If one is found, the refcnt on the ncec will be incremented.
712  */
713 ncec_t *
714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 {
716 	ncec_t	*ncec = NULL;
717 	in6_addr_t addr6;
718 	ip_stack_t *ipst = ill->ill_ipst;
719 
720 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722 
723 	/* Get head of v4 hash table */
724 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 	rw_exit(&ipst->ips_ill_g_lock);
729 	return (ncec);
730 }
731 
732 /*
733  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
734  * If an ncec is found, increment the hold count on that ncec.
735  * The caller passes in the start of the appropriate hash table, and must
736  * be holding the appropriate global lock (ndp_g_lock). In addition, since
737  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
738  * must be held as reader.
739  *
740  * This function always matches across the ipmp group.
741  */
742 ncec_t *
743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
744 {
745 	ndp_g_t		*ndp;
746 	ip_stack_t	*ipst = ill->ill_ipst;
747 
748 	if (ill->ill_isv6)
749 		ndp = ipst->ips_ndp6;
750 	else
751 		ndp = ipst->ips_ndp4;
752 
753 	ASSERT(ill != NULL);
754 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
755 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
756 		return (NULL);
757 	for (; ncec != NULL; ncec = ncec->ncec_next) {
758 		if (ncec->ncec_ill == ill ||
759 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
760 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
761 				mutex_enter(&ncec->ncec_lock);
762 				if (!NCE_ISCONDEMNED(ncec)) {
763 					ncec_refhold_locked(ncec);
764 					mutex_exit(&ncec->ncec_lock);
765 					break;
766 				}
767 				mutex_exit(&ncec->ncec_lock);
768 			}
769 		}
770 	}
771 	return (ncec);
772 }
773 
774 /*
775  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776  * entries for ill only, i.e., when ill is part of an ipmp group,
777  * nce_lookup_v4 will never try to match across the group.
778  */
779 nce_t *
780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 {
782 	nce_t *nce;
783 	in6_addr_t addr6;
784 	ip_stack_t *ipst = ill->ill_ipst;
785 
786 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 	nce = nce_lookup_addr(ill, &addr6);
789 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 	return (nce);
791 }
792 
793 /*
794  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795  * entries for ill only, i.e., when ill is part of an ipmp group,
796  * nce_lookup_v6 will never try to match across the group.
797  */
798 nce_t *
799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 {
801 	nce_t *nce;
802 	ip_stack_t *ipst = ill->ill_ipst;
803 
804 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 	nce = nce_lookup_addr(ill, addr6);
806 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 	return (nce);
808 }
809 
810 static nce_t *
811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 {
813 	nce_t *nce;
814 
815 	ASSERT(ill != NULL);
816 #ifdef DEBUG
817 	if (ill->ill_isv6)
818 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 	else
820 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 #endif
822 	mutex_enter(&ill->ill_lock);
823 	nce = nce_lookup(ill, addr);
824 	mutex_exit(&ill->ill_lock);
825 	return (nce);
826 }
827 
828 
829 /*
830  * Router turned to host.  We need to make sure that cached copies of the ncec
831  * are not used for forwarding packets if they were derived from the default
832  * route, and that the default route itself is removed, as required by
833  * section 7.2.5 of RFC 2461.
834  *
835  * Note that the ncec itself probably has valid link-layer information for the
836  * nexthop, so that there is no reason to delete the ncec, as long as the
837  * ISROUTER flag is turned off.
838  */
839 static void
840 ncec_router_to_host(ncec_t *ncec)
841 {
842 	ire_t		*ire;
843 	ip_stack_t	*ipst = ncec->ncec_ipst;
844 
845 	mutex_enter(&ncec->ncec_lock);
846 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
847 	mutex_exit(&ncec->ncec_lock);
848 
849 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
850 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
851 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
852 	if (ire != NULL) {
853 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
854 		ire_delete(ire);
855 		ire_refrele(ire);
856 	}
857 }
858 
859 /*
860  * Process passed in parameters either from an incoming packet or via
861  * user ioctl.
862  */
863 void
864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
865 {
866 	ill_t	*ill = ncec->ncec_ill;
867 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
868 	boolean_t ll_updated = B_FALSE;
869 	boolean_t ll_changed;
870 	nce_t	*nce;
871 
872 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
873 	/*
874 	 * No updates of the link layer address or the neighbor state are
875 	 * allowed when the cache entry is in NONUD state.  This still
876 	 * allows for responding to reachability solicitations.
877 	 */
878 	mutex_enter(&ncec->ncec_lock);
879 	if (ncec->ncec_state == ND_INCOMPLETE) {
880 		if (hw_addr == NULL) {
881 			mutex_exit(&ncec->ncec_lock);
882 			return;
883 		}
884 		nce_set_ll(ncec, hw_addr);
885 		/*
886 		 * Update the ncec state and send the queued packets
887 		 * back to ip; this time the ire will be added.
888 		 */
889 		if (flag & ND_NA_FLAG_SOLICITED) {
890 			nce_update(ncec, ND_REACHABLE, NULL);
891 		} else {
892 			nce_update(ncec, ND_STALE, NULL);
893 		}
894 		mutex_exit(&ncec->ncec_lock);
895 		nce = nce_fastpath(ncec, B_TRUE, NULL);
896 		nce_resolv_ok(ncec);
897 		if (nce != NULL)
898 			nce_refrele(nce);
899 		return;
900 	}
901 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
902 	if (!is_adv) {
903 		/* If this is a SOLICITATION request only */
904 		if (ll_changed)
905 			nce_update(ncec, ND_STALE, hw_addr);
906 		mutex_exit(&ncec->ncec_lock);
907 		ncec_cb_dispatch(ncec);
908 		return;
909 	}
910 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
911 		/* If in any other state than REACHABLE, ignore */
912 		if (ncec->ncec_state == ND_REACHABLE) {
913 			nce_update(ncec, ND_STALE, NULL);
914 		}
915 		mutex_exit(&ncec->ncec_lock);
916 		ncec_cb_dispatch(ncec);
917 		return;
918 	} else {
919 		if (ll_changed) {
920 			nce_update(ncec, ND_UNCHANGED, hw_addr);
921 			ll_updated = B_TRUE;
922 		}
923 		if (flag & ND_NA_FLAG_SOLICITED) {
924 			nce_update(ncec, ND_REACHABLE, NULL);
925 		} else {
926 			if (ll_updated) {
927 				nce_update(ncec, ND_STALE, NULL);
928 			}
929 		}
930 		mutex_exit(&ncec->ncec_lock);
931 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
932 		    NCE_F_ISROUTER)) {
933 			ncec_router_to_host(ncec);
934 		} else {
935 			ncec_cb_dispatch(ncec);
936 		}
937 	}
938 }
939 
940 /*
941  * Pass arg1 to the cbf supplied, along with each ncec in existence.
942  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943  * walking the hash list.
944  */
945 void
946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
947     void *arg1, boolean_t trace)
948 {
949 	ncec_t	*ncec;
950 	ncec_t	*ncec1;
951 	ncec_t	**ncep;
952 	ncec_t	*free_nce_list = NULL;
953 
954 	mutex_enter(&ndp->ndp_g_lock);
955 	/* Prevent ncec_delete from unlinking and freeing NCEs */
956 	ndp->ndp_g_walker++;
957 	mutex_exit(&ndp->ndp_g_lock);
958 	for (ncep = ndp->nce_hash_tbl;
959 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
960 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
961 			ncec1 = ncec->ncec_next;
962 			if (ill == NULL || ncec->ncec_ill == ill) {
963 				if (trace) {
964 					ncec_refhold(ncec);
965 					(*cbf)(ncec, arg1);
966 					ncec_refrele(ncec);
967 				} else {
968 					ncec_refhold_notr(ncec);
969 					(*cbf)(ncec, arg1);
970 					ncec_refrele_notr(ncec);
971 				}
972 			}
973 		}
974 	}
975 	mutex_enter(&ndp->ndp_g_lock);
976 	ndp->ndp_g_walker--;
977 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
978 		/* Time to delete condemned entries */
979 		for (ncep = ndp->nce_hash_tbl;
980 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
981 			ncec = *ncep;
982 			if (ncec != NULL) {
983 				nce_remove(ndp, ncec, &free_nce_list);
984 			}
985 		}
986 		ndp->ndp_g_walker_cleanup = B_FALSE;
987 	}
988 
989 	mutex_exit(&ndp->ndp_g_lock);
990 
991 	if (free_nce_list != NULL) {
992 		nce_cleanup_list(free_nce_list);
993 	}
994 }
995 
996 /*
997  * Walk everything.
998  * Note that ill can be NULL hence can't derive the ipst from it.
999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003 	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
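/*
 * Illustrative callback (hypothetical, assuming ncec_walk_cb_t matches the
 * (*cbf)(ncec, arg1) invocation above): counting every ncec in the stack
 * could look like
 *
 *	static void
 *	ncec_count_cb(ncec_t *ncec, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *	}
 *
 *	uint_t count = 0;
 *	ncec_walk(NULL, ncec_count_cb, &count, ipst);
 */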
1006 
1007 /*
1008  * For each interface an entry is added for the unspecified multicast group.
1009  * Here that mapping is used to form the multicast cache entry for a particular
1010  * multicast destination.
1011  */
1012 static int
1013 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014     uint16_t flags, nce_t **newnce)
1015 {
1016 	uchar_t		*hw_addr;
1017 	int		err = 0;
1018 	ip_stack_t	*ipst = ill->ill_ipst;
1019 	nce_t		*nce;
1020 
1021 	ASSERT(ill != NULL);
1022 	ASSERT(ill->ill_isv6);
1023 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024 
1025 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026 	nce = nce_lookup_addr(ill, dst);
1027 	if (nce != NULL) {
1028 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1029 		goto done;
1030 	}
1031 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1032 		/*
1033 		 * For IRE_IF_RESOLVER a hardware mapping can be
1034 		 * generated.
1035 		 */
1036 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037 		if (hw_addr == NULL) {
1038 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039 			return (ENOMEM);
1040 		}
1041 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042 	} else {
1043 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044 		hw_addr = NULL;
1045 	}
1046 	ASSERT((flags & NCE_F_MCAST) != 0);
1047 	ASSERT((flags & NCE_F_NONUD) != 0);
1048 	/* nce_state will be computed by nce_add_common() */
1049 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050 	    ND_UNCHANGED, &nce);
1051 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052 	if (err == 0)
1053 		err = nce_add_v6_postprocess(nce);
1054 	if (hw_addr != NULL)
1055 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1056 	if (err != 0) {
1057 		ip1dbg(("nce_set_multicast_v6: create failed %d\n", err));
1058 		return (err);
1059 	}
1060 done:
1061 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062 	if (newnce != NULL)
1063 		*newnce = nce;
1064 	else
1065 		nce_refrele(nce);
1066 	return (0);
1067 }
1068 
1069 /*
1070  * Return the link layer address, and any flags of a ncec.
1071  */
1072 int
1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1074 {
1075 	ncec_t		*ncec;
1076 	in6_addr_t	*addr;
1077 	sin6_t		*sin6;
1078 
1079 	ASSERT(ill != NULL && ill->ill_isv6);
1080 	sin6 = (sin6_t *)&lnr->lnr_addr;
1081 	addr =  &sin6->sin6_addr;
1082 
1083 	/*
1084 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1085 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1086 	 * addresses for the data addresses on an IPMP interface even though
1087 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1088 	 */
1089 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1090 	if (ncec == NULL)
1091 		return (ESRCH);
1092 	/* If no link layer address is available yet, return ESRCH */
1093 	if (!NCE_ISREACHABLE(ncec)) {
1094 		ncec_refrele(ncec);
1095 		return (ESRCH);
1096 	}
1097 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1098 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1099 	    lnr->lnr_hdw_len);
1100 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1101 		lnr->lnr_flags = NDF_ISROUTER_ON;
1102 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1103 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1104 	if (ncec->ncec_flags & NCE_F_STATIC)
1105 		lnr->lnr_flags |= NDF_STATIC;
1106 	ncec_refrele(ncec);
1107 	return (0);
1108 }
1109 
1110 /*
1111  * Finish setting up the Enable/Disable multicast for the driver.
1112  */
1113 mblk_t *
1114 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1115     uint32_t hw_addr_offset, mblk_t *mp)
1116 {
1117 	uchar_t		*hw_addr;
1118 	ipaddr_t	v4group;
1119 	uchar_t		*addr;
1120 
1121 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1122 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1123 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1124 
1125 		ASSERT(CLASSD(v4group));
1126 		ASSERT(!(ill->ill_isv6));
1127 
1128 		addr = (uchar_t *)&v4group;
1129 	} else {
1130 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1131 		ASSERT(ill->ill_isv6);
1132 
1133 		addr = (uchar_t *)v6group;
1134 	}
1135 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1136 	if (hw_addr == NULL) {
1137 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1138 		freemsg(mp);
1139 		return (NULL);
1140 	}
1141 
1142 	ip_mcast_mapping(ill, addr, hw_addr);
1143 	return (mp);
1144 }
1145 
1146 void
1147 ip_ndp_resolve(ncec_t *ncec)
1148 {
1149 	in_addr_t	sender4 = INADDR_ANY;
1150 	in6_addr_t	sender6 = ipv6_all_zeros;
1151 	ill_t		*src_ill;
1152 	uint32_t	ms;
1153 
1154 	src_ill = nce_resolve_src(ncec, &sender6);
1155 	if (src_ill == NULL) {
1156 		/* Make sure we try again later */
1157 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1158 		nce_restart_timer(ncec, (clock_t)ms);
1159 		return;
1160 	}
1161 	if (ncec->ncec_ipversion == IPV4_VERSION)
1162 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1163 	mutex_enter(&ncec->ncec_lock);
1164 	if (ncec->ncec_ipversion == IPV6_VERSION)
1165 		ms = ndp_solicit(ncec, sender6, src_ill);
1166 	else
1167 		ms = arp_request(ncec, sender4, src_ill);
1168 	mutex_exit(&ncec->ncec_lock);
1169 	if (ms == 0) {
1170 		if (ncec->ncec_state != ND_REACHABLE) {
1171 			if (ncec->ncec_ipversion == IPV6_VERSION)
1172 				ndp_resolv_failed(ncec);
1173 			else
1174 				arp_resolv_failed(ncec);
1175 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1176 			nce_make_unreachable(ncec);
1177 			ncec_delete(ncec);
1178 		}
1179 	} else {
1180 		nce_restart_timer(ncec, (clock_t)ms);
1181 	}
1182 done:
1183 	ill_refrele(src_ill);
1184 }
1185 
1186 /*
1187  * Send an IPv6 neighbor solicitation.
1188  * Returns number of milliseconds after which we should either rexmit or abort.
1189  * Return of zero means we should abort.
1190  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1191  * The optional source address is used as a hint to ndp_solicit for
1192  * which source to use in the packet.
1193  *
1194  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1195  * the packet.
1196  */
1197 uint32_t
1198 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1199 {
1200 	in6_addr_t	dst;
1201 	boolean_t	dropped = B_FALSE;
1202 
1203 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1204 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1205 
1206 	if (ncec->ncec_rcnt == 0)
1207 		return (0);
1208 
1209 	dst = ncec->ncec_addr;
1210 	ncec->ncec_rcnt--;
1211 	mutex_exit(&ncec->ncec_lock);
1212 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1213 	    ill->ill_phys_addr_length, &src, &dst, 0);
1214 	mutex_enter(&ncec->ncec_lock);
1215 	if (dropped)
1216 		ncec->ncec_rcnt++;
1217 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1218 }
1219 
1220 /*
1221  * Attempt to recover an address on an interface that's been marked as a
1222  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1223  * no easy way to just probe the address and have the right thing happen if
1224  * it's no longer in use.  Instead, we just bring it up normally and allow the
1225  * regular interface start-up logic to probe for a remaining duplicate and take
1226  * us back down if necessary.
1227  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1228  * ip_ndp_excl.
1229  */
1230 /* ARGSUSED */
1231 void
1232 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1233 {
1234 	ill_t	*ill = rq->q_ptr;
1235 	ipif_t	*ipif;
1236 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1237 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1238 	boolean_t addr_equal;
1239 
1240 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1241 		/*
1242 		 * We do not support recovery of proxy ARP'd interfaces,
1243 		 * because the system lacks a complete proxy ARP mechanism.
1244 		 */
1245 		if (ill->ill_isv6) {
1246 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1247 			    addr6);
1248 		} else {
1249 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1250 		}
1251 
1252 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1253 			continue;
1254 
1255 		/*
1256 		 * If we have already recovered or if the interface is going
1257 		 * away, then ignore.
1258 		 */
1259 		mutex_enter(&ill->ill_lock);
1260 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1261 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1262 			mutex_exit(&ill->ill_lock);
1263 			continue;
1264 		}
1265 
1266 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1267 		ill->ill_ipif_dup_count--;
1268 		mutex_exit(&ill->ill_lock);
1269 		ipif->ipif_was_dup = B_TRUE;
1270 
1271 		if (ill->ill_isv6) {
1272 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1273 			(void) ipif_up_done_v6(ipif);
1274 		} else {
1275 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1276 			    EINPROGRESS);
1277 			(void) ipif_up_done(ipif);
1278 		}
1279 	}
1280 	freeb(mp);
1281 }
1282 
1283 /*
1284  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1285  * As long as someone else holds the address, the interface will stay down.
1286  * When that conflict goes away, the interface is brought back up.  This is
1287  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1288  * server will recover from a failure.
1289  *
1290  * For DHCP and temporary addresses, recovery is not done in the kernel.
1291  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1292  *
1293  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1294  */
1295 void
1296 ipif_dup_recovery(void *arg)
1297 {
1298 	ipif_t *ipif = arg;
1299 
1300 	ipif->ipif_recovery_id = 0;
1301 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1302 		return;
1303 
1304 	/*
1305 	 * No lock, because this is just an optimization.
1306 	 */
1307 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1308 		return;
1309 
1310 	/* If the link is down, we'll retry this later */
1311 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1312 		return;
1313 
1314 	ipif_do_recovery(ipif);
1315 }
1316 
1317 /*
1318  * Perform interface recovery by forcing the duplicate interfaces up and
1319  * allowing the system to determine which ones should stay up.
1320  *
1321  * Called both by recovery timer expiry and link-up notification.
1322  */
1323 void
1324 ipif_do_recovery(ipif_t *ipif)
1325 {
1326 	ill_t *ill = ipif->ipif_ill;
1327 	mblk_t *mp;
1328 	ip_stack_t *ipst = ill->ill_ipst;
1329 	size_t mp_size;
1330 
1331 	if (ipif->ipif_isv6)
1332 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1333 	else
1334 		mp_size = sizeof (ipif->ipif_lcl_addr);
1335 	mp = allocb(mp_size, BPRI_MED);
1336 	if (mp == NULL) {
1337 		mutex_enter(&ill->ill_lock);
1338 		if (ipst->ips_ip_dup_recovery > 0 &&
1339 		    ipif->ipif_recovery_id == 0 &&
1340 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1341 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1342 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1343 		}
1344 		mutex_exit(&ill->ill_lock);
1345 	} else {
1346 		/*
1347 		 * A recovery timer may still be running if we got here from
1348 		 * ill_restart_dad(); cancel that timer.
1349 		 */
1350 		if (ipif->ipif_recovery_id != 0)
1351 			(void) untimeout(ipif->ipif_recovery_id);
1352 		ipif->ipif_recovery_id = 0;
1353 
1354 		if (ipif->ipif_isv6) {
1355 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1356 			    sizeof (ipif->ipif_v6lcl_addr));
1357 		} else  {
1358 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1359 			    sizeof (ipif->ipif_lcl_addr));
1360 		}
1361 		ill_refhold(ill);
1362 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1363 		    B_FALSE);
1364 	}
1365 }
1366 
1367 /*
1368  * Find the MAC and IP addresses in an NA/NS message.
1369  */
1370 static void
1371 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1372     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1373 {
1374 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1375 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1376 	uchar_t *addr;
1377 	int alen;
1378 
1379 	/* icmp_inbound_v6 ensures this */
1380 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1381 
1382 	addr = ira->ira_l2src;
1383 	alen = ill->ill_phys_addr_length;
1384 	if (alen > 0) {
1385 		*haddr = addr;
1386 		*haddrlenp = alen;
1387 	} else {
1388 		*haddr = NULL;
1389 		*haddrlenp = 0;
1390 	}
1391 
1392 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1393 	*targp = ns->nd_ns_target;
1394 }
1395 
1396 /*
1397  * This is for exclusive changes due to NDP duplicate address detection
1398  * failure.
1399  */
1400 /* ARGSUSED */
1401 static void
1402 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1403 {
1404 	ill_t	*ill = rq->q_ptr;
1405 	ipif_t	*ipif;
1406 	uchar_t	*haddr;
1407 	uint_t	haddrlen;
1408 	ip_stack_t *ipst = ill->ill_ipst;
1409 	in6_addr_t targ;
1410 	ip_recv_attr_t iras;
1411 	mblk_t	*attrmp;
1412 
1413 	attrmp = mp;
1414 	mp = mp->b_cont;
1415 	attrmp->b_cont = NULL;
1416 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1417 		/* The ill or ip_stack_t disappeared on us */
1418 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1419 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1420 		freemsg(mp);
1421 		ira_cleanup(&iras, B_TRUE);
1422 		return;
1423 	}
1424 
1425 	ASSERT(ill == iras.ira_rill);
1426 
1427 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1428 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1429 		/*
1430 		 * Ignore conflicts generated by misbehaving switches that
1431 		 * just reflect our own messages back to us.  For IPMP, we may
1432 		 * see reflections across any ill in the illgrp.
1433 		 *
1434 		 * RFC2462 and revisions tried to detect both the case
1435 		 * when a statically configured IPv6 address is a duplicate,
1436 		 * and the case when the L2 address itself is a duplicate. The
1437 		 * latter is important because, with stateless address autoconf,
1438 		 * if the L2 address is a duplicate, the resulting IPv6
1439 		 * address(es) would also be duplicates. We rely on DAD of the
1440 		 * IPv6 address itself to detect the latter case.
1441 		 */
1442 		/* For an under ill, the ill_grp can change; hold ill_g_lock */
1443 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1444 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1445 		    IS_UNDER_IPMP(ill) &&
1446 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1447 		    haddrlen) != NULL) {
1448 			rw_exit(&ipst->ips_ill_g_lock);
1449 			goto ignore_conflict;
1450 		}
1451 		rw_exit(&ipst->ips_ill_g_lock);
1452 	}
1453 
1454 	/*
1455 	 * Look up the appropriate ipif.
1456 	 */
1457 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1458 	if (ipif == NULL)
1459 		goto ignore_conflict;
1460 
1461 	/* Reload the ill to match the ipif */
1462 	ill = ipif->ipif_ill;
1463 
1464 	/* If it's already duplicate or ineligible, then don't do anything. */
1465 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1466 		ipif_refrele(ipif);
1467 		goto ignore_conflict;
1468 	}
1469 
1470 	/*
1471 	 * If this is a failure during duplicate recovery, then don't
1472 	 * complain.  It may take a long time to recover.
1473 	 */
1474 	if (!ipif->ipif_was_dup) {
1475 		char ibuf[LIFNAMSIZ];
1476 		char hbuf[MAC_STR_LEN];
1477 		char sbuf[INET6_ADDRSTRLEN];
1478 
1479 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1480 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1481 		    " disabled", ibuf,
1482 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1483 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1484 	}
1485 	mutex_enter(&ill->ill_lock);
1486 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1487 	ipif->ipif_flags |= IPIF_DUPLICATE;
1488 	ill->ill_ipif_dup_count++;
1489 	mutex_exit(&ill->ill_lock);
1490 	(void) ipif_down(ipif, NULL, NULL);
1491 	(void) ipif_down_tail(ipif);
1492 	mutex_enter(&ill->ill_lock);
1493 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1494 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1495 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1496 	    ipst->ips_ip_dup_recovery > 0) {
1497 		ASSERT(ipif->ipif_recovery_id == 0);
1498 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1499 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1500 	}
1501 	mutex_exit(&ill->ill_lock);
1502 	ipif_refrele(ipif);
1503 
1504 ignore_conflict:
1505 	freemsg(mp);
1506 	ira_cleanup(&iras, B_TRUE);
1507 }
1508 
1509 /*
1510  * Handle failure by tearing down the ipifs with the specified address.  Note
1511  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1512  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1513  * we start a timer on the ipif.
1514  * The caller has to free mp.
1515  */
1516 static void
1517 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1518 {
1519 	const uchar_t	*haddr;
1520 	ill_t		*ill = ira->ira_rill;
1521 
1522 	/*
1523 	 * Ignore conflicts generated by misbehaving switches that just
1524 	 * reflect our own messages back to us.
1525 	 */
1526 
1527 	/* icmp_inbound_v6 ensures this */
1528 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1529 	haddr = ira->ira_l2src;
1530 	if (haddr != NULL &&
1531 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1532 		return;
1533 	}
1534 
1535 	if ((mp = copymsg(mp)) != NULL) {
1536 		mblk_t	*attrmp;
1537 
1538 		attrmp = ip_recv_attr_to_mblk(ira);
1539 		if (attrmp == NULL) {
1540 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1541 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1542 			freemsg(mp);
1543 		} else {
1544 			ASSERT(attrmp->b_cont == NULL);
1545 			attrmp->b_cont = mp;
1546 			mp = attrmp;
1547 			ill_refhold(ill);
1548 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1549 			    B_FALSE);
1550 		}
1551 	}
1552 }
1553 
1554 /*
1555  * Handle a discovered conflict: some other system is advertising that it owns
1556  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1557  * interface.
1558  *
1559  * Handles both IPv4 and IPv6
1560  */
1561 boolean_t
1562 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1563 {
1564 	ipif_t		*ipif;
1565 	clock_t		now;
1566 	uint_t		maxdefense;
1567 	uint_t		defs;
1568 	ill_t		*ill = ira->ira_ill;
1569 	ip_stack_t	*ipst = ill->ill_ipst;
1570 	uint32_t	elapsed;
1571 	boolean_t	isv6 = ill->ill_isv6;
1572 	ipaddr_t	ncec_addr;
1573 
1574 	if (isv6) {
1575 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1576 		    ipst);
1577 	} else {
1578 		if (arp_no_defense) {
1579 			/*
1580 			 * Yes, there is a conflict, but no, we do not
1581 			 * defend ourselves.
1582 			 */
1583 			return (B_TRUE);
1584 		}
1585 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1586 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1587 		    ipst);
1588 	}
1589 	if (ipif == NULL)
1590 		return (B_FALSE);
1591 
1592 	/*
1593 	 * First, figure out if this address is disposable.
1594 	 */
1595 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1596 		maxdefense = ipst->ips_ip_max_temp_defend;
1597 	else
1598 		maxdefense = ipst->ips_ip_max_defend;
1599 
1600 	/*
1601 	 * Now figure out how many times we've defended ourselves.  Ignore
1602 	 * defenses that happened long in the past.
1603 	 */
1604 	now = ddi_get_lbolt();
1605 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1606 	mutex_enter(&ncec->ncec_lock);
1607 	if ((defs = ncec->ncec_defense_count) > 0 &&
1608 	    elapsed > ipst->ips_ip_defend_interval) {
1609 		/*
1610 		 * ip_defend_interval has elapsed.
1611 		 * reset the defense count.
1612 		 */
1613 		ncec->ncec_defense_count = defs = 0;
1614 	}
1615 	ncec->ncec_defense_count++;
1616 	ncec->ncec_last_time_defended = now;
1617 	mutex_exit(&ncec->ncec_lock);
1618 	ipif_refrele(ipif);
1619 
1620 	/*
1621 	 * If we've defended ourselves too many times already, then give up and
1622 	 * tear down the interface(s) using this address.
1623 	 * Otherwise, caller has to defend by sending out an announce.
1624 	 */
1625 	if (defs >= maxdefense) {
1626 		if (isv6)
1627 			ndp_failure(mp, ira);
1628 		else
1629 			arp_failure(mp, ira);
1630 	} else {
1631 		return (B_TRUE); /* caller must defend this address */
1632 	}
1633 	return (B_FALSE);
1634 }
1635 
1636 /*
1637  * Handle reception of Neighbor Solicitation messages.
1638  */
1639 static void
1640 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1641 {
1642 	ill_t		*ill = ira->ira_ill, *under_ill;
1643 	nd_neighbor_solicit_t *ns;
1644 	uint32_t	hlen = ill->ill_phys_addr_length;
1645 	uchar_t		*haddr = NULL;
1646 	icmp6_t		*icmp_nd;
1647 	ip6_t		*ip6h;
1648 	ncec_t		*our_ncec = NULL;
1649 	in6_addr_t	target;
1650 	in6_addr_t	src;
1651 	int		len;
1652 	int		flag = 0;
1653 	nd_opt_hdr_t	*opt = NULL;
1654 	boolean_t	bad_solicit = B_FALSE;
1655 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1656 	boolean_t	need_ill_refrele = B_FALSE;
1657 
1658 	ip6h = (ip6_t *)mp->b_rptr;
1659 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1660 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1661 	src = ip6h->ip6_src;
1662 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1663 	target = ns->nd_ns_target;
1664 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1665 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1666 		if (ip_debug > 2) {
1667 			/* ip1dbg */
1668 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1669 			    AF_INET6, &target);
1670 		}
1671 		bad_solicit = B_TRUE;
1672 		goto done;
1673 	}
1674 	if (len > sizeof (nd_neighbor_solicit_t)) {
1675 		/* Options present */
1676 		opt = (nd_opt_hdr_t *)&ns[1];
1677 		len -= sizeof (nd_neighbor_solicit_t);
1678 		if (!ndp_verify_optlen(opt, len)) {
1679 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1680 			bad_solicit = B_TRUE;
1681 			goto done;
1682 		}
1683 	}
1684 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1685 		/* Check to see if this is a valid DAD solicitation */
1686 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1687 			if (ip_debug > 2) {
1688 				/* ip1dbg */
1689 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1690 				    "Destination is not solicited node "
1691 				    "multicast %s\n", AF_INET6,
1692 				    &ip6h->ip6_dst);
1693 			}
1694 			bad_solicit = B_TRUE;
1695 			goto done;
1696 		}
1697 	}
1698 
1699 	/*
1700 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1701 	 * received this packet if it's multicast) is not the ill tied to
1702 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1703 	 * to ensure we find the associated NCE.
1704 	 */
1705 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1706 	/*
1707 	 * If this is a valid Solicitation for an address we are publishing,
1708 	 * then a PUBLISH entry should exist in the cache
1709 	 */
1710 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1711 		ip1dbg(("ndp_input_solicit: Wrong target in NS?! "
1712 		    "ifname=%s ", ill->ill_name));
1713 		if (ip_debug > 2) {
1714 			/* ip1dbg */
1715 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1716 		}
1717 		if (our_ncec == NULL)
1718 			bad_solicit = B_TRUE;
1719 		goto done;
1720 	}
1721 
1722 	/* At this point we should have a verified NS per spec */
1723 	if (opt != NULL) {
1724 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1725 		if (opt != NULL) {
1726 			haddr = (uchar_t *)&opt[1];
1727 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1728 			    hlen == 0) {
1729 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
1730 				bad_solicit = B_TRUE;
1731 				goto done;
1732 			}
1733 		}
1734 	}
1735 
1736 	/* If sending directly to peer, set the unicast flag */
1737 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1738 		flag |= NDP_UNICAST;
1739 
1740 	/*
1741 	 * Create/update the entry for the soliciting node on the ipmp_ill,
1742 	 * or respond to outstanding queries; don't do either if
1743 	 * the source is the unspecified address.
1744 	 */
1745 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1746 		int	err;
1747 		nce_t	*nnce;
1748 
1749 		ASSERT(ill->ill_isv6);
1750 		/*
1751 		 * Regular solicitations *must* include the Source Link-Layer
1752 		 * Address option.  Ignore messages that do not.
1753 		 */
1754 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1755 			ip1dbg(("ndp_input_solicit: source link-layer address "
1756 			    "option missing with a specified source.\n"));
1757 			bad_solicit = B_TRUE;
1758 			goto done;
1759 		}
1760 
1761 		/*
1762 		 * This is a regular solicitation.  If we're still in the
1763 		 * process of verifying the address, then don't respond at all
1764 		 * and don't keep track of the sender.
1765 		 */
1766 		if (our_ncec->ncec_state == ND_PROBE)
1767 			goto done;
1768 
1769 		/*
1770 		 * If the solicitation doesn't have sender hardware address
1771 		 * (legal for unicast solicitation), then process without
1772 		 * installing the return NCE.  Either we already know it, or
1773 		 * we'll be forced to look it up when (and if) we reply to the
1774 		 * packet.
1775 		 */
1776 		if (haddr == NULL)
1777 			goto no_source;
1778 
1779 		under_ill = ill;
1780 		if (IS_UNDER_IPMP(under_ill)) {
1781 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1782 			if (ill == NULL)
1783 				ill = under_ill;
1784 			else
1785 				need_ill_refrele = B_TRUE;
1786 		}
1787 		err = nce_lookup_then_add_v6(ill,
1788 		    haddr, hlen,
1789 		    &src,	/* Soliciting nodes address */
1790 		    0,
1791 		    ND_STALE,
1792 		    &nnce);
1793 
1794 		if (need_ill_refrele) {
1795 			ill_refrele(ill);
1796 			ill = under_ill;
1797 			need_ill_refrele =  B_FALSE;
1798 		}
1799 		switch (err) {
1800 		case 0:
1801 			/* done with this entry */
1802 			nce_refrele(nnce);
1803 			break;
1804 		case EEXIST:
1805 			/*
1806 			 * B_FALSE indicates this is not an advertisement.
1807 			 */
1808 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1809 			nce_refrele(nnce);
1810 			break;
1811 		default:
1812 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1813 			    err));
1814 			goto done;
1815 		}
1816 no_source:
1817 		flag |= NDP_SOLICITED;
1818 	} else {
1819 		/*
1820 		 * No source link layer address option should be present in a
1821 		 * valid DAD request.
1822 		 */
1823 		if (haddr != NULL) {
1824 			ip1dbg(("ndp_input_solicit: source link-layer address "
1825 			    "option present with an unspecified source.\n"));
1826 			bad_solicit = B_TRUE;
1827 			goto done;
1828 		}
1829 		if (our_ncec->ncec_state == ND_PROBE) {
1830 			/*
1831 			 * Internally looped-back probes will have
1832 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1833 			 * transmissions.
1834 			 */
1835 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1836 				/*
1837 				 * If someone else is probing our address, then
1838 				 * we've crossed wires.  Declare failure.
1839 				 */
1840 				ndp_failure(mp, ira);
1841 			}
1842 			goto done;
1843 		}
1844 		/*
1845 		 * This is a DAD probe.  Multicast the advertisement to the
1846 		 * all-nodes address.
1847 		 */
1848 		src = ipv6_all_hosts_mcast;
1849 	}
1850 	flag |= nce_advert_flags(our_ncec);
1851 	(void) ndp_xmit(ill,
1852 	    ND_NEIGHBOR_ADVERT,
1853 	    our_ncec->ncec_lladdr,
1854 	    our_ncec->ncec_lladdr_length,
1855 	    &target,	/* Source and target of the advertisement pkt */
1856 	    &src,	/* IP Destination (source of original pkt) */
1857 	    flag);
1858 done:
1859 	if (bad_solicit)
1860 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1861 	if (our_ncec != NULL)
1862 		ncec_refrele(our_ncec);
1863 }
1864 
1865 /*
1866  * Handle reception of Neighbor Advertisement messages
1867  */
1868 void
1869 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1870 {
1871 	ill_t		*ill = ira->ira_ill;
1872 	nd_neighbor_advert_t *na;
1873 	uint32_t	hlen = ill->ill_phys_addr_length;
1874 	uchar_t		*haddr = NULL;
1875 	icmp6_t		*icmp_nd;
1876 	ip6_t		*ip6h;
1877 	ncec_t		*dst_ncec = NULL;
1878 	in6_addr_t	target;
1879 	nd_opt_hdr_t	*opt = NULL;
1880 	int		len;
1881 	ip_stack_t	*ipst = ill->ill_ipst;
1882 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1883 
1884 	ip6h = (ip6_t *)mp->b_rptr;
1885 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1886 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1887 	na = (nd_neighbor_advert_t *)icmp_nd;
1888 
1889 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1890 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1891 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1892 		    "solicited flag is not zero\n"));
1893 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894 		return;
1895 	}
1896 	target = na->nd_na_target;
1897 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1898 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1899 		if (ip_debug > 2) {
1900 			/* ip1dbg */
1901 			pr_addr_dbg("ndp_input_advert: Martian Target %s\n",
1902 			    AF_INET6, &target);
1903 		}
1904 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1905 		return;
1906 	}
1907 	if (len > sizeof (nd_neighbor_advert_t)) {
1908 		opt = (nd_opt_hdr_t *)&na[1];
1909 		if (!ndp_verify_optlen(opt,
1910 		    len - sizeof (nd_neighbor_advert_t))) {
1911 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1912 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1913 			return;
1914 		}
1915 		/* At this point we have a verified NA per spec */
1916 		len -= sizeof (nd_neighbor_advert_t);
1917 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1918 		if (opt != NULL) {
1919 			haddr = (uchar_t *)&opt[1];
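			/*
			 * Ensure the TLLA option is long enough to carry the
			 * link-layer address (nd_opt_len is in 8-octet units).
			 */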
1920 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1921 			    hlen == 0) {
1922 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1923 				BUMP_MIB(mib,
1924 				    ipv6IfIcmpInBadNeighborAdvertisements);
1925 				return;
1926 			}
1927 		}
1928 	}
1929 
1930 	/*
1931 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1932 	 * our local addresses, and those are spread across all the active
1933 	 * ills in the group.
1934 	 */
1935 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1936 		return;
1937 
1938 	if (NCE_PUBLISH(dst_ncec)) {
1939 		/*
1940 		 * Someone just advertised an address that we publish. First,
1941 		 * check if it was us -- if so, we can safely ignore it.
1942 		 * We don't get the haddr from the ira_l2src because, in the
1943 		 * case that the packet originated from us on an IPMP group,
1944 		 * the ira_l2src may be the link-layer address of the
1945 		 * cast_ill used to send the packet, which may not be the same
1946 		 * as the dst_ncec->ncec_lladdr of the address.
1947 		 */
1948 		if (haddr != NULL) {
1949 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1950 				goto out;
1951 
1952 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1953 				goto out;   /* from us -- no conflict */
1954 
1955 			/*
1956 			 * If we're in an IPMP group, check if this is an echo
1957 			 * from another ill in the group.  Use the double-
1958 			 * checked locking pattern to avoid grabbing
1959 			 * ill_g_lock in the non-IPMP case.
1960 			 */
1961 			if (IS_UNDER_IPMP(ill)) {
1962 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1963 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1964 				    ill->ill_grp, haddr, hlen) != NULL) {
1965 					rw_exit(&ipst->ips_ill_g_lock);
1966 					goto out;
1967 				}
1968 				rw_exit(&ipst->ips_ill_g_lock);
1969 			}
1970 		}
1971 
1972 		/*
1973 		 * This appears to be a real conflict.  If we're trying to
1974 		 * configure this NCE (ND_PROBE), then shut it down.
1975 		 * Otherwise, handle the discovered conflict.
1976 		 */
1977 		if (dst_ncec->ncec_state == ND_PROBE) {
1978 			ndp_failure(mp, ira);
1979 		} else {
1980 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1981 				char hbuf[MAC_STR_LEN];
1982 				char sbuf[INET6_ADDRSTRLEN];
1983 
1984 				cmn_err(CE_WARN,
1985 				    "node '%s' is using %s on %s",
1986 				    inet_ntop(AF_INET6, &target, sbuf,
1987 				    sizeof (sbuf)),
1988 				    haddr == NULL ? "<none>" :
1989 				    mac_colon_addr(haddr, hlen, hbuf,
1990 				    sizeof (hbuf)), ill->ill_name);
1991 				/*
1992 				 * RFC 4862, Section 5.4.4 does not mandate
1993 				 * any specific behavior when an NA matches
1994 				 * a non-tentative address assigned to the
1995 				 * receiver. We make the choice of defending
1996 				 * our address, based on the assumption that
1997 				 * the sender has not detected the Duplicate.
1998 				 *
1999 				 * ncec_last_time_defended has been adjusted
2000 				 * in ip_nce_conflict()
2001 				 */
2002 				(void) ndp_announce(dst_ncec);
2003 			}
2004 		}
2005 	} else {
2006 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2007 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2008 
2009 		/* B_TRUE indicates this is an advertisement */
2010 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2011 	}
2012 out:
2013 	ncec_refrele(dst_ncec);
2014 }
2015 
2016 /*
2017  * Process NDP neighbor solicitation/advertisement messages.
2018  * The checksum has already been verified before reaching here.
2019  * Information about the datalink header is contained in ira_l2src, but
2020  * that should be ignored for loopback packets.
2021  */
2022 void
2023 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2024 {
2025 	ill_t		*ill = ira->ira_rill;
2026 	icmp6_t		*icmp_nd;
2027 	ip6_t		*ip6h;
2028 	int		len;
2029 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2030 	ill_t		*orig_ill = NULL;
2031 
2032 	/*
2033 	 * Since ira_ill is where the IRE_LOCAL was hosted, we use ira_rill
2034 	 * and make it be the IPMP upper ill to avoid being confused by a
2035 	 * packet addressed to a unicast address on a different ill.
2036 	 */
2037 	if (IS_UNDER_IPMP(ill)) {
2038 		orig_ill = ill;
2039 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2040 		if (ill == NULL) {
2041 			ill = orig_ill;
2042 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2043 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2044 			    mp, ill);
2045 			freemsg(mp);
2046 			return;
2047 		}
2048 		ASSERT(ill != orig_ill);
2049 		orig_ill = ira->ira_ill;
2050 		ira->ira_ill = ill;
2051 		mib = ill->ill_icmp6_mib;
2052 	}
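	/* Pull the whole message into a single contiguous mblk for parsing. */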
2053 	if (!pullupmsg(mp, -1)) {
2054 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2055 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2056 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2057 		goto done;
2058 	}
2059 	ip6h = (ip6_t *)mp->b_rptr;
2060 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2061 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2062 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2063 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2064 		goto done;
2065 	}
2066 	/*
2067 	 * NDP does not accept any extension headers between the
2068 	 * IP header and the ICMP header since e.g. a routing
2069 	 * header could be dangerous.
2070 	 * This assumes that any AH or ESP headers are removed
2071 	 * by ip prior to passing the packet to ndp_input.
2072 	 */
2073 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2074 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2075 		    ip6h->ip6_nxt));
2076 		ip_drop_input("Wrong next header", mp, ill);
2077 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2078 		goto done;
2079 	}
2080 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2081 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2082 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2083 	if (icmp_nd->icmp6_code != 0) {
2084 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2085 		ip_drop_input("code non-zero", mp, ill);
2086 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087 		goto done;
2088 	}
2089 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2090 	/*
2091 	 * Make sure packet length is large enough for either
2092 	 * a NS or a NA icmp packet.
2093 	 */
2094 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2095 		ip1dbg(("ndp_input: packet too short\n"));
2096 		ip_drop_input("packet too short", mp, ill);
2097 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2098 		goto done;
2099 	}
2100 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2101 		ndp_input_solicit(mp, ira);
2102 	} else {
2103 		ndp_input_advert(mp, ira);
2104 	}
2105 done:
2106 	freemsg(mp);
2107 	if (orig_ill != NULL) {
2108 		ill_refrele(ill);
2109 		ira->ira_ill = orig_ill;
2110 	}
2111 }
2112 
2113 /*
2114  * ndp_xmit is called to form and transmit a ND solicitation or
2115  * advertisement ICMP packet.
2116  *
2117  * If the source address is unspecified and this isn't a probe (used for
2118  * duplicate address detection), an appropriate source address and link layer
2119  * address will be chosen here.  The link layer address option is included if
2120  * the source is specified (i.e., all non-probe packets), and omitted (per the
2121  * specification) otherwise.
2122  *
2123  * It returns B_TRUE only if allocation of the outgoing message fails
2124  * (i.e., the packet was dropped); otherwise it returns B_FALSE.
2125  */
2126 static boolean_t
2127 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2128     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2129 {
2130 	uint32_t	len;
2131 	icmp6_t		*icmp6;
2132 	mblk_t		*mp;
2133 	ip6_t		*ip6h;
2134 	nd_opt_hdr_t	*opt;
2135 	uint_t		plen;
2136 	zoneid_t	zoneid = GLOBAL_ZONEID;
2137 	ill_t		*hwaddr_ill = ill;
2138 	ip_xmit_attr_t	ixas;
2139 	ip_stack_t	*ipst = ill->ill_ipst;
2140 	boolean_t	need_refrele = B_FALSE;
2141 	boolean_t	probe = B_FALSE;
2142 
2143 	if (IS_UNDER_IPMP(ill)) {
2144 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2145 		/*
2146 		 * We send non-probe packets on the upper IPMP interface.
2147 		 * ip_output_simple() will use cast_ill for sending any
2148 		 * multicast packets. Note that we can't follow the same
2149 		 * logic for probe packets because all interfaces in the ipmp
2150 		 * group may have failed, so that we really want to only try
2151 		 * to send the ND packet on the ill corresponding to the src
2152 		 * address.
2153 		 */
2154 		if (!probe) {
2155 			ill = ipmp_ill_hold_ipmp_ill(ill);
2156 			if (ill != NULL)
2157 				need_refrele = B_TRUE;
2158 			else
2159 				ill = hwaddr_ill;
2160 		}
2161 	}
2162 
2163 	/*
2164 	 * If we have an unspecified source (sender) address, select a
2165 	 * proper source address for the solicitation here so that we
2166 	 * can initialize the h/w address correctly.
2167 	 *
2168 	 * If the sender is specified then we use this address in order
2169 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2170 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2171 	 * by IP (we cannot guarantee that the global zone has an interface
2172 	 * route to the destination).
2173 	 *
2174 	 * Note that the NA never comes here with the unspecified source
2175 	 * address.
2176 	 */
2177 
2178 	/*
2179 	 * Probes will have unspec src at this point.
2180 	 */
2181 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2182 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2183 		/*
2184 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2185 		 * ALL_ZONES if it cannot find a matching ipif for the address
2186 		 * we are trying to use. In this case we err on the side of
2187 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2188 		 */
2189 		if (zoneid == ALL_ZONES)
2190 			zoneid = GLOBAL_ZONEID;
2191 	}
2192 
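	/*
	 * The link-layer address option length is expressed in 8-octet
	 * units; round the header-plus-address size up accordingly.
	 */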
2193 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2194 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2195 	mp = allocb(len,  BPRI_LO);
2196 	if (mp == NULL) {
2197 		if (need_refrele)
2198 			ill_refrele(ill);
2199 		return (B_TRUE);
2200 	}
2201 
2202 	bzero((char *)mp->b_rptr, len);
2203 	mp->b_wptr = mp->b_rptr + len;
2204 
2205 	bzero(&ixas, sizeof (ixas));
2206 	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2207 
2208 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2209 	ixas.ixa_ipst = ipst;
2210 	ixas.ixa_cred = kcred;
2211 	ixas.ixa_cpid = NOPID;
2212 	ixas.ixa_tsl = NULL;
2213 	ixas.ixa_zoneid = zoneid;
2214 
2215 	ip6h = (ip6_t *)mp->b_rptr;
2216 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2217 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2218 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2219 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2220 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2221 	ip6h->ip6_dst = *target;
2222 	icmp6 = (icmp6_t *)&ip6h[1];
2223 
2224 	if (hw_addr_len != 0) {
2225 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2226 		    sizeof (nd_neighbor_advert_t));
2227 	} else {
2228 		opt = NULL;
2229 	}
2230 	if (operation == ND_NEIGHBOR_SOLICIT) {
2231 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2232 
2233 		if (opt != NULL && !(flag & NDP_PROBE)) {
2234 			/*
2235 			 * Note that we don't send out SLLA for ND probes
2236 			 * per RFC 4862, even though we do send out the src
2237 			 * haddr for IPv4 DAD probes; in both cases the probe
2238 			 * goes out with the unspecified/INADDR_ANY src IP
2239 			 * addr.
2240 			 */
2241 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2242 		}
2243 		ip6h->ip6_src = *sender;
2244 		ns->nd_ns_target = *target;
2245 		if (!(flag & NDP_UNICAST)) {
2246 			/* Form the solicited-node multicast address of the target */
2247 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2248 			ip6h->ip6_dst.s6_addr32[3] |=
2249 			    ns->nd_ns_target.s6_addr32[3];
2250 		}
2251 	} else {
2252 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2253 
2254 		ASSERT(!(flag & NDP_PROBE));
2255 		if (opt != NULL)
2256 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2257 		ip6h->ip6_src = *sender;
2258 		na->nd_na_target = *sender;
2259 		if (flag & NDP_ISROUTER)
2260 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2261 		if (flag & NDP_SOLICITED)
2262 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2263 		if (flag & NDP_ORIDE)
2264 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2265 	}
2266 
2267 	if (!(flag & NDP_PROBE)) {
2268 		if (hw_addr != NULL && opt != NULL) {
2269 			/* Fill in link layer address and option len */
2270 			opt->nd_opt_len = (uint8_t)plen;
2271 			bcopy(hw_addr, &opt[1], hw_addr_len);
2272 		}
2273 	}
2274 	if (opt != NULL && opt->nd_opt_type == 0) {
2275 		/* If there's no link layer address option, then strip it. */
2276 		len -= plen * 8;
2277 		mp->b_wptr = mp->b_rptr + len;
2278 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2279 	}
2280 
2281 	icmp6->icmp6_type = (uint8_t)operation;
2282 	icmp6->icmp6_code = 0;
2283 	/*
2284 	 * Prepare for checksum by putting icmp length in the icmp
2285 	 * checksum field. The checksum is calculated in ip_output.c.
2286 	 */
2287 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2288 
2289 	(void) ip_output_simple(mp, &ixas);
2290 	ixa_cleanup(&ixas);
2291 	if (need_refrele)
2292 		ill_refrele(ill);
2293 	return (B_FALSE);
2294 }
2295 
2296 /*
2297  * Used to set ND_UNREACHABLE before ncec_delete marks it NCE_F_CONDEMNED.
2298  * The datapath uses this as an indication that there
2299  * is a problem (as opposed to an NCE that was just
2300  * reclaimed due to lack of memory).
2301  * Note that static ARP entries never become unreachable.
2302  */
2303 void
2304 nce_make_unreachable(ncec_t *ncec)
2305 {
2306 	mutex_enter(&ncec->ncec_lock);
2307 	ncec->ncec_state = ND_UNREACHABLE;
2308 	mutex_exit(&ncec->ncec_lock);
2309 }
2310 
2311 /*
2312  * NCE retransmit timer. Common to IPv4 and IPv6.
2313  * This timer goes off when:
2314  * a. It is time to retransmit a resolution for resolver.
2315  * b. It is time to send reachability probes.
2316  */
2317 void
2318 nce_timer(void *arg)
2319 {
2320 	ncec_t		*ncec = arg;
2321 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2322 	char		addrbuf[INET6_ADDRSTRLEN];
2323 	boolean_t	dropped = B_FALSE;
2324 	ip_stack_t	*ipst = ncec->ncec_ipst;
2325 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2326 	in_addr_t	sender4 = INADDR_ANY;
2327 	in6_addr_t	sender6 = ipv6_all_zeros;
2328 
2329 	/*
2330 	 * The timer has to be cancelled by ncec_delete before doing the final
2331 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2332 	 * until it clears the timeout_id. Before clearing the timeout_id,
2333 	 * bump up the refcnt so that we can continue to use the ncec.
2334 	 */
2335 	ASSERT(ncec != NULL);
2336 	mutex_enter(&ncec->ncec_lock);
2337 	ncec_refhold_locked(ncec);
2338 	ncec->ncec_timeout_id = 0;
2339 	mutex_exit(&ncec->ncec_lock);
2340 
2341 	src_ill = nce_resolve_src(ncec, &sender6);
2342 	/* if we could not find a sender address, return */
2343 	if (src_ill == NULL) {
2344 		if (!isv6) {
2345 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2346 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2347 			    &sender4, addrbuf, sizeof (addrbuf))));
2348 		} else {
2349 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2350 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2351 		}
2352 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2353 		ncec_refrele(ncec);
2354 		return;
2355 	}
2356 	if (!isv6)
2357 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2358 
2359 	mutex_enter(&ncec->ncec_lock);
2360 	/*
2361 	 * Check the reachability state.
2362 	 */
2363 	switch (ncec->ncec_state) {
2364 	case ND_DELAY:
2365 		ASSERT(ncec->ncec_lladdr != NULL);
2366 		ncec->ncec_state = ND_PROBE;
2367 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2368 		if (isv6) {
2369 			mutex_exit(&ncec->ncec_lock);
2370 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2371 			    src_ill->ill_phys_addr,
2372 			    src_ill->ill_phys_addr_length,
2373 			    &sender6, &ncec->ncec_addr,
2374 			    NDP_UNICAST);
2375 		} else {
2376 			dropped = (arp_request(ncec, sender4, src_ill) == 0);
2377 			mutex_exit(&ncec->ncec_lock);
2378 		}
2379 		if (!dropped) {
2380 			mutex_enter(&ncec->ncec_lock);
2381 			ncec->ncec_pcnt--;
2382 			mutex_exit(&ncec->ncec_lock);
2383 		}
2384 		if (ip_debug > 3) {
2385 			/* ip2dbg */
2386 			pr_addr_dbg("nce_timer: state for %s changed "
2387 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2388 		}
2389 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2390 		break;
2391 	case ND_PROBE:
2392 		/* must be retransmit timer */
2393 		ASSERT(ncec->ncec_pcnt >= -1);
2394 		if (ncec->ncec_pcnt > 0) {
2395 			/*
2396 			 * As per RFC2461, the ncec gets deleted after
2397 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2398 			 * Note that the first unicast solicitation is sent
2399 			 * during the DELAY state.
2400 			 */
2401 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2402 			    ncec->ncec_pcnt,
2403 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2404 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2405 			if (NCE_PUBLISH(ncec)) {
2406 				mutex_exit(&ncec->ncec_lock);
2407 				/*
2408 				 * send out a probe; note that src_ill
2409 				 * is ignored by nce_dad() for all
2410 				 * DAD message types other than IPv6
2411 				 * unicast probes
2412 				 */
2413 				nce_dad(ncec, src_ill, B_TRUE);
2414 			} else {
2415 				ASSERT(src_ill != NULL);
2416 				if (isv6) {
2417 					mutex_exit(&ncec->ncec_lock);
2418 					dropped = ndp_xmit(src_ill,
2419 					    ND_NEIGHBOR_SOLICIT,
2420 					    src_ill->ill_phys_addr,
2421 					    src_ill->ill_phys_addr_length,
2422 					    &sender6, &ncec->ncec_addr,
2423 					    NDP_UNICAST);
2424 				} else {
2425 					/*
2426 					 * since the nce is REACHABLE,
2427 					 * the ARP request will be sent out
2428 					 * as a link-layer unicast.
2429 					 */
2430 					dropped = (arp_request(ncec, sender4,
2431 					    src_ill) == 0);
2432 					mutex_exit(&ncec->ncec_lock);
2433 				}
2434 				if (!dropped) {
2435 					mutex_enter(&ncec->ncec_lock);
2436 					ncec->ncec_pcnt--;
2437 					mutex_exit(&ncec->ncec_lock);
2438 				}
2439 				nce_restart_timer(ncec,
2440 				    ill->ill_reachable_retrans_time);
2441 			}
2442 		} else if (ncec->ncec_pcnt < 0) {
2443 			/* No hope, delete the ncec */
2444 			/* Tell datapath it went bad */
2445 			ncec->ncec_state = ND_UNREACHABLE;
2446 			mutex_exit(&ncec->ncec_lock);
2447 			if (ip_debug > 2) {
2448 				/* ip1dbg */
2449 				pr_addr_dbg("nce_timer: Delete NCE for"
2450 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2451 				    &ncec->ncec_addr);
2452 			}
2453 			/* Static ARP entries can't be deleted. */
2454 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2455 				ncec_delete(ncec);
2456 
2457 		} else if (!NCE_PUBLISH(ncec)) {
2458 			/*
2459 			 * Probe count is 0 for a dynamic entry (one that we
2460 			 * ourselves are not publishing). We should never get
2461 			 * here if NONUD was requested, hence the ASSERT below.
2462 			 */
2463 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2464 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2465 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2466 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2467 			ncec->ncec_pcnt--;
2468 			mutex_exit(&ncec->ncec_lock);
2469 			/* Wait one interval before killing */
2470 			nce_restart_timer(ncec,
2471 			    ill->ill_reachable_retrans_time);
2472 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2473 			ipif_t *ipif;
2474 			ipaddr_t ncec_addr;
2475 
2476 			/*
2477 			 * We're done probing, and we can now declare this
2478 			 * address to be usable.  Let IP know that it's ok to
2479 			 * use.
2480 			 */
2481 			ncec->ncec_state = ND_REACHABLE;
2482 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2483 			mutex_exit(&ncec->ncec_lock);
2484 			if (isv6) {
2485 				ipif = ipif_lookup_addr_exact_v6(
2486 				    &ncec->ncec_addr, ill, ipst);
2487 			} else {
2488 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2489 				    ncec_addr);
2490 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2491 				    ipst);
2492 			}
2493 			if (ipif != NULL) {
2494 				if (ipif->ipif_was_dup) {
2495 					char ibuf[LIFNAMSIZ];
2496 					char sbuf[INET6_ADDRSTRLEN];
2497 
2498 					ipif->ipif_was_dup = B_FALSE;
2499 					(void) inet_ntop(AF_INET6,
2500 					    &ipif->ipif_v6lcl_addr,
2501 					    sbuf, sizeof (sbuf));
2502 					ipif_get_name(ipif, ibuf,
2503 					    sizeof (ibuf));
2504 					cmn_err(CE_NOTE, "recovered address "
2505 					    "%s on %s", sbuf, ibuf);
2506 				}
2507 				if ((ipif->ipif_flags & IPIF_UP) &&
2508 				    !ipif->ipif_addr_ready)
2509 					ipif_up_notify(ipif);
2510 				ipif->ipif_addr_ready = 1;
2511 				ipif_refrele(ipif);
2512 			}
2513 			if (!isv6 && arp_no_defense)
2514 				break;
2515 			/* Begin defending our new address */
2516 			if (ncec->ncec_unsolicit_count > 0) {
2517 				ncec->ncec_unsolicit_count--;
2518 				if (isv6) {
2519 					dropped = ndp_announce(ncec);
2520 				} else {
2521 					dropped = arp_announce(ncec);
2522 				}
2523 
2524 				if (dropped)
2525 					ncec->ncec_unsolicit_count++;
2526 				else
2527 					ncec->ncec_last_time_defended =
2528 					    ddi_get_lbolt();
2529 			}
2530 			if (ncec->ncec_unsolicit_count > 0) {
2531 				nce_restart_timer(ncec,
2532 				    ANNOUNCE_INTERVAL(isv6));
2533 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2534 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2535 			}
2536 		} else {
2537 			/*
2538 			 * This is an address we're probing to be our own, but
2539 			 * the ill is down.  Wait until it comes back before
2540 			 * doing anything, but switch to reachable state so
2541 			 * that the restart will work.
2542 			 */
2543 			ncec->ncec_state = ND_REACHABLE;
2544 			mutex_exit(&ncec->ncec_lock);
2545 		}
2546 		break;
2547 	case ND_INCOMPLETE: {
2548 		mblk_t	*mp, *nextmp;
2549 		mblk_t	**prevmpp;
2550 
2551 		/*
2552 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2553 		 * for any IPMP probe packets, and toss them.  IPMP probe
2554 		 * packets will always be at the head of ncec_qd_mp, so that
2555 		 * we can stop at the first queued ND packet that is
2556 		 * not a probe packet.
2557 		 */
2558 		prevmpp = &ncec->ncec_qd_mp;
2559 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2560 			nextmp = mp->b_next;
2561 
2562 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2563 				inet_freemsg(mp);
2564 				ncec->ncec_nprobes--;
2565 				*prevmpp = nextmp;
2566 			} else {
2567 				prevmpp = &mp->b_next;
2568 			}
2569 		}
2570 
2571 		/*
2572 		 * Must be resolver's retransmit timer.
2573 		 */
2574 		mutex_exit(&ncec->ncec_lock);
2575 		ip_ndp_resolve(ncec);
2576 		break;
2577 	}
2578 	case ND_REACHABLE:
2579 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2580 		    ncec->ncec_unsolicit_count != 0) ||
2581 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2582 			if (ncec->ncec_unsolicit_count > 0) {
2583 				ncec->ncec_unsolicit_count--;
2584 				mutex_exit(&ncec->ncec_lock);
2585 				/*
2586 				 * When we get to zero announcements left,
2587 				 * switch to address defense
2588 				 */
2589 			} else {
2590 				boolean_t rate_limit;
2591 
2592 				mutex_exit(&ncec->ncec_lock);
2593 				rate_limit = ill_defend_rate_limit(ill, ncec);
2594 				if (rate_limit) {
2595 					nce_restart_timer(ncec,
2596 					    DEFENSE_INTERVAL(isv6));
2597 					break;
2598 				}
2599 			}
2600 			if (isv6) {
2601 				dropped = ndp_announce(ncec);
2602 			} else {
2603 				dropped = arp_announce(ncec);
2604 			}
2605 			mutex_enter(&ncec->ncec_lock);
2606 			if (dropped) {
2607 				ncec->ncec_unsolicit_count++;
2608 			} else {
2609 				ncec->ncec_last_time_defended =
2610 				    ddi_get_lbolt();
2611 			}
2612 			mutex_exit(&ncec->ncec_lock);
2613 			if (ncec->ncec_unsolicit_count != 0) {
2614 				nce_restart_timer(ncec,
2615 				    ANNOUNCE_INTERVAL(isv6));
2616 			} else {
2617 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2618 			}
2619 		} else {
2620 			mutex_exit(&ncec->ncec_lock);
2621 		}
2622 		break;
2623 	default:
2624 		mutex_exit(&ncec->ncec_lock);
2625 		break;
2626 	}
2627 done:
2628 	ncec_refrele(ncec);
2629 	ill_refrele(src_ill);
2630 }
2631 
2632 /*
2633  * Set a link layer address from the ll_addr passed in.
2634  * Copy SAP from ill.
2635  */
2636 static void
2637 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2638 {
2639 	ill_t	*ill = ncec->ncec_ill;
2640 
2641 	ASSERT(ll_addr != NULL);
2642 	if (ill->ill_phys_addr_length > 0) {
2643 		/*
2644 		 * The bcopy() below used to be called for the physical address
2645 		 * length rather than the link layer address length. For
2646 		 * ethernet and many other media, the phys_addr and lla are
2647 		 * identical.
2648 		 *
2649 		 * The phys_addr and lla may not be the same for devices that
2650 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2651 		 * no known instances of these.
2652 		 *
2653 		 * For PPP or other interfaces with a zero length
2654 		 * physical address, don't do anything here.
2655 		 * The bcopy() with a zero phys_addr length was previously
2656 		 * a no-op for interfaces with a zero-length physical address.
2657 		 * Using the lla for them would change the way they operate.
2658 		 * Doing nothing in such cases preserves expected behavior.
2659 		 */
2660 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2661 	}
2662 }
2663 
2664 boolean_t
2665 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2666     uint32_t ll_addr_len)
2667 {
2668 	ASSERT(ncec->ncec_lladdr != NULL);
2669 	if (ll_addr == NULL)
2670 		return (B_FALSE);
2671 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2672 		return (B_TRUE);
2673 	return (B_FALSE);
2674 }
2675 
2676 /*
2677  * Updates the link layer address or the reachability state of
2678  * a cache entry.  Reset probe counter if needed.
2679  */
2680 void
2681 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2682 {
2683 	ill_t	*ill = ncec->ncec_ill;
2684 	boolean_t need_stop_timer = B_FALSE;
2685 	boolean_t need_fastpath_update = B_FALSE;
2686 	nce_t	*nce = NULL;
2687 	timeout_id_t tid;
2688 
2689 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2690 	/*
2691 	 * If this interface does not do NUD, there is no point
2692 	 * in allowing an update to the cache entry (although
2693 	 * we will still respond to NS).
2694 	 * The only time we accept an update for a resolver when
2695 	 * NUD is turned off is when it has just been created.
2696 	 * Non-Resolvers will always be created as REACHABLE.
2697 	 */
2698 	if (new_state != ND_UNCHANGED) {
2699 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2700 		    (ncec->ncec_state != ND_INCOMPLETE))
2701 			return;
2702 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2703 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2704 		need_stop_timer = B_TRUE;
2705 		if (new_state == ND_REACHABLE)
2706 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2707 		else {
2708 			/* We force NUD in this case */
2709 			ncec->ncec_last = 0;
2710 		}
2711 		ncec->ncec_state = new_state;
2712 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2713 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2714 		    new_state == ND_INCOMPLETE);
2715 	}
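	/*
	 * Snapshot the timeout id while holding ncec_lock; the untimeout()
	 * itself is done after the lock is dropped, since the timer callback
	 * also acquires ncec_lock.
	 */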
2716 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2717 		tid = ncec->ncec_timeout_id;
2718 		ncec->ncec_timeout_id = 0;
2719 	}
2720 	/*
2721 	 * Re-trigger the fastpath probe and overwrite the DL_UNITDATA_REQ
2722 	 * data, noting that we may lose whatever packet happens to be in
2723 	 * transmission at the time.
2724 	 */
2725 	if (new_ll_addr != NULL) {
2726 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2727 		    ill->ill_phys_addr_length);
2728 		need_fastpath_update = B_TRUE;
2729 	}
2730 	mutex_exit(&ncec->ncec_lock);
2731 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2732 		if (tid != 0)
2733 			(void) untimeout(tid);
2734 	}
2735 	if (need_fastpath_update) {
2736 		/*
2737 		 * Delete any existing dlur_mp and fp_mp information.
2738 		 * For IPMP interfaces, all underlying ills must be checked
2739 		 * and purged.
2740 		 */
2741 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2742 		/*
2743 		 * add the new dlur_mp and fp_mp
2744 		 */
2745 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2746 		if (nce != NULL)
2747 			nce_refrele(nce);
2748 	}
2749 	mutex_enter(&ncec->ncec_lock);
2750 }
2751 
2752 static void
2753 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2754 {
2755 	uint_t	count = 0;
2756 	mblk_t  **mpp, *tmp;
2757 
2758 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2759 
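	/*
	 * If the queue already holds ill_max_buf packets, drop the oldest
	 * (head) entry to make room for the new one.
	 */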
2760 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2761 		if (++count > ncec->ncec_ill->ill_max_buf) {
2762 			tmp = ncec->ncec_qd_mp->b_next;
2763 			ncec->ncec_qd_mp->b_next = NULL;
2764 			/*
2765 			 * If we never create data addrs on the under_ill,
2766 			 * does this matter?
2767 			 */
2768 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2769 			    ipIfStatsOutDiscards);
2770 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2771 			    ncec->ncec_ill);
2772 			freemsg(ncec->ncec_qd_mp);
2773 			ncec->ncec_qd_mp = tmp;
2774 		}
2775 	}
2776 
2777 	if (head_insert) {
2778 		ncec->ncec_nprobes++;
2779 		mp->b_next = ncec->ncec_qd_mp;
2780 		ncec->ncec_qd_mp = mp;
2781 	} else {
2782 		*mpp = mp;
2783 	}
2784 }
2785 
2786 /*
2787  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2788  * queued at the head or tail of the queue based on the input argument
2789  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2790  * packet is an IPMP probe packet, in which case the following happens:
2791  *
2792  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2793  *	(non-ipmp_probe) load-speading case where the source address of the ND
2794  *	(non-ipmp_probe) load-spreading case where the source address of the ND
2795  *	packet is not tied to ncec_ill. If the ill bound to the source address
2796  *	cannot receive, the response to the ND packet will not be received.
2797  *	However, if ND packets for ncec_ill's probes are queued behind that ND
2798  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2799  *	erroneously conclude that ncec_ill has also failed.
2800  *
2801  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2802  *	the first attempt.  This ensures that ND problems do not manifest as
2803  *
2804  * We achieve this by inserting ipmp_probe() packets at the head of the
2805  * nce_queue.
2806  *
2807  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2808  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2809  */
2810 void
2811 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2812 {
2813 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2814 	nce_queue_mp_common(ncec, mp, head_insert);
2815 }
2816 
2817 /*
2818  * Called when address resolution fails due to a timeout.
2819  * Send an ICMP unreachable in response to all queued packets.
2820  */
2821 void
2822 ndp_resolv_failed(ncec_t *ncec)
2823 {
2824 	mblk_t	*mp, *nxt_mp;
2825 	char	buf[INET6_ADDRSTRLEN];
2826 	ill_t *ill = ncec->ncec_ill;
2827 	ip_recv_attr_t	iras;
2828 
2829 	bzero(&iras, sizeof (iras));
2830 	iras.ira_flags = 0;
2831 	/*
2832 	 * we are setting the ira_rill to the ipmp_ill (instead of
2833 	 * is ok because we don't actually need the real ira_rill
2834 	 * to send the icmp unreachable to the sender.
2835 	 * to send the icmp unreachable to the sender.
2836 	 */
2837 	iras.ira_ill = iras.ira_rill = ill;
2838 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2839 	iras.ira_rifindex = iras.ira_ruifindex;
2840 
2841 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2842 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2843 	mutex_enter(&ncec->ncec_lock);
2844 	mp = ncec->ncec_qd_mp;
2845 	ncec->ncec_qd_mp = NULL;
2846 	ncec->ncec_nprobes = 0;
2847 	mutex_exit(&ncec->ncec_lock);
2848 	while (mp != NULL) {
2849 		nxt_mp = mp->b_next;
2850 		mp->b_next = NULL;
2851 
2852 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2853 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2854 		    mp, ill);
2855 		icmp_unreachable_v6(mp,
2856 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2857 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2858 		mp = nxt_mp;
2859 	}
2860 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2861 }
2862 
2863 /*
2864  * Handle the completion of NDP and ARP resolution.
2865  */
2866 void
2867 nce_resolv_ok(ncec_t *ncec)
2868 {
2869 	mblk_t *mp;
2870 	uint_t pkt_len;
2871 	iaflags_t ixaflags = IXAF_NO_TRACE;
2872 	nce_t *nce;
2873 	ill_t	*ill = ncec->ncec_ill;
2874 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2875 	ip_stack_t *ipst = ill->ill_ipst;
2876 
2877 	if (IS_IPMP(ncec->ncec_ill)) {
2878 		nce_resolv_ipmp_ok(ncec);
2879 		return;
2880 	}
2881 	/* non IPMP case */
2882 
2883 	mutex_enter(&ncec->ncec_lock);
2884 	ASSERT(ncec->ncec_nprobes == 0);
2885 	mp = ncec->ncec_qd_mp;
2886 	ncec->ncec_qd_mp = NULL;
2887 	mutex_exit(&ncec->ncec_lock);
2888 
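	/* Drain the packets that were queued awaiting resolution. */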
2889 	while (mp != NULL) {
2890 		mblk_t *nxt_mp;
2891 
2892 		if (ill->ill_isv6) {
2893 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2894 
2895 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2896 		} else {
2897 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2898 
2899 			ixaflags |= IXAF_IS_IPV4;
2900 			pkt_len = ntohs(ipha->ipha_length);
2901 		}
2902 		nxt_mp = mp->b_next;
2903 		mp->b_next = NULL;
2904 		/*
2905 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2906 		 * longer available, but it's ok to drop this flag because TCP
2907 		 * has its own flow control, so TCP packets are not likely to
2908 		 * get here while flow control is in effect.
2909 		 */
2910 		mutex_enter(&ill->ill_lock);
2911 		nce = nce_lookup(ill, &ncec->ncec_addr);
2912 		mutex_exit(&ill->ill_lock);
2913 
2914 		if (nce == NULL) {
2915 			if (isv6) {
2916 				BUMP_MIB(&ipst->ips_ip6_mib,
2917 				    ipIfStatsOutDiscards);
2918 			} else {
2919 				BUMP_MIB(&ipst->ips_ip_mib,
2920 				    ipIfStatsOutDiscards);
2921 			}
2922 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2923 			    mp, NULL);
2924 			freemsg(mp);
2925 		} else {
2926 			/*
2927 			 * We don't know the zoneid, but
2928 			 * ip_xmit does not care since IXAF_NO_TRACE
2929 			 * is set. (We traced the packet the first
2930 			 * time through ip_xmit.)
2931 			 */
2932 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2933 			    ALL_ZONES, 0, NULL);
2934 			nce_refrele(nce);
2935 		}
2936 		mp = nxt_mp;
2937 	}
2938 
2939 	ncec_cb_dispatch(ncec); /* complete callbacks */
2940 }
2941 
2942 /*
2943  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2944  * and the corresponding attributes.
2945  * Disallow states other than ND_REACHABLE or ND_STALE.
2946  */
2947 int
2948 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2949 {
2950 	sin6_t		*sin6;
2951 	in6_addr_t	*addr;
2952 	ncec_t		*ncec;
2953 	nce_t		*nce;
2954 	int		err = 0;
2955 	uint16_t	new_flags = 0;
2956 	uint16_t	old_flags = 0;
2957 	int		inflags = lnr->lnr_flags;
2958 	ip_stack_t	*ipst = ill->ill_ipst;
2959 	boolean_t	do_postprocess = B_FALSE;
2960 
2961 	ASSERT(ill->ill_isv6);
2962 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2963 	    (lnr->lnr_state_create != ND_STALE))
2964 		return (EINVAL);
2965 
2966 	sin6 = (sin6_t *)&lnr->lnr_addr;
2967 	addr = &sin6->sin6_addr;
2968 
2969 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2970 	ASSERT(!IS_UNDER_IPMP(ill));
2971 	nce = nce_lookup_addr(ill, addr);
2972 	if (nce != NULL)
2973 		new_flags = nce->nce_common->ncec_flags;
2974 
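	/* NDF_ISROUTER_ON and NDF_ISROUTER_OFF are mutually exclusive. */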
2975 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2976 	case NDF_ISROUTER_ON:
2977 		new_flags |= NCE_F_ISROUTER;
2978 		break;
2979 	case NDF_ISROUTER_OFF:
2980 		new_flags &= ~NCE_F_ISROUTER;
2981 		break;
2982 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2983 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2984 		if (nce != NULL)
2985 			nce_refrele(nce);
2986 		return (EINVAL);
2987 	}
2988 	if (inflags & NDF_STATIC)
2989 		new_flags |= NCE_F_STATIC;
2990 
2991 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2992 	case NDF_ANYCAST_ON:
2993 		new_flags |= NCE_F_ANYCAST;
2994 		break;
2995 	case NDF_ANYCAST_OFF:
2996 		new_flags &= ~NCE_F_ANYCAST;
2997 		break;
2998 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2999 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3000 		if (nce != NULL)
3001 			nce_refrele(nce);
3002 		return (EINVAL);
3003 	}
3004 
3005 	if (nce == NULL) {
3006 		err = nce_add_v6(ill,
3007 		    (uchar_t *)lnr->lnr_hdw_addr,
3008 		    ill->ill_phys_addr_length,
3009 		    addr,
3010 		    new_flags,
3011 		    lnr->lnr_state_create,
3012 		    &nce);
3013 		if (err != 0) {
3014 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3015 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3016 			return (err);
3017 		} else {
3018 			do_postprocess = B_TRUE;
3019 		}
3020 	}
3021 	ncec = nce->nce_common;
3022 	old_flags = ncec->ncec_flags;
3023 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3024 		ncec_router_to_host(ncec);
3025 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 		if (do_postprocess)
3027 			err = nce_add_v6_postprocess(nce);
3028 		nce_refrele(nce);
3029 		return (0);
3030 	}
3031 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3032 
3033 	if (do_postprocess)
3034 		err = nce_add_v6_postprocess(nce);
3035 	/*
3036 	 * err cannot be anything other than 0 because we don't support
3037 	 * proxy arp of static addresses.
3038 	 */
3039 	ASSERT(err == 0);
3040 
3041 	mutex_enter(&ncec->ncec_lock);
3042 	ncec->ncec_flags = new_flags;
3043 	mutex_exit(&ncec->ncec_lock);
3044 	/*
3045 	 * Note that we ignore the state at this point, which
3046 	 * should be either STALE or REACHABLE.  Instead we let
3047 	 * the link layer address passed in determine the state,
3048 	 * much as we do for incoming packets.
3049 	 */
3050 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3051 	nce_refrele(nce);
3052 	return (0);
3053 }
3054 
3055 /*
3056  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3057  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3058  * be held to ensure that they are in the same group.
3059  */
3060 static nce_t *
3061 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3062 {
3063 
3064 	nce_t *nce;
3065 
3066 	nce = nce_ill_lookup_then_add(ill, ncec);
3067 
3068 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3069 		return (nce);
3070 
3071 	/*
3072 	 * hold the ncec_lock to synchronize with nce_update() so that,
3073 	 * at the end of this function, the contents of nce_dlur_mp are
3074 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3075 	 * packet may have been sent out with a mangled address, which would
3076 	 * only be a transient condition.
3077 	 */
3078 	mutex_enter(&ncec->ncec_lock);
3079 	if (ncec->ncec_lladdr != NULL) {
3080 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3081 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3082 	} else {
3083 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084 		    ill->ill_sap_length);
3085 	}
3086 	mutex_exit(&ncec->ncec_lock);
3087 	return (nce);
3088 }
3089 
3090 /*
3091  * We make nce_fp_mp have an M_DATA prepend.
3092  * The caller ensures there is a hold on the ncec for this function.
3093  * Note that since ill_fastpath_probe() copies the mblk there is
3094  * no need to hold the nce or ncec beyond this function.
3095  *
3096  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098  * and will be returned back by this function, so that no extra nce_refrele
3099  * is required for the caller. The calls from nce_add_common() use this
3100  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101  * nce_refrele of the returned nce (when it is non-null).
3102  */
3103 nce_t *
3104 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 {
3106 	nce_t *nce;
3107 	ill_t *ill = ncec->ncec_ill;
3108 
3109 	ASSERT(ill != NULL);
3110 
3111 	if (IS_IPMP(ill) && trigger_fp_req) {
3112 		trigger_fp_req = B_FALSE;
3113 		ipmp_ncec_refresh_nce(ncec);
3114 	}
3115 
3116 	/*
3117 	 * If the caller already has the nce corresponding to the ill, use
3118 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3119 	 * nce_add_common() fall in the former category, and have just done
3120 	 * the nce lookup/add that can be reused.
3121 	 */
3122 	if (ncec_nce == NULL)
3123 		nce = nce_fastpath_create(ill, ncec);
3124 	else
3125 		nce = ncec_nce;
3126 
3127 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3128 		return (nce);
3129 
3130 	if (trigger_fp_req)
3131 		nce_fastpath_trigger(nce);
3132 	return (nce);
3133 }
3134 
3135 /*
3136  * Trigger fastpath on nce. No locks may be held.
3137  */
3138 static void
3139 nce_fastpath_trigger(nce_t *nce)
3140 {
3141 	int res;
3142 	ill_t *ill = nce->nce_ill;
3143 	ncec_t *ncec = nce->nce_common;
3144 
3145 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146 	/*
3147 	 * (e.g., an allocation failure), so leave the ncec in the list;
3148 	 * it will be updated when another probe happens for another ire.
3149 	 * If not, it will be taken out of the list when the ire is
3150 	 * deleted.
3151 	 * deleted.
3152 	 */
3153 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 		nce_fastpath_list_delete(ill, ncec, NULL);
3155 }
3156 
3157 /*
3158  * Add ncec to the nce fastpath list on ill.
3159  */
3160 static nce_t *
3161 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3162 {
3163 	nce_t *nce = NULL;
3164 
3165 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3166 	/*
3167 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3168 	 * down, before adding the NCE.
3169 	 */
3170 	if (ill->ill_state_flags & ILL_CONDEMNED)
3171 		return (NULL);
3172 	mutex_enter(&ncec->ncec_lock);
3173 	/*
3174 	 * If the ncec has not been deleted and
3175 	 * is not already in the list, add it.
3176 	 */
3177 	if (!NCE_ISCONDEMNED(ncec)) {
3178 		nce = nce_lookup(ill, &ncec->ncec_addr);
3179 		if (nce != NULL)
3180 			goto done;
3181 		nce = nce_add(ill, ncec);
3182 	}
3183 done:
3184 	mutex_exit(&ncec->ncec_lock);
3185 	return (nce);
3186 }
3187 
3188 nce_t *
3189 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 {
3191 	nce_t *nce;
3192 
3193 	mutex_enter(&ill->ill_lock);
3194 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3195 	mutex_exit(&ill->ill_lock);
3196 	return (nce);
3197 }
3198 
3199 
3200 /*
3201  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203  * entry after all locks have been dropped.
3204  */
3205 void
3206 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 {
3208 	nce_t *nce;
3209 
3210 	ASSERT(ill != NULL);
3211 
3212 	/* delete any nces referencing the ncec from underlying ills */
3213 	if (IS_IPMP(ill))
3214 		ipmp_ncec_delete_nce(ncec);
3215 
3216 	/* now the ill itself */
3217 	mutex_enter(&ill->ill_lock);
3218 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3219 	    nce = list_next(&ill->ill_nce, nce)) {
3220 		if (nce->nce_common == ncec) {
3221 			nce_refhold(nce);
3222 			nce_delete(nce);
3223 			break;
3224 		}
3225 	}
3226 	mutex_exit(&ill->ill_lock);
3227 	if (nce != NULL) {
3228 		if (dead == NULL)
3229 			nce_refrele(nce);
3230 		else
3231 			list_insert_tail(dead, nce);
3232 	}
3233 }
3234 
3235 /*
3236  * When the fastpath response does not fit in the datab
3237  * associated with the existing nce_fp_mp, we delete and
3238  * add the nce to retrigger fastpath based on the information
3239  * in the ncec_t.
3240  */
3241 static nce_t *
3242 nce_delete_then_add(nce_t *nce)
3243 {
3244 	ill_t		*ill = nce->nce_ill;
3245 	nce_t		*newnce = NULL;
3246 
3247 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248 	    (void *)nce, ill->ill_name));
3249 	mutex_enter(&ill->ill_lock);
3250 	mutex_enter(&nce->nce_common->ncec_lock);
3251 	nce_delete(nce);
3252 	/*
3253 	 * Make sure that ncec is not condemned before adding. We hold the
3254 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255 	 * ipmp_ncec_delete_nce()
3256 	 */
3257 	if (!NCE_ISCONDEMNED(nce->nce_common))
3258 		newnce = nce_add(ill, nce->nce_common);
3259 	mutex_exit(&nce->nce_common->ncec_lock);
3260 	mutex_exit(&ill->ill_lock);
3261 	nce_refrele(nce);
3262 	return (newnce); /* could be null if nomem */
3263 }
3264 
3265 typedef struct nce_fp_match_s {
3266 	nce_t	*nce_fp_match_res;
3267 	mblk_t	*nce_fp_match_ack_mp;
3268 } nce_fp_match_t;
3269 
3270 /* ARGSUSED */
3271 static int
3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 {
3274 	nce_fp_match_t	*nce_fp_marg = arg;
3275 	ncec_t		*ncec = nce->nce_common;
3276 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3277 	uchar_t	*mp_rptr, *ud_mp_rptr;
3278 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3279 	ptrdiff_t	cmplen;
3280 
3281 	/*
3282 	 * mp is the mp associated with the fastpath ack.
3283 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3284 	 * under consideration. If the contents match, then the
3285 	 * fastpath ack is used to update the nce.
3286 	 */
3287 	if (ud_mp == NULL)
3288 		return (0);
3289 	mp_rptr = mp->b_rptr;
3290 	cmplen = mp->b_wptr - mp_rptr;
3291 	ASSERT(cmplen >= 0);
3292 
3293 	ud_mp_rptr = ud_mp->b_rptr;
3294 	/*
3295 	 * The ncec is locked here to prevent any other threads from accessing
3296 	 * and changing nce_dlur_mp when the address becomes resolved to an
3297 	 * lla while we're in the middle of looking at and comparing the
3298 	 * hardware address (lla). It is also locked to prevent multiple
3299 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3300 	 * time.
3301 	 */
3302 	mutex_enter(&ncec->ncec_lock);
3303 	if (ud_mp->b_wptr - ud_mp_rptr == cmplen &&
3304 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3305 		nce_fp_marg->nce_fp_match_res = nce;
3306 		mutex_exit(&ncec->ncec_lock);
3307 		nce_refhold(nce);
3308 		return (1);
3309 	}
3310 	mutex_exit(&ncec->ncec_lock);
3311 	return (0);
3312 }
3313 
3314 /*
3315  * Update all NCE's that are not in fastpath mode and
3316  * have an nce_fp_mp that matches mp. mp->b_cont contains
3317  * the fastpath header.
3320  */
3321 void
3322 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3323 {
3324 	nce_fp_match_t nce_fp_marg;
3325 	nce_t *nce;
3326 	mblk_t *nce_fp_mp, *fp_mp;
3327 
3328 	nce_fp_marg.nce_fp_match_res = NULL;
3329 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3330 
3331 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3332 
3333 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3334 		return;
3335 
3336 	mutex_enter(&nce->nce_lock);
3337 	nce_fp_mp = nce->nce_fp_mp;
3338 
3339 	if (nce_fp_mp != NULL) {
3340 		fp_mp = mp->b_cont;
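		/*
		 * If the new fastpath header will not fit in the existing
		 * data block, recreate the nce so a fresh fp_mp is used.
		 */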
3341 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3342 		    nce_fp_mp->b_datap->db_lim) {
3343 			mutex_exit(&nce->nce_lock);
3344 			nce = nce_delete_then_add(nce);
3345 			if (nce == NULL) {
3346 				return;
3347 			}
3348 			mutex_enter(&nce->nce_lock);
3349 			nce_fp_mp = nce->nce_fp_mp;
3350 		}
3351 	}
3352 
3353 	/* Matched - install mp as the fastpath mp */
3354 	if (nce_fp_mp == NULL) {
3355 		fp_mp = dupb(mp->b_cont);
3356 		nce->nce_fp_mp = fp_mp;
3357 	} else {
3358 		fp_mp = mp->b_cont;
3359 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3360 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3361 		    + MBLKL(fp_mp);
3362 	}
3363 	mutex_exit(&nce->nce_lock);
3364 	nce_refrele(nce);
3365 }
3366 
3367 /*
3368  * Return a pointer to a given option in the packet.
3369  * Assumes that option part of the packet have already been validated.
3370  * Assumes that the option part of the packet has already been validated.
3371 nd_opt_hdr_t *
3372 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3373 {
3374 	while (optlen > 0) {
3375 		if (opt->nd_opt_type == opt_type)
3376 			return (opt);
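		/* Option lengths are in units of 8 octets. */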
3377 		optlen -= 8 * opt->nd_opt_len;
3378 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3379 	}
3380 	return (NULL);
3381 }
3382 
3383 /*
3384  * Verify all option lengths present are > 0, also check to see
3385  * Verify that all option lengths present are > 0; also check
3386  * that the option lengths and packet length are consistent.
3387 boolean_t
3388 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3389 {
3390 	ASSERT(opt != NULL);
3391 	while (optlen > 0) {
3392 		if (opt->nd_opt_len == 0)
3393 			return (B_FALSE);
3394 		optlen -= 8 * opt->nd_opt_len;
3395 		if (optlen < 0)
3396 			return (B_FALSE);
3397 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3398 	}
3399 	return (B_TRUE);
3400 }
3401 
3402 /*
3403  * ncec_walk function.
3404  * Free a fraction of the NCE cache entries.
3405  *
3406  * A possible optimization here would be to use ncec_last where possible, and
3407  * delete the least-frequently used entry, which would require more complex
3408  * computation as we walk through the ncec's (e.g., track ncec entries by
3409  * order of ncec_last and/or maintain state)
3410  */
3411 static void
3412 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3413 {
3414 	ip_stack_t	*ipst = ncec->ncec_ipst;
3415 	uint_t		fraction = *(uint_t *)arg;
3416 	uint_t		rand;
3417 
3418 	if ((ncec->ncec_flags &
3419 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3420 		return;
3421 	}
3422 
3423 	rand = (uint_t)ddi_get_lbolt() +
3424 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
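	/*
	 * Delete roughly 1/fraction of the entries: those whose
	 * pseudo-random value is an exact multiple of 'fraction'.
	 */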
3425 	if ((rand/fraction)*fraction == rand) {
3426 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3427 		ncec_delete(ncec);
3428 	}
3429 }
3430 
3431 /*
3432  * kmem_cache callback to free up memory.
3433  *
3434  * For now we just delete a fixed fraction.
3435  */
3436 static void
3437 ip_nce_reclaim_stack(ip_stack_t *ipst)
3438 {
3439 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3440 
3441 	IP_STAT(ipst, ip_nce_reclaim_calls);
3442 
3443 	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3444 
3445 	/*
3446 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3447 	 * Get them to update any stale references to drop any refholds they
3448 	 * have.
3449 	 */
3450 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3451 }
3452 
3453 /*
3454  * Called by the memory allocator subsystem directly, when the system
3455  * is running low on memory.
3456  */
3457 /* ARGSUSED */
3458 void
3459 ip_nce_reclaim(void *args)
3460 {
3461 	netstack_handle_t nh;
3462 	netstack_t *ns;
3463 	ip_stack_t *ipst;
3464 
3465 	netstack_next_init(&nh);
3466 	while ((ns = netstack_next(&nh)) != NULL) {
3467 		/*
3468 		 * netstack_next() can return a netstack_t with a NULL
3469 		 * netstack_ip at boot time.
3470 		 */
3471 		if ((ipst = ns->netstack_ip) == NULL) {
3472 			netstack_rele(ns);
3473 			continue;
3474 		}
3475 		ip_nce_reclaim_stack(ipst);
3476 		netstack_rele(ns);
3477 	}
3478 	netstack_next_fini(&nh);
3479 }
3480 
3481 #ifdef DEBUG
3482 void
3483 ncec_trace_ref(ncec_t *ncec)
3484 {
3485 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3486 
3487 	if (ncec->ncec_trace_disable)
3488 		return;
3489 
3490 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3491 		ncec->ncec_trace_disable = B_TRUE;
3492 		ncec_trace_cleanup(ncec);
3493 	}
3494 }
3495 
3496 void
3497 ncec_untrace_ref(ncec_t *ncec)
3498 {
3499 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3500 
3501 	if (!ncec->ncec_trace_disable)
3502 		th_trace_unref(ncec);
3503 }
3504 
3505 static void
3506 ncec_trace_cleanup(const ncec_t *ncec)
3507 {
3508 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3509 }
3510 #endif
3511 
3512 /*
3513  * Called when address resolution fails due to a timeout.
3514  * Send an ICMP unreachable in response to all queued packets.
3515  */
3516 void
3517 arp_resolv_failed(ncec_t *ncec)
3518 {
3519 	mblk_t	*mp, *nxt_mp;
3520 	char	buf[INET6_ADDRSTRLEN];
3521 	struct in_addr ipv4addr;
3522 	ill_t *ill = ncec->ncec_ill;
3523 	ip_stack_t *ipst = ncec->ncec_ipst;
3524 	ip_recv_attr_t	iras;
3525 
3526 	bzero(&iras, sizeof (iras));
3527 	iras.ira_flags = IRAF_IS_IPV4;
3528 	/*
3529 	 * We are setting the ira_rill to the ipmp_ill (instead of
3530 	 * the actual ill on which the packet was received), but this
3531 	 * is ok because we don't actually need the real ira_rill
3532 	 * to send the icmp unreachable to the sender.
3533 	 */
3534 	iras.ira_ill = iras.ira_rill = ill;
3535 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3536 	iras.ira_rifindex = iras.ira_ruifindex;
3537 
3538 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3539 	ip3dbg(("arp_resolv_failed: dst %s\n",
3540 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3541 	mutex_enter(&ncec->ncec_lock);
3542 	mp = ncec->ncec_qd_mp;
3543 	ncec->ncec_qd_mp = NULL;
3544 	ncec->ncec_nprobes = 0;
3545 	mutex_exit(&ncec->ncec_lock);
3546 	while (mp != NULL) {
3547 		nxt_mp = mp->b_next;
3548 		mp->b_next = NULL;
3549 
3550 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3551 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3552 		    mp, ill);
3553 		if (ipst->ips_ip_arp_icmp_error) {
3554 			ip3dbg(("arp_resolv_failed: "
3555 			    "Calling icmp_unreachable\n"));
3556 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3557 		} else {
3558 			freemsg(mp);
3559 		}
3560 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3561 		mp = nxt_mp;
3562 	}
3563 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3564 }
3565 
3566 /*
3567  * If ill is an under_ill, translate it to the ipmp_ill and add the
3568  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3569  * one on the underlying in_ill) will be created for the
3570  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571  */
3572 int
3573 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3574     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3575 {
3576 	int	err;
3577 	in6_addr_t addr6;
3578 	ip_stack_t *ipst = ill->ill_ipst;
3579 	nce_t	*nce, *upper_nce = NULL;
3580 	ill_t	*in_ill = ill, *under = NULL;
3581 	boolean_t need_ill_refrele = B_FALSE;
3582 
3583 	if (flags & NCE_F_MCAST) {
3584 		/*
3585 		 * hw_addr will be figured out in nce_set_multicast_v4;
3586 		 * caller needs to pass in the cast_ill for ipmp
3587 		 */
3588 		ASSERT(hw_addr == NULL);
3589 		ASSERT(!IS_IPMP(ill));
3590 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3591 		return (err);
3592 	}
3593 
3594 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3595 		ill = ipmp_ill_hold_ipmp_ill(ill);
3596 		if (ill == NULL)
3597 			return (ENXIO);
3598 		need_ill_refrele = B_TRUE;
3599 	}
3600 	if ((flags & NCE_F_BCAST) != 0) {
3601 		/*
3602 		 * IPv4 broadcast ncec: compute the hwaddr.
3603 		 */
3604 		if (IS_IPMP(ill)) {
3605 			under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3606 			if (under == NULL)  {
3607 				if (need_ill_refrele)
3608 					ill_refrele(ill);
3609 				return (ENETDOWN);
3610 			}
3611 			hw_addr = under->ill_bcast_mp->b_rptr +
3612 			    NCE_LL_ADDR_OFFSET(under);
3613 			hw_addr_len = under->ill_phys_addr_length;
3614 		} else {
3615 			hw_addr = ill->ill_bcast_mp->b_rptr +
3616 			    NCE_LL_ADDR_OFFSET(ill);
3617 			hw_addr_len = ill->ill_phys_addr_length;
3618 		}
3619 	}
3620 
3621 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3622 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3623 	nce = nce_lookup_addr(ill, &addr6);
3624 	if (nce == NULL) {
3625 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3626 		    state, &nce);
3627 	} else {
3628 		err = EEXIST;
3629 	}
3630 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3631 	if (err == 0)
3632 		err = nce_add_v4_postprocess(nce);
3633 
3634 	if (in_ill != ill && nce != NULL) {
3635 		nce_t *under_nce = NULL;
3636 
3637 		/*
3638 		 * in_ill was the under_ill. Try to create the under_nce.
3639 		 * Hold the ill_g_lock to prevent changes to group membership
3640 		 * until we are done.
3641 		 */
3642 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3643 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3644 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3645 			    ill_t *, ill);
3646 			rw_exit(&ipst->ips_ill_g_lock);
3647 			err = ENXIO;
3648 			nce_refrele(nce);
3649 			nce = NULL;
3650 			goto bail;
3651 		}
3652 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3653 		if (under_nce == NULL) {
3654 			rw_exit(&ipst->ips_ill_g_lock);
3655 			err = EINVAL;
3656 			nce_refrele(nce);
3657 			nce = NULL;
3658 			goto bail;
3659 		}
3660 		rw_exit(&ipst->ips_ill_g_lock);
3661 		upper_nce = nce;
3662 		nce = under_nce; /* will be returned to caller */
3663 		if (NCE_ISREACHABLE(nce->nce_common))
3664 			nce_fastpath_trigger(under_nce);
3665 	}
3666 	if (nce != NULL) {
3667 		if (newnce != NULL)
3668 			*newnce = nce;
3669 		else
3670 			nce_refrele(nce);
3671 	}
3672 bail:
3673 	if (under != NULL)
3674 		ill_refrele(under);
3675 	if (upper_nce != NULL)
3676 		nce_refrele(upper_nce);
3677 	if (need_ill_refrele)
3678 		ill_refrele(ill);
3679 
3680 	return (err);
3681 }
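/*
 * For example, when the caller passes an under-ill of an IPMP group (and the
 * address is not NCE_F_MYADDR), the ncec and its first nce_t are created on
 * the ipmp_ill, a second nce_t is created on the under-ill via
 * nce_fastpath_create(), and it is that under-ill nce_t which is handed back
 * to the caller through newnce.
 */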
3682 
3683 /*
3684  * NDP Cache Entry creation routine for IPv4.
3685  * This routine must always be called with ndp4->ndp_g_lock held.
3686  * Prior to return, ncec_refcnt is incremented.
3687  *
3688  * IPMP notes: ncec's for non-local (i.e., !NCE_MYADDR(ncec)) addresses
3689  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3690  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3691  * entries will be created, both pointing at the same ncec_t. The nce_t
3692  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3693  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3694  * Local addresses are always created on the ill passed to nce_add_v4.
3695  */
3696 int
3697 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3698     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3699 {
3700 	int		err;
3701 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3702 	struct in6_addr	addr6;
3703 	nce_t		*nce;
3704 
3705 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3706 	ASSERT(!ill->ill_isv6);
3707 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3708 
3709 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3710 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3711 	    &nce);
3712 	ASSERT(newnce != NULL);
3713 	*newnce = nce;
3714 	return (err);
3715 }
3716 
3717 /*
3718  * Post-processing routine to be executed after nce_add_v4(). This function
3719  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3720  * and must be called without any locks held.
3721  *
3722  * Always returns 0, but we return an int to keep this symmetric with the
3723  * IPv6 counterpart.
3724  */
3725 int
3726 nce_add_v4_postprocess(nce_t *nce)
3727 {
3728 	ncec_t		*ncec = nce->nce_common;
3729 	uint16_t	flags = ncec->ncec_flags;
3730 	boolean_t	ndp_need_dad = B_FALSE;
3731 	boolean_t	dropped;
3732 	clock_t		delay;
3733 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3734 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3735 	boolean_t	trigger_fastpath = B_TRUE;
3736 
3737 	/*
3738 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3739 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3740 	 * We also call nce_fastpath from nce_update if the link-layer address
3741 	 * of the peer changes.
3742 	 */
3743 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3744 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3745 		trigger_fastpath = B_FALSE;
3746 
3747 	if (trigger_fastpath)
3748 		nce_fastpath_trigger(nce);
3749 
3750 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3751 		/*
3752 		 * Either the caller (by passing in ND_PROBE)
3753 		 * or nce_add_common() (by the internally computed state
3754 		 * based on ncec_addr and ill_net_type) has determined
3755 		 * that this unicast entry needs DAD. Trigger DAD.
3756 		 */
3757 		ndp_need_dad = B_TRUE;
3758 	} else if (flags & NCE_F_UNSOL_ADV) {
3759 		/*
3760 		 * We account for the transmit below by assigning one
3761 		 * less than the ndd variable. Subsequent decrements
3762 		 * are done in nce_timer.
3763 		 */
3764 		mutex_enter(&ncec->ncec_lock);
3765 		ncec->ncec_unsolicit_count =
3766 		    ipst->ips_ip_arp_publish_count - 1;
3767 		mutex_exit(&ncec->ncec_lock);
3768 		dropped = arp_announce(ncec);
3769 		mutex_enter(&ncec->ncec_lock);
3770 		if (dropped)
3771 			ncec->ncec_unsolicit_count++;
3772 		else
3773 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3774 		if (ncec->ncec_unsolicit_count != 0) {
3775 			nce_start_timer(ncec,
3776 			    ipst->ips_ip_arp_publish_interval);
3777 		}
3778 		mutex_exit(&ncec->ncec_lock);
3779 	}
3780 
3781 	/*
3782 	 * If the probe delay is 0, the user has configured us to send the first
3783 	 * probe right away.  Do so, and set up for the subsequent probes.
3784 	 */
3785 	if (ndp_need_dad) {
3786 		mutex_enter(&ncec->ncec_lock);
3787 		if (ncec->ncec_pcnt == 0) {
3788 			/*
3789 			 * DAD probes and announce can be
3790 			 * DAD probes and announcements can be
3791 			 * probe_count to zero. Restart the timer in
3792 			 * this case to mark the ipif as ready.
3793 			 */
3794 			ncec->ncec_unsolicit_count = 0;
3795 			mutex_exit(&ncec->ncec_lock);
3796 			nce_restart_timer(ncec, 0);
3797 		} else {
3798 			mutex_exit(&ncec->ncec_lock);
3799 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3800 			    ipst->ips_arp_fastprobe_delay :
3801 			    ipst->ips_arp_probe_delay);
3802 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3803 		}
3804 	}
3805 	return (0);
3806 }
3807 
3808 /*
3809  * ncec_walk routine to update all entries that have a given destination or
3810  * gateway address and cached link layer (MAC) address.  This is used when ARP
3811  * informs us that a network-to-link-layer mapping may have changed.
3812  */
3813 void
3814 nce_update_hw_changed(ncec_t *ncec, void *arg)
3815 {
3816 	nce_hw_map_t *hwm = arg;
3817 	ipaddr_t ncec_addr;
3818 
3819 	if (ncec->ncec_state != ND_REACHABLE)
3820 		return;
3821 
3822 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3823 	if (ncec_addr != hwm->hwm_addr)
3824 		return;
3825 
3826 	mutex_enter(&ncec->ncec_lock);
3827 	if (hwm->hwm_flags != 0)
3828 		ncec->ncec_flags = hwm->hwm_flags;
3829 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3830 	mutex_exit(&ncec->ncec_lock);
3831 }
3832 
3833 void
3834 ncec_refhold(ncec_t *ncec)
3835 {
3836 	mutex_enter(&(ncec)->ncec_lock);
3837 	(ncec)->ncec_refcnt++;
3838 	ASSERT((ncec)->ncec_refcnt != 0);
3839 #ifdef DEBUG
3840 	ncec_trace_ref(ncec);
3841 #endif
3842 	mutex_exit(&(ncec)->ncec_lock);
3843 }
3844 
3845 void
3846 ncec_refhold_notr(ncec_t *ncec)
3847 {
3848 	mutex_enter(&(ncec)->ncec_lock);
3849 	(ncec)->ncec_refcnt++;
3850 	ASSERT((ncec)->ncec_refcnt != 0);
3851 	mutex_exit(&(ncec)->ncec_lock);
3852 }
3853 
3854 static void
3855 ncec_refhold_locked(ncec_t *ncec)
3856 {
3857 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3858 	(ncec)->ncec_refcnt++;
3859 #ifdef DEBUG
3860 	ncec_trace_ref(ncec);
3861 #endif
3862 }
3863 
3864 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3865 void
3866 ncec_refrele(ncec_t *ncec)
3867 {
3868 	mutex_enter(&(ncec)->ncec_lock);
3869 #ifdef DEBUG
3870 	ncec_untrace_ref(ncec);
3871 #endif
3872 	ASSERT((ncec)->ncec_refcnt != 0);
3873 	if (--(ncec)->ncec_refcnt == 0) {
3874 		ncec_inactive(ncec);
3875 	} else {
3876 		mutex_exit(&(ncec)->ncec_lock);
3877 	}
3878 }
3879 
3880 void
3881 ncec_refrele_notr(ncec_t *ncec)
3882 {
3883 	mutex_enter(&(ncec)->ncec_lock);
3884 	ASSERT((ncec)->ncec_refcnt != 0);
3885 	if (--(ncec)->ncec_refcnt == 0) {
3886 		ncec_inactive(ncec);
3887 	} else {
3888 		mutex_exit(&(ncec)->ncec_lock);
3889 	}
3890 }
3891 
3892 /*
3893  * Common to IPv4 and IPv6.
3894  */
3895 void
3896 nce_restart_timer(ncec_t *ncec, uint_t ms)
3897 {
3898 	timeout_id_t tid;
3899 
3900 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3901 
3902 	/* First cancel any running timer */
3903 	mutex_enter(&ncec->ncec_lock);
3904 	tid = ncec->ncec_timeout_id;
3905 	ncec->ncec_timeout_id = 0;
3906 	if (tid != 0) {
3907 		mutex_exit(&ncec->ncec_lock);
3908 		(void) untimeout(tid);
3909 		mutex_enter(&ncec->ncec_lock);
3910 	}
3911 
3912 	/* Restart timer */
3913 	nce_start_timer(ncec, ms);
3914 	mutex_exit(&ncec->ncec_lock);
3915 }
3916 
3917 static void
3918 nce_start_timer(ncec_t *ncec, uint_t ms)
3919 {
3920 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3921 	/*
3922 	 * Don't start the timer if the ncec has been deleted, or if the timer
3923 	 * is already running
3924 	 */
3925 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3926 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3927 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3928 	}
3929 }
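/*
 * MSEC_TO_TICK() can round a short interval down to zero ticks; the clamp
 * above ensures that timeout() is always armed with a delay of at least one
 * tick rather than zero.
 */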
3930 
3931 int
3932 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3933     uint16_t flags, nce_t **newnce)
3934 {
3935 	uchar_t		*hw_addr;
3936 	int		err = 0;
3937 	ip_stack_t	*ipst = ill->ill_ipst;
3938 	in6_addr_t	dst6;
3939 	nce_t		*nce;
3940 
3941 	ASSERT(!ill->ill_isv6);
3942 
3943 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3944 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3945 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3946 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3947 		goto done;
3948 	}
3949 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3950 		/*
3951 		 * For IRE_IF_RESOLVER a hardware mapping can be
3952 		 * generated; for IRE_IF_NORESOLVER, the resolution cookie
3953 		 * in the ill is copied in nce_add_v4().
3954 		 */
3955 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956 		if (hw_addr == NULL) {
3957 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958 			return (ENOMEM);
3959 		}
3960 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961 	} else {
3962 		/*
3963 		 * IRE_IF_NORESOLVER type simply copies the resolution
3964 		 * cookie passed in.  So no hw_addr is needed.
3965 		 */
3966 		hw_addr = NULL;
3967 	}
3968 	ASSERT(flags & NCE_F_MCAST);
3969 	ASSERT(flags & NCE_F_NONUD);
3970 	/* nce_state will be computed by nce_add_common() */
3971 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972 	    ND_UNCHANGED, &nce);
3973 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974 	if (err == 0)
3975 		err = nce_add_v4_postprocess(nce);
3976 	if (hw_addr != NULL)
3977 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3978 	if (err != 0) {
3979 		ip1dbg(("nce_set_multicast_v4: create failed %d\n", err));
3980 		return (err);
3981 	}
3982 done:
3983 	if (newnce != NULL)
3984 		*newnce = nce;
3985 	else
3986 		nce_refrele(nce);
3987 	return (0);
3988 }
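/*
 * For IRE_IF_RESOLVER links the multicast hardware address is derived
 * algorithmically by ip_mcast_mapping().  On Ethernet, for example, the
 * low-order 23 bits of the group address are folded into the 01:00:5e
 * multicast MAC prefix, so 224.1.2.3 maps to 01:00:5e:01:02:03.
 */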
3989 
3990 /*
3991  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3992  * don't want to have to walk the list for every single one, so we gather up
3993  * batches at a time.
3994  */
3995 #define	NCE_RESCHED_LIST_LEN	8
3996 
3997 typedef struct {
3998 	ill_t	*ncert_ill;
3999 	uint_t	ncert_num;
4000 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
4001 } nce_resched_t;
4002 
4003 /*
4004  * Pick the longest waiting NCEs for defense.
4005  */
4006 /* ARGSUSED */
4007 static int
4008 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4009 {
4010 	nce_resched_t *ncert = arg;
4011 	ncec_t **ncecs;
4012 	ncec_t **ncec_max;
4013 	ncec_t *ncec_temp;
4014 	ncec_t *ncec = nce->nce_common;
4015 
4016 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4017 	/*
4018 	 * Only reachable entries that are ready for announcement are eligible.
4019 	 */
4020 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4021 		return (0);
4022 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4023 		ncec_refhold(ncec);
4024 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4025 	} else {
4026 		ncecs = ncert->ncert_nces;
4027 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4028 		ncec_refhold(ncec);
4029 		for (; ncecs < ncec_max; ncecs++) {
4030 			ASSERT(ncec != NULL);
4031 			if ((*ncecs)->ncec_last_time_defended >
4032 			    ncec->ncec_last_time_defended) {
4033 				ncec_temp = *ncecs;
4034 				*ncecs = ncec;
4035 				ncec = ncec_temp;
4036 			}
4037 		}
4038 		ncec_refrele(ncec);
4039 	}
4040 	return (0);
4041 }
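/*
 * In effect, ncec_reschedule() maintains a bounded selection of the
 * NCE_RESCHED_LIST_LEN entries with the oldest ncec_last_time_defended: once
 * the list is full, each new candidate is swapped against any held entry
 * that was defended more recently, and whichever entry ends up displaced is
 * released.
 */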
4042 
4043 /*
4044  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4045  * doesn't happen very often (if at all), and thus it needn't be highly
4046  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4047  * outer loop is bounded by a constant rather than by the length of the list.)
4048  */
4049 static void
4050 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4051 {
4052 	ncec_t		*ncec;
4053 	ip_stack_t	*ipst = ill->ill_ipst;
4054 	uint_t		i, defend_rate;
4055 
4056 	i = ill->ill_defend_count;
4057 	ill->ill_defend_count = 0;
4058 	if (ill->ill_isv6)
4059 		defend_rate = ipst->ips_ndp_defend_rate;
4060 	else
4061 		defend_rate = ipst->ips_arp_defend_rate;
4062 	/* If none could be sitting around, then don't reschedule */
4063 	if (i < defend_rate) {
4064 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4065 		return;
4066 	}
4067 	ncert->ncert_ill = ill;
4068 	while (ill->ill_defend_count < defend_rate) {
4069 		nce_walk_common(ill, ncec_reschedule, ncert);
4070 		for (i = 0; i < ncert->ncert_num; i++) {
4071 
4072 			ncec = ncert->ncert_nces[i];
4073 			mutex_enter(&ncec->ncec_lock);
4074 			ncec->ncec_flags |= NCE_F_DELAYED;
4075 			mutex_exit(&ncec->ncec_lock);
4076 			/*
4077 			 * we plan to schedule this ncec, so incr the
4078 			 * defend_count in anticipation.
4079 			 */
4080 			if (++ill->ill_defend_count >= defend_rate)
4081 				break;
4082 		}
4083 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4084 			break;
4085 	}
4086 }
4087 
4088 /*
4089  * Check if the current rate-limiting parameters permit the sending
4090  * of another address defense announcement for both IPv4 and IPv6.
4091  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4092  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4093  * determines how many address defense announcements are permitted
4094  * in any `defend_period' interval.
4095  */
4096 static boolean_t
4097 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4098 {
4099 	clock_t		now = ddi_get_lbolt();
4100 	ip_stack_t	*ipst = ill->ill_ipst;
4101 	clock_t		start = ill->ill_defend_start;
4102 	uint32_t	elapsed, defend_period, defend_rate;
4103 	nce_resched_t	ncert;
4104 	boolean_t	ret;
4105 	int		i;
4106 
4107 	if (ill->ill_isv6) {
4108 		defend_period = ipst->ips_ndp_defend_period;
4109 		defend_rate = ipst->ips_ndp_defend_rate;
4110 	} else {
4111 		defend_period = ipst->ips_arp_defend_period;
4112 		defend_rate = ipst->ips_arp_defend_rate;
4113 	}
4114 	if (defend_rate == 0)
4115 		return (B_TRUE);
4116 	bzero(&ncert, sizeof (ncert));
4117 	mutex_enter(&ill->ill_lock);
4118 	if (start > 0) {
4119 		elapsed = now - start;
4120 		if (elapsed > SEC_TO_TICK(defend_period)) {
4121 			ill->ill_defend_start = now;
4122 			/*
4123 			 * nce_ill_reschedule will attempt to
4124 			 * prevent starvation by rescheduling the
4125 			 * oldest entries, which are marked with
4126 			 * the NCE_F_DELAYED flag.
4127 			 */
4128 			nce_ill_reschedule(ill, &ncert);
4129 		}
4130 	} else {
4131 		ill->ill_defend_start = now;
4132 	}
4133 	ASSERT(ill->ill_defend_count <= defend_rate);
4134 	mutex_enter(&ncec->ncec_lock);
4135 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4136 		/*
4137 		 * This ncec was rescheduled as one of the really old
4138 		 * entries needing on-going defense. The
4139 		 * ill_defend_count was already incremented in
4140 		 * nce_ill_reschedule. Go ahead and send the announce.
4141 		 */
4142 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4143 		mutex_exit(&ncec->ncec_lock);
4144 		ret = B_FALSE;
4145 		goto done;
4146 	}
4147 	mutex_exit(&ncec->ncec_lock);
4148 	if (ill->ill_defend_count < defend_rate)
4149 		ill->ill_defend_count++;
4150 	if (ill->ill_defend_count == defend_rate) {
4151 		/*
4152 		 * we are no longer allowed to send unbidden defense
4153 		 * messages. Wait for rescheduling.
4154 		 */
4155 		ret = B_TRUE;
4156 	} else {
4157 		ret = B_FALSE;
4158 	}
4159 done:
4160 	mutex_exit(&ill->ill_lock);
4161 	/*
4162 	 * After all the locks have been dropped we can restart the nce timer,
4163 	 * and refrele the delayed ncec's.
4164 	 */
4165 	for (i = 0; i < ncert.ncert_num; i++) {
4166 		clock_t	xmit_interval;
4167 		ncec_t	*tmp;
4168 
4169 		tmp = ncert.ncert_nces[i];
4170 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4171 		    B_FALSE);
4172 		nce_restart_timer(tmp, xmit_interval);
4173 		ncec_refrele(tmp);
4174 	}
4175 	return (ret);
4176 }
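/*
 * To make the rate limiting concrete (the numbers below are hypothetical,
 * not the tunable defaults): with a defend_rate of 10 and a defend_period of
 * 300 seconds, unbidden announcements are permitted until ill_defend_count
 * climbs to 10, after which callers see B_TRUE (rate limited) until the 300
 * second window expires; entries flagged NCE_F_DELAYED by
 * nce_ill_reschedule() are exempt because the count was already charged for
 * them, and a defend_rate of 0 suppresses unbidden defense entirely.
 */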
4177 
4178 boolean_t
4179 ndp_announce(ncec_t *ncec)
4180 {
4181 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4182 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4183 	    nce_advert_flags(ncec)));
4184 }
4185 
4186 ill_t *
4187 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4188 {
4189 	mblk_t		*mp;
4190 	in6_addr_t	src6;
4191 	ipaddr_t	src4;
4192 	ill_t		*ill = ncec->ncec_ill;
4193 	ill_t		*src_ill = NULL;
4194 	ipif_t		*ipif = NULL;
4195 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4196 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4197 
4198 	ASSERT(src != NULL);
4199 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4200 	src6 = *src;
4201 	if (is_myaddr) {
4202 		src6 = ncec->ncec_addr;
4203 		if (!isv6)
4204 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4205 	} else {
4206 		/*
4207 		 * Try to find a source address from the queued outgoing packet.
4208 		 */
4209 		mutex_enter(&ncec->ncec_lock);
4210 		mp = ncec->ncec_qd_mp;
4211 		if (mp != NULL) {
4212 			if (isv6) {
4213 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4214 
4215 				src6 = ip6h->ip6_src;
4216 			} else {
4217 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4218 
4219 				src4 = ipha->ipha_src;
4220 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4221 			}
4222 		}
4223 		mutex_exit(&ncec->ncec_lock);
4224 	}
4225 
4226 	/*
4227 	 * For outgoing packets, if the src of outgoing packet is one
4228 	 * of the assigned interface addresses use it, otherwise we
4229 	 * will pick the source address below.
4230 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4231 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4232 	 * (non-IPMP) ncec_ill for these message types. The only case
4233 	 * of unicast DAD messages is IPv6 ND probes, for which
4234 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4235 	 */
4236 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4237 		if (isv6) {
4238 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4239 			    ill->ill_ipst);
4240 		} else {
4241 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4242 			    ill->ill_ipst);
4243 		}
4244 
4245 		/*
4246 		 * If no relevant ipif can be found, then it's not one of our
4247 		 * addresses.  Reset to :: and try to find a src for the NS or
4248 		 * ARP request using ipif_select_source_v[4,6]  below.
4249 		 * If an ipif can be found, but it's not yet done with
4250 		 * DAD verification, and we are not being invoked for
4251 		 * DAD (i.e., !is_myaddr), then just postpone this
4252 		 * transmission until later.
4253 		 */
4254 		if (ipif == NULL) {
4255 			src6 = ipv6_all_zeros;
4256 			src4 = INADDR_ANY;
4257 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4258 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4259 			    ncec_t *, ncec, ipif_t *, ipif);
4260 			ipif_refrele(ipif);
4261 			return (NULL);
4262 		}
4263 	}
4264 
4265 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4266 		/*
4267 		 * Pick a source address for this solicitation, but
4268 		 * restrict the selection to addresses assigned to the
4269 		 * output interface.  We do this because the destination will
4270 		 * create a neighbor cache entry for the source address of
4271 		 * this packet, so the source address had better be a valid
4272 		 * neighbor.
4273 		 */
4274 		if (isv6) {
4275 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4276 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4277 			    B_FALSE, NULL);
4278 		} else {
4279 			ipaddr_t nce_addr;
4280 
4281 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4282 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4283 			    B_FALSE, NULL);
4284 		}
4285 		if (ipif == NULL && IS_IPMP(ill)) {
4286 			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4287 
4288 			if (send_ill != NULL) {
4289 				if (isv6) {
4290 					ipif = ipif_select_source_v6(send_ill,
4291 					    &ncec->ncec_addr, B_TRUE,
4292 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4293 					    B_FALSE, NULL);
4294 				} else {
4295 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4296 					    src4);
4297 					ipif = ipif_select_source_v4(send_ill,
4298 					    src4, ALL_ZONES, B_TRUE, NULL);
4299 				}
4300 				ill_refrele(send_ill);
4301 			}
4302 		}
4303 
4304 		if (ipif == NULL) {
4305 			char buf[INET6_ADDRSTRLEN];
4306 
4307 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4308 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4309 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4310 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4311 			return (NULL);
4312 		}
4313 		src6 = ipif->ipif_v6lcl_addr;
4314 	}
4315 	*src = src6;
4316 	if (ipif != NULL) {
4317 		src_ill = ipif->ipif_ill;
4318 		if (IS_IPMP(src_ill))
4319 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4320 		else
4321 			ill_refhold(src_ill);
4322 		ipif_refrele(ipif);
4323 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4324 		    ill_t *, src_ill);
4325 	}
4326 	return (src_ill);
4327 }
4328 
4329 void
4330 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4331     uchar_t *hwaddr, int hwaddr_len, int flags)
4332 {
4333 	ill_t	*ill;
4334 	ncec_t	*ncec;
4335 	nce_t	*nce;
4336 	uint16_t new_state;
4337 
4338 	ill = (ipif ? ipif->ipif_ill : NULL);
4339 	if (ill != NULL) {
4340 		/*
4341 		 * only one ncec is possible
4342 		 */
4343 		nce = nce_lookup_v4(ill, addr);
4344 		if (nce != NULL) {
4345 			ncec = nce->nce_common;
4346 			mutex_enter(&ncec->ncec_lock);
4347 			if (NCE_ISREACHABLE(ncec))
4348 				new_state = ND_UNCHANGED;
4349 			else
4350 				new_state = ND_STALE;
4351 			ncec->ncec_flags = flags;
4352 			nce_update(ncec, new_state, hwaddr);
4353 			mutex_exit(&ncec->ncec_lock);
4354 			nce_refrele(nce);
4355 			return;
4356 		}
4357 	} else {
4358 		/*
4359 		 * The ill is a wildcard; update all ncec's that
4360 		 * match on addr.
4361 		 */
4362 		nce_hw_map_t hwm;
4363 
4364 		hwm.hwm_addr = *addr;
4365 		hwm.hwm_hwlen = hwaddr_len;
4366 		hwm.hwm_hwaddr = hwaddr;
4367 		hwm.hwm_flags = flags;
4368 
4369 		ncec_walk_common(ipst->ips_ndp4, NULL,
4370 		    nce_update_hw_changed, &hwm, B_TRUE);
4371 	}
4372 }
4373 
4374 /*
4375  * Common function to add ncec entries.
4376  * we always add the ncec with ncec_ill == ill, and always create
4377  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4378  * ncec is !reachable.
4379  *
4380  * When the caller passes in an nce_state of ND_UNCHANGED,
4381  * nce_add_common() will determine the state of the created nce based
4382  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383  * be created with state set to the passed in nce_state.
4384  */
4385 static int
4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 {
4389 	static	ncec_t		nce_nil;
4390 	uchar_t			*template = NULL;
4391 	int			err;
4392 	ncec_t			*ncec;
4393 	ncec_t			**ncep;
4394 	ip_stack_t		*ipst = ill->ill_ipst;
4395 	uint16_t		state;
4396 	boolean_t		fastprobe = B_FALSE;
4397 	struct ndp_g_s		*ndp;
4398 	nce_t			*nce = NULL;
4399 	mblk_t			*dlur_mp = NULL;
4400 
4401 	if (ill->ill_isv6)
4402 		ndp = ill->ill_ipst->ips_ndp6;
4403 	else
4404 		ndp = ill->ill_ipst->ips_ndp4;
4405 
4406 	*retnce = NULL;
4407 
4408 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409 
4410 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411 		ip0dbg(("nce_add_common: no addr\n"));
4412 		return (EINVAL);
4413 	}
4414 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416 		return (EINVAL);
4417 	}
4418 
4419 	if (ill->ill_isv6) {
4420 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4421 	} else {
4422 		ipaddr_t v4addr;
4423 
4424 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4425 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4426 	}
4427 
4428 	/*
4429 	 * The caller has ensured that there is no nce on ill, but there could
4430 	 * still be an nce_common_t for the address, so we find existing
4431 	 * ncec_t structures first, and atomically add a new nce_t if
4432 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4433 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4434 	 * compare for matches across the illgrp because this function is
4435 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4436 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4437 	 * appropriate.
4438 	 */
4439 	ncec = *ncep;
4440 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4441 		if (ncec->ncec_ill == ill) {
4442 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4443 				/*
4444 				 * We should never find *retnce to be
4445 				 * MYADDR, since the caller may then
4446 				 * incorrectly restart a DAD timer that's
4447 				 * already running.  However, if we are in
4448 				 * forwarding mode, and the interface is
4449 				 * moving in/out of groups, the data
4450 				 * path ire lookup (e.g., ire_revalidate_nce)
4451 				 * may have determined that some destination
4452 				 * is offlink while the control path is adding
4453 				 * that address as a local address.
4454 				 * Recover from this case by failing the
4455 				 * lookup.
4456 				 */
4457 				if (NCE_MYADDR(ncec))
4458 					return (ENXIO);
4459 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4460 				if (*retnce != NULL)
4461 					break;
4462 			}
4463 		}
4464 	}
4465 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4466 		return (0);
4467 
4468 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4469 	if (ncec == NULL)
4470 		return (ENOMEM);
4471 	*ncec = nce_nil;
4472 	ncec->ncec_ill = ill;
4473 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4474 	ncec->ncec_flags = flags;
4475 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4476 
4477 	if (!ill->ill_isv6) {
4478 		ipaddr_t addr4;
4479 
4480 		/*
4481 		 * DAD probe interval and probe count are set based on
4482 		 * fast/slow probe settings. If the underlying link doesn't
4483 		 * have reliably up/down notifications or if we're working
4484 		 * have reliable up/down notifications or if we're working
4485 		 * don't use the fast timers.  Otherwise, use them.
4486 		 */
4487 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4488 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4489 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4490 			fastprobe = B_TRUE;
4491 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4492 		    !IS_IPV4_LL_SPACE(&addr4)) {
4493 			ill_t *hwaddr_ill;
4494 
4495 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4496 			    hw_addr_len);
4497 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4498 				fastprobe = B_TRUE;
4499 		}
4500 		if (fastprobe) {
4501 			ncec->ncec_xmit_interval =
4502 			    ipst->ips_arp_fastprobe_interval;
4503 			ncec->ncec_pcnt =
4504 			    ipst->ips_arp_fastprobe_count;
4505 			ncec->ncec_flags |= NCE_F_FAST;
4506 		} else {
4507 			ncec->ncec_xmit_interval =
4508 			    ipst->ips_arp_probe_interval;
4509 			ncec->ncec_pcnt =
4510 			    ipst->ips_arp_probe_count;
4511 		}
4512 		if (NCE_PUBLISH(ncec)) {
4513 			ncec->ncec_unsolicit_count =
4514 			    ipst->ips_ip_arp_publish_count;
4515 		}
4516 	} else {
4517 		/*
4518 		 * probe interval is constant: ILL_PROBE_INTERVAL
4519 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4520 		 */
4521 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4522 		if (NCE_PUBLISH(ncec)) {
4523 			ncec->ncec_unsolicit_count =
4524 			    ipst->ips_ip_ndp_unsolicit_count;
4525 		}
4526 	}
4527 	ncec->ncec_rcnt = ill->ill_xmit_count;
4528 	ncec->ncec_addr = *addr;
4529 	ncec->ncec_qd_mp = NULL;
4530 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4531 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4532 	ncec->ncec_trace_disable = B_FALSE;
4533 
4534 	/*
4535 	 * ncec_lladdr holds link layer address
4536 	 */
4537 	if (hw_addr_len > 0) {
4538 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4539 		if (template == NULL) {
4540 			err = ENOMEM;
4541 			goto err_ret;
4542 		}
4543 		ncec->ncec_lladdr = template;
4544 		ncec->ncec_lladdr_length = hw_addr_len;
4545 		bzero(ncec->ncec_lladdr, hw_addr_len);
4546 	}
4547 	if ((flags & NCE_F_BCAST) != 0) {
4548 		state = ND_REACHABLE;
4549 		ASSERT(hw_addr_len > 0);
4550 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4551 		state = ND_INITIAL;
4552 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4553 		/*
4554 		 * NORESOLVER entries are always created in the REACHABLE
4555 		 * state.
4556 		 */
4557 		state = ND_REACHABLE;
4558 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4559 		    ill->ill_mactype != DL_IPV4 &&
4560 		    ill->ill_mactype != DL_6TO4) {
4561 			/*
4562 			 * We create a nce_res_mp with the IP nexthop address
4563 			 * as the destination address if the physical length
4564 			 * is exactly 4 bytes for point-to-multipoint links
4565 			 * that do their own resolution from IP to link-layer
4566 			 * address (e.g. IP over X.25).
4567 			 */
4568 			bcopy((uchar_t *)addr,
4569 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4570 		}
4571 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4572 		    ill->ill_mactype != DL_IPV6) {
4573 			/*
4574 			 * We create a nce_res_mp with the IP nexthop address
4575 			 * as the destination address if the physical length
4576 			 * is exactly 16 bytes for point-to-multipoint links
4577 			 * that do their own resolution from IP to link-layer
4578 			 * address.
4579 			 */
4580 			bcopy((uchar_t *)addr,
4581 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4582 		}
4583 		/*
4584 		 * Since NUD is not part of the base IPv4 protocol definition,
4585 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4586 		 * age, and are marked NCE_F_NONUD.
4587 		 */
4588 		if (!ill->ill_isv6)
4589 			ncec->ncec_flags |= NCE_F_NONUD;
4590 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4591 		state = ND_REACHABLE;
4592 	}
4593 
4594 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4595 		/*
4596 		 * We are adding an ncec with a deterministic hw_addr,
4597 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4598 		 *
4599 		 * if we are adding a unicast ncec for the local address
4600 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4601 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4602 		 * addresses are added in PROBE to trigger DAD.
4603 		 */
4604 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4605 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4606 			state = ND_REACHABLE;
4607 		else if (!NCE_PUBLISH(ncec))
4608 			state = ND_STALE;
4609 		else
4610 			state = ND_PROBE;
4611 		if (hw_addr != NULL)
4612 			nce_set_ll(ncec, hw_addr);
4613 	}
4614 	/* caller overrides internally computed state */
4615 	if (nce_state != ND_UNCHANGED)
4616 		state = nce_state;
4617 
4618 	if (state == ND_PROBE)
4619 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4620 
4621 	ncec->ncec_state = state;
4622 
4623 	if (state == ND_REACHABLE) {
4624 		ncec->ncec_last = ncec->ncec_init_time =
4625 		    TICK_TO_MSEC(ddi_get_lbolt64());
4626 	} else {
4627 		ncec->ncec_last = 0;
4628 		if (state == ND_INITIAL)
4629 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4630 	}
4631 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4632 	    offsetof(ncec_cb_t, ncec_cb_node));
4633 	/*
4634 	 * have all the memory allocations out of the way before taking locks
4635 	 * and adding the nce.
4636 	 */
4637 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4638 	if (nce == NULL) {
4639 		err = ENOMEM;
4640 		goto err_ret;
4641 	}
4642 	if (ncec->ncec_lladdr != NULL ||
4643 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4644 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4645 		    ill->ill_phys_addr_length, ill->ill_sap,
4646 		    ill->ill_sap_length);
4647 		if (dlur_mp == NULL) {
4648 			err = ENOMEM;
4649 			goto err_ret;
4650 		}
4651 	}
4652 
4653 	/*
4654 	 * Atomically ensure that the ill is not CONDEMNED, before
4655 	 * adding the NCE.
4656 	 */
4657 	mutex_enter(&ill->ill_lock);
4658 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4659 		mutex_exit(&ill->ill_lock);
4660 		err = EINVAL;
4661 		goto err_ret;
4662 	}
4663 	if (!NCE_MYADDR(ncec) &&
4664 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4665 		mutex_exit(&ill->ill_lock);
4666 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4667 		err = EINVAL;
4668 		goto err_ret;
4669 	}
4670 	/*
4671 	 * Acquire the ncec_lock even before adding the ncec to the list
4672 	 * so that it cannot get deleted after the ncec is added, but
4673 	 * before we add the nce.
4674 	 */
4675 	mutex_enter(&ncec->ncec_lock);
4676 	if ((ncec->ncec_next = *ncep) != NULL)
4677 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678 	*ncep = ncec;
4679 	ncec->ncec_ptpn = ncep;
4680 
4681 	/* Bump up the number of ncec's referencing this ill */
4682 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683 	    (char *), "ncec", (void *), ncec);
4684 	ill->ill_ncec_cnt++;
4685 	/*
4686 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4687 	 * condemned, and we can safely add the nce.
4688 	 */
4689 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4690 	mutex_exit(&ncec->ncec_lock);
4691 	mutex_exit(&ill->ill_lock);
4692 
4693 	/* caller must trigger fastpath on *retnce */
4694 	return (0);
4695 
4696 err_ret:
4697 	if (ncec != NULL)
4698 		kmem_cache_free(ncec_cache, ncec);
4699 	if (nce != NULL)
4700 		kmem_cache_free(nce_cache, nce);
4701 	freemsg(dlur_mp);
4702 	if (template != NULL)
4703 		kmem_free(template, ill->ill_phys_addr_length);
4704 	return (err);
4705 }
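/*
 * Summary of the state selection above (unless the caller overrides it with
 * a non-ND_UNCHANGED nce_state): broadcast entries, IRE_IF_NORESOLVER and
 * IRE_LOOPBACK interfaces yield ND_REACHABLE, and resolver interfaces start
 * in ND_INITIAL.  When a hardware address is known, the entry instead lands
 * in ND_REACHABLE (multicast/broadcast or noresolver), ND_STALE (learned
 * from a peer) or ND_PROBE (one of our own published addresses); ND_PROBE
 * additionally sets NCE_F_UNVERIFIED until DAD completes.
 */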
4706 
4707 /*
4708  * take a ref on the nce
4709  */
4710 void
4711 nce_refhold(nce_t *nce)
4712 {
4713 	mutex_enter(&nce->nce_lock);
4714 	nce->nce_refcnt++;
4715 	ASSERT((nce)->nce_refcnt != 0);
4716 	mutex_exit(&nce->nce_lock);
4717 }
4718 
4719 /*
4720  * release a ref on the nce; In general, this
4721  * cannot be called with locks held because the final refrele
4722  * may result in nce_inactive, which will take the ill_lock,
4723  * do ipif_ill_refrele_tail etc. Thus the one exception
4724  * where this can be called with locks held is when the caller
4725  * is certain that the nce_refcnt is sufficient to prevent
4726  * the invocation of nce_inactive.
4727  */
4728 void
4729 nce_refrele(nce_t *nce)
4730 {
4731 	ASSERT((nce)->nce_refcnt != 0);
4732 	mutex_enter(&nce->nce_lock);
4733 	if (--nce->nce_refcnt == 0)
4734 		nce_inactive(nce); /* destroys the mutex */
4735 	else
4736 		mutex_exit(&nce->nce_lock);
4737 }
4738 
4739 /*
4740  * free the nce after all refs have gone away.
4741  */
4742 static void
4743 nce_inactive(nce_t *nce)
4744 {
4745 	ill_t *ill = nce->nce_ill;
4746 
4747 	ASSERT(nce->nce_refcnt == 0);
4748 
4749 	ncec_refrele_notr(nce->nce_common);
4750 	nce->nce_common = NULL;
4751 	freemsg(nce->nce_fp_mp);
4752 	freemsg(nce->nce_dlur_mp);
4753 
4754 	mutex_enter(&ill->ill_lock);
4755 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4756 	    (char *), "nce", (void *), nce);
4757 	ill->ill_nce_cnt--;
4758 	nce->nce_ill = NULL;
4759 	/*
4760 	 * If the number of nce's associated with this ill has dropped
4761 	 * to zero, check whether we need to restart any operation that
4762 	 * is waiting for this to happen.
4763 	 */
4764 	if (ILL_DOWN_OK(ill)) {
4765 		/* ipif_ill_refrele_tail drops the ill_lock */
4766 		ipif_ill_refrele_tail(ill);
4767 	} else {
4768 		mutex_exit(&ill->ill_lock);
4769 	}
4770 
4771 	mutex_destroy(&nce->nce_lock);
4772 	kmem_cache_free(nce_cache, nce);
4773 }
4774 
4775 /*
4776  * Add an nce to the ill_nce list.
4777  */
4778 static nce_t *
4779 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4780 {
4781 	bzero(nce, sizeof (*nce));
4782 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783 	nce->nce_common = ncec;
4784 	nce->nce_addr = ncec->ncec_addr;
4785 	nce->nce_ill = ill;
4786 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787 	    (char *), "nce", (void *), nce);
4788 	ill->ill_nce_cnt++;
4789 
4790 	nce->nce_refcnt = 1; /* for the thread */
4791 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792 	nce->nce_dlur_mp = dlur_mp;
4793 
4794 	/* add nce to the ill's fastpath list.  */
4795 	nce->nce_refcnt++; /* for the list */
4796 	list_insert_head(&ill->ill_nce, nce);
4797 	return (nce);
4798 }
4799 
4800 static nce_t *
4801 nce_add(ill_t *ill, ncec_t *ncec)
4802 {
4803 	nce_t	*nce;
4804 	mblk_t	*dlur_mp = NULL;
4805 
4806 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4807 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808 
4809 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810 	if (nce == NULL)
4811 		return (NULL);
4812 	if (ncec->ncec_lladdr != NULL ||
4813 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4814 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815 		    ill->ill_phys_addr_length, ill->ill_sap,
4816 		    ill->ill_sap_length);
4817 		if (dlur_mp == NULL) {
4818 			kmem_cache_free(nce_cache, nce);
4819 			return (NULL);
4820 		}
4821 	}
4822 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4823 }
4824 
4825 /*
4826  * remove the nce from the ill's fastpath (ill_nce) list
4827  */
4828 void
4829 nce_delete(nce_t *nce)
4830 {
4831 	ill_t	*ill = nce->nce_ill;
4832 
4833 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4834 
4835 	mutex_enter(&nce->nce_lock);
4836 	if (nce->nce_is_condemned) {
4837 		/*
4838 		 * some other thread has removed this nce from the ill_nce list
4839 		 */
4840 		mutex_exit(&nce->nce_lock);
4841 		return;
4842 	}
4843 	nce->nce_is_condemned = B_TRUE;
4844 	mutex_exit(&nce->nce_lock);
4845 
4846 	list_remove(&ill->ill_nce, nce);
4847 	/*
4848 	 * even though we are holding the ill_lock, it is ok to
4849 	 * call nce_refrele here because we know that we should have
4850 	 * at least 2 refs on the nce: one for the thread, and one
4851 	 * for the list. The refrele below will release the one for
4852 	 * the list.
4853 	 */
4854 	nce_refrele(nce);
4855 }
4856 
4857 nce_t *
4858 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 {
4860 	nce_t *nce = NULL;
4861 
4862 	ASSERT(ill != NULL);
4863 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4864 
4865 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4866 	    nce = list_next(&ill->ill_nce, nce)) {
4867 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4868 			break;
4869 	}
4870 
4871 	/*
4872 	 * if we found the nce on the ill_nce list while holding
4873 	 * the ill_lock, then it cannot be condemned yet.
4874 	 */
4875 	if (nce != NULL) {
4876 		ASSERT(!nce->nce_is_condemned);
4877 		nce_refhold(nce);
4878 	}
4879 	return (nce);
4880 }
4881 
4882 /*
4883  * Walk the ill_nce list on ill. The callback function func() cannot perform
4884  * any destructive actions.
4885  */
4886 static void
4887 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4888 {
4889 	nce_t *nce = NULL, *nce_next;
4890 
4891 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4892 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4893 		nce_next = list_next(&ill->ill_nce, nce);
4894 		if (func(ill, nce, arg) != 0)
4895 			break;
4896 		nce = nce_next;
4897 	}
4898 }
4899 
4900 void
4901 nce_walk(ill_t *ill, pfi_t func, void *arg)
4902 {
4903 	mutex_enter(&ill->ill_lock);
4904 	nce_walk_common(ill, func, arg);
4905 	mutex_exit(&ill->ill_lock);
4906 }
4907 
4908 void
4909 nce_flush(ill_t *ill, boolean_t flushall)
4910 {
4911 	nce_t *nce, *nce_next;
4912 	list_t dead;
4913 
4914 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4915 	mutex_enter(&ill->ill_lock);
4916 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4917 		nce_next = list_next(&ill->ill_nce, nce);
4918 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4919 			nce = nce_next;
4920 			continue;
4921 		}
4922 		/*
4923 		 * nce_delete requires that the caller should either not
4924 		 * be holding locks, or should hold a ref to ensure that
4925 		 * we won't hit nce_inactive. So take a ref and clean up
4926 		 * after the list is flushed.
4927 		 */
4928 		nce_refhold(nce);
4929 		nce_delete(nce);
4930 		list_insert_tail(&dead, nce);
4931 		nce = nce_next;
4932 	}
4933 	mutex_exit(&ill->ill_lock);
4934 	while ((nce = list_head(&dead)) != NULL) {
4935 		list_remove(&dead, nce);
4936 		nce_refrele(nce);
4937 	}
4938 	ASSERT(list_is_empty(&dead));
4939 	list_destroy(&dead);
4940 }
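/*
 * The two-phase structure of nce_flush() above is deliberate: nce's are
 * unlinked and collected on the local `dead' list while ill_lock is held,
 * but the final nce_refrele() calls happen only after the lock is dropped,
 * since releasing the last reference can call nce_inactive(), which itself
 * needs ill_lock.
 */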
4941 
4942 /* Return a randomized interval: [1 .. intv] initially, else intv +/- ~20% */
4943 static clock_t
4944 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4945 {
4946 	clock_t rnd, frac;
4947 
4948 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4949 	/* Note that clock_t is signed; must chop off the sign bit */
4950 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4951 	if (initial_time) {
4952 		if (intv <= 0)
4953 			intv = 1;
4954 		else
4955 			intv = (rnd % intv) + 1;
4956 	} else {
4957 		/* Compute 'frac' as 20% of the configured interval */
4958 		if ((frac = intv / 5) <= 1)
4959 			frac = 2;
4960 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4961 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4962 			intv = 1;
4963 	}
4964 	return (intv);
4965 }
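/*
 * For example, with intv = 1000 ticks: on the initial call the result is
 * uniformly distributed over [1 .. 1000], while on subsequent calls frac is
 * 200 and the result falls in [800 .. 1200].  The jitter keeps periodic
 * probes and defenses from synchronizing across entries.
 */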
4966 
4967 void
4968 nce_resolv_ipmp_ok(ncec_t *ncec)
4969 {
4970 	mblk_t *mp;
4971 	uint_t pkt_len;
4972 	iaflags_t ixaflags = IXAF_NO_TRACE;
4973 	nce_t *under_nce;
4974 	ill_t	*ill = ncec->ncec_ill;
4975 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4976 	ipif_t *src_ipif = NULL;
4977 	ip_stack_t *ipst = ill->ill_ipst;
4978 	ill_t *send_ill;
4979 	uint_t nprobes;
4980 
4981 	ASSERT(IS_IPMP(ill));
4982 
4983 	mutex_enter(&ncec->ncec_lock);
4984 	nprobes = ncec->ncec_nprobes;
4985 	mp = ncec->ncec_qd_mp;
4986 	ncec->ncec_qd_mp = NULL;
4987 	ncec->ncec_nprobes = 0;
4988 	mutex_exit(&ncec->ncec_lock);
4989 
4990 	while (mp != NULL) {
4991 		mblk_t *nxt_mp;
4992 
4993 		nxt_mp = mp->b_next;
4994 		mp->b_next = NULL;
4995 		if (isv6) {
4996 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4997 
4998 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4999 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
5000 			    ill, ALL_ZONES, ipst);
5001 		} else {
5002 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
5003 
5004 			ixaflags |= IXAF_IS_IPV4;
5005 			pkt_len = ntohs(ipha->ipha_length);
5006 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5007 			    ill, ALL_ZONES, ipst);
5008 		}
5009 
5010 		/*
5011 		 * find a new nce based on an under_ill. The first IPMP probe
5012 		 * packet gets queued, so we could still find a src_ipif that
5013 		 * matches an IPMP test address.
5014 		 */
5015 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5016 			/*
5017 			 * if src_ipif is null, this could be either a
5018 			 * forwarded packet or a probe whose src got deleted.
5019 			 * We identify probes by looking at ncec_nprobes:
5020 			 * the first ncec_nprobes queued packets are probes
5021 			 * and are simply dropped.
5022 			 */
5023 			if (src_ipif == NULL && nprobes > 0)
5024 				goto drop_pkt;
5025 
5026 			/*
5027 			 * For forwarded packets, we use the ipmp rotor
5028 			 * to find send_ill.
5029 			 */
5030 			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5031 			    B_TRUE);
5032 		} else {
5033 			send_ill = src_ipif->ipif_ill;
5034 			ill_refhold(send_ill);
5035 		}
5036 
5037 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5038 		    (ncec_t *), ncec, (ipif_t *),
5039 		    src_ipif, (ill_t *), send_ill);
5040 
5041 		if (send_ill == NULL) {
5042 			if (src_ipif != NULL)
5043 				ipif_refrele(src_ipif);
5044 			goto drop_pkt;
5045 		}
5046 		/* create an under_nce on send_ill */
5047 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5048 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5049 			under_nce = nce_fastpath_create(send_ill, ncec);
5050 		else
5051 			under_nce = NULL;
5052 		rw_exit(&ipst->ips_ill_g_lock);
5053 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5054 			nce_fastpath_trigger(under_nce);
5055 
5056 		ill_refrele(send_ill);
5057 		if (src_ipif != NULL)
5058 			ipif_refrele(src_ipif);
5059 
5060 		if (under_nce != NULL) {
5061 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5062 			    ALL_ZONES, 0, NULL);
5063 			nce_refrele(under_nce);
5064 			if (nprobes > 0)
5065 				nprobes--;
5066 			mp = nxt_mp;
5067 			continue;
5068 		}
5069 drop_pkt:
5070 		if (isv6) {
5071 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5072 		} else {
5073 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5074 		}
5075 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5076 		freemsg(mp);
5077 		if (nprobes > 0)
5078 			nprobes--;
5079 		mp = nxt_mp;
5080 	}
5081 	ncec_cb_dispatch(ncec); /* complete callbacks */
5082 }
5083