/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h> /* needed for icmp_stack_t */
#include <inet/udp_impl.h> /* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
		    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_dld_ack(ill_t *, mblk_t *,
		    dl_capability_sub_t *);
static void	ill_capability_dld_enable(ill_t *);
static void	ill_capability_ack_thr(void *);
static void	ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static	void	ill_trace_cleanup(const ill_t *);
static	void	ipif_trace_cleanup(const ipif_t *);
#endif

static	void	ill_dlpi_clear_deferred(ill_t *ill);

static	void	phyint_flags_init(phyint_t *, t_uscalar_t);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
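
/*
 * Sketch of how the table above is consumed (see ip_wput_ioctl()): the
 * entry whose ipft_cmd matches the ioctl command is located, the payload
 * is validated against ipft_min_size, and ipft_pfi is invoked.
 * IPFT_F_NO_REPLY entries are never acked back to the caller, and
 * IPFT_F_SELF_REPLY entries are acked by the callee itself.
 */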

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
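
/*
 * Each ip_m_t entry above pairs a DLPI mac type with its interface type,
 * the SAP values to use for IPv4 and IPv6, the v4/v6 multicast mapping
 * functions, and the v6 interface-id / destination interface-id functions.
 * ip_m_lookup() (declared above) searches this table by mac type; the
 * trailing DL_OTHER entry presumably acts as the catch-all for media types
 * not listed explicitly.
 */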

char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * The ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * the zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * arrived from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have arrived if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp.  This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==>	no sap
 *   sap_length > 0	==>	sap is at the head of the dlpi address
 *   sap_length < 0	==>	sap is at the tail of the dlpi address
 */
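/*
 * For example, Ethernet-style drivers commonly advertise a sap_length of
 * -2, so for a 6-byte MAC address the DLPI address built here is laid out
 * as <6-byte phys><2-byte sap>.  Note that the sap is copied as a
 * native-order uint16_t.
 */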
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * The overall dlpi address placed in the message is addr_length plus
 * the absolute value of sap_length.
 */
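/*
 * Usage sketch (assuming an Ethernet-style sap_length of -2):
 *	ill_dlur_gen(lla, 6, ETHERTYPE_IP, -2)
 * builds a DL_UNITDATA_REQ whose destination address is the 6-byte lla
 * followed by the 2-byte sap, i.e. dl_dest_addr_length == 8.
 */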
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so, we should not add another mp to the list,
		 * negating the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
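
/*
 * Typical sequence (a sketch): an exclusive operation queues its mblk with
 * ipsq_pending_mp_add() while holding the ill_lock, then sends its request
 * downstream and waits for the response (or for the refcnts to drop).
 * When that happens, ipsq_pending_mp_get() below retrieves the mblk and
 * the associated conn so the operation can be restarted on the queue that
 * was saved in b_queue.
 */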

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg; we have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping packets instead of sending
 * them out the wrong interface, or accepting packets from the wrong
 * ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup.
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'.  (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
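
/*
 * Sketch of the fastpath handshake: a driver that supports DL_IOC_HDR_INFO
 * M_IOCACKs the probe with the filled-in header template, which
 * ill_fastpath_ack() above hands to nce_fastpath_update().  A NAK (handled
 * elsewhere) moves ill_dlpi_fastpath_state to IDS_FAILED, after which
 * further probes fail with ENOTSUP.
 */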

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	inners = &id_ic->id_subcap;
	if (outers->dl_length < sizeof (*id_ic) ||
	    inners->dl_length > (outers->dl_length - sizeof (*inners))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}
1591 
1592 /*
1593  * Process a hardware checksum offload capability negotiation ack received
1594  * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
1595  * of a DL_CAPABILITY_ACK message.
1596  */
1597 static void
ill_capability_hcksum_ack(ill_t * ill,mblk_t * mp,dl_capability_sub_t * isub)1598 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1599 {
1600 	dl_capability_req_t	*ocap;
1601 	dl_capab_hcksum_t	*ihck, *ohck;
1602 	ill_hcksum_capab_t	**ill_hcksum;
1603 	mblk_t			*nmp = NULL;
1604 	uint_t			sub_dl_cap = isub->dl_cap;
1605 	uint8_t			*capend;
1606 
1607 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
1608 
1609 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
1610 
1611 	/*
1612 	 * Note: range checks here are not absolutely sufficient to
1613 	 * make us robust against malformed messages sent by drivers;
1614 	 * this is in keeping with the rest of IP's dlpi handling.
1615 	 * (Remember, it's coming from something else in the kernel
1616 	 * address space)
1617 	 */
1618 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1619 	if (capend > mp->b_wptr) {
1620 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
1621 		    "malformed sub-capability too long for mblk");
1622 		return;
1623 	}
1624 
1625 	/*
1626 	 * There are two types of acks we process here:
1627 	 * 1. acks in reply to a (first form) generic capability req
1628 	 *    (no ENABLE flag set)
1629 	 * 2. acks in reply to a ENABLE capability req.
1630 	 *    (ENABLE flag set)
1631 	 */
1632 	ihck = (dl_capab_hcksum_t *)(isub + 1);
1633 
1634 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
1635 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
1636 		    "unsupported hardware checksum "
1637 		    "sub-capability (version %d, expected %d)",
1638 		    ihck->hcksum_version, HCKSUM_VERSION_1);
1639 		return;
1640 	}
1641 
1642 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
1643 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
1644 		    "checksum capability isn't as expected; pass-thru "
1645 		    "module(s) detected, discarding capability\n"));
1646 		return;
1647 	}
1648 
1649 #define	CURR_HCKSUM_CAPAB				\
1650 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
1651 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
1652 
1653 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
1654 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
1655 		/* do ENABLE processing */
1656 		if (*ill_hcksum == NULL) {
1657 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
1658 			    KM_NOSLEEP);
1659 
1660 			if (*ill_hcksum == NULL) {
1661 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
1662 				    "could not enable hcksum version %d "
1663 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
1664 				    ill->ill_name);
1665 				return;
1666 			}
1667 		}
1668 
1669 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
1670 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
1671 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
1672 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
1673 		    "has enabled hardware checksumming\n",
1674 		    ill->ill_name));
1675 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
1676 		/*
1677 		 * Enable hardware checksum offload.
1678 		 * Currently IP supports {TCP,UDP}/IPv4
1679 		 * partial and full cksum offload and
1680 		 * IPv4 header checksum offload.
1681 		 * Allocate a new mblk which will
1682 		 * contain a new capability request
1683 		 * to enable hardware checksum offload.
1684 		 */
1685 		uint_t	size;
1686 		uchar_t	*rptr;
1687 
1688 		size = sizeof (dl_capability_req_t) +
1689 		    sizeof (dl_capability_sub_t) + isub->dl_length;
1690 
1691 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
1692 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
1693 			    "could not enable hardware cksum for %s (ENOMEM)\n",
1694 			    ill->ill_name);
1695 			return;
1696 		}
1697 
1698 		rptr = nmp->b_rptr;
1699 		/* initialize dl_capability_req_t */
1700 		ocap = (dl_capability_req_t *)nmp->b_rptr;
1701 		ocap->dl_sub_offset =
1702 		    sizeof (dl_capability_req_t);
1703 		ocap->dl_sub_length =
1704 		    sizeof (dl_capability_sub_t) +
1705 		    isub->dl_length;
1706 		nmp->b_rptr += sizeof (dl_capability_req_t);
1707 
1708 		/* initialize dl_capability_sub_t */
1709 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
1710 		nmp->b_rptr += sizeof (*isub);
1711 
1712 		/* initialize dl_capab_hcksum_t */
1713 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
1714 		bcopy(ihck, ohck, sizeof (*ihck));
1715 
1716 		nmp->b_rptr = rptr;
1717 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
1718 
1719 		/* Set ENABLE flag */
1720 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
1721 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
1722 
1723 		/*
1724 		 * nmp points to a DL_CAPABILITY_REQ message to enable
1725 		 * hardware checksum acceleration.
1726 		 */
1727 		ill_capability_send(ill, nmp);
1728 	} else {
1729 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
1730 		    "advertised %x hardware checksum capability flags\n",
1731 		    ill->ill_name, ihck->hcksum_txflags));
1732 	}
1733 }
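
/*
 * For reference, the ENABLE request built above is laid out in nmp as
 * follows (matching the dl_sub_offset/dl_sub_length values set in the
 * code):
 *
 *	+------------------------+  <- b_rptr
 *	| dl_capability_req_t    |  dl_sub_offset = sizeof (req)
 *	+------------------------+
 *	| dl_capability_sub_t    |  copied from isub (DL_CAPAB_HCKSUM)
 *	+------------------------+
 *	| dl_capab_hcksum_t      |  txflags masked to CURR_HCKSUM_CAPAB,
 *	|                        |  with HCKSUM_ENABLE set
 *	+------------------------+  <- b_wptr
 *
 * The zerocopy enable path below builds the analogous message for
 * DL_CAPAB_ZEROCOPY.
 */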
1734 
1735 static void
1736 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
1737 {
1738 	dl_capab_hcksum_t *hck_subcap;
1739 	dl_capability_sub_t *dl_subcap;
1740 
1741 	if (!ILL_HCKSUM_CAPABLE(ill))
1742 		return;
1743 
1744 	ASSERT(ill->ill_hcksum_capab != NULL);
1745 
1746 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1747 	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
1748 	dl_subcap->dl_length = sizeof (*hck_subcap);
1749 
1750 	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
1751 	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
1752 	hck_subcap->hcksum_txflags = 0;
1753 
1754 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
1755 }
1756 
1757 static void
1758 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1759 {
1760 	mblk_t *nmp = NULL;
1761 	dl_capability_req_t *oc;
1762 	dl_capab_zerocopy_t *zc_ic, *zc_oc;
1763 	ill_zerocopy_capab_t **ill_zerocopy_capab;
1764 	uint_t sub_dl_cap = isub->dl_cap;
1765 	uint8_t *capend;
1766 
1767 	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
1768 
1769 	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
1770 
1771 	/*
1772 	 * Note: range checks here are not absolutely sufficient to
1773 	 * make us robust against malformed messages sent by drivers;
1774 	 * this is in keeping with the rest of IP's dlpi handling.
1775 	 * (Remember, it's coming from something else in the kernel
1776 	 * address space)
1777 	 */
1778 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1779 	if (capend > mp->b_wptr) {
1780 		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
1781 		    "malformed sub-capability too long for mblk");
1782 		return;
1783 	}
1784 
1785 	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
1786 	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
1787 		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
1788 		    "unsupported ZEROCOPY sub-capability (version %d, "
1789 		    "expected %d)", zc_ic->zerocopy_version,
1790 		    ZEROCOPY_VERSION_1);
1791 		return;
1792 	}
1793 
1794 	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
1795 		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
1796 		    "capability isn't as expected; pass-thru module(s) "
1797 		    "detected, discarding capability\n"));
1798 		return;
1799 	}
1800 
1801 	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
1802 		if (*ill_zerocopy_capab == NULL) {
1803 			*ill_zerocopy_capab =
1804 			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
1805 			    KM_NOSLEEP);
1806 
1807 			if (*ill_zerocopy_capab == NULL) {
1808 				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
1809 				    "could not enable Zero-copy version %d "
1810 				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
1811 				    ill->ill_name);
1812 				return;
1813 			}
1814 		}
1815 
1816 		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
1817 		    "supports Zero-copy version %d\n", ill->ill_name,
1818 		    ZEROCOPY_VERSION_1));
1819 
1820 		(*ill_zerocopy_capab)->ill_zerocopy_version =
1821 		    zc_ic->zerocopy_version;
1822 		(*ill_zerocopy_capab)->ill_zerocopy_flags =
1823 		    zc_ic->zerocopy_flags;
1824 
1825 		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
1826 	} else {
1827 		uint_t size;
1828 		uchar_t *rptr;
1829 
1830 		size = sizeof (dl_capability_req_t) +
1831 		    sizeof (dl_capability_sub_t) +
1832 		    sizeof (dl_capab_zerocopy_t);
1833 
1834 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
1835 			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
1836 			    "could not enable zerocopy for %s (ENOMEM)\n",
1837 			    ill->ill_name);
1838 			return;
1839 		}
1840 
1841 		rptr = nmp->b_rptr;
1842 		/* initialize dl_capability_req_t */
1843 		oc = (dl_capability_req_t *)rptr;
1844 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
1845 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
1846 		    sizeof (dl_capab_zerocopy_t);
1847 		rptr += sizeof (dl_capability_req_t);
1848 
1849 		/* initialize dl_capability_sub_t */
1850 		bcopy(isub, rptr, sizeof (*isub));
1851 		rptr += sizeof (*isub);
1852 
1853 		/* initialize dl_capab_zerocopy_t */
1854 		zc_oc = (dl_capab_zerocopy_t *)rptr;
1855 		*zc_oc = *zc_ic;
1856 
1857 		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
1858 		    "to enable zero-copy version %d\n", ill->ill_name,
1859 		    ZEROCOPY_VERSION_1));
1860 
1861 		/* set VMSAFE_MEM flag */
1862 		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
1863 
1864 		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
1865 		ill_capability_send(ill, nmp);
1866 	}
1867 }
1868 
1869 static void
1870 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
1871 {
1872 	dl_capab_zerocopy_t *zerocopy_subcap;
1873 	dl_capability_sub_t *dl_subcap;
1874 
1875 	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
1876 		return;
1877 
1878 	ASSERT(ill->ill_zerocopy_capab != NULL);
1879 
1880 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1881 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
1882 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
1883 
1884 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
1885 	zerocopy_subcap->zerocopy_version =
1886 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
1887 	zerocopy_subcap->zerocopy_flags = 0;
1888 
1889 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1890 }
1891 
1892 /*
1893  * DLD capability
1894  * Refer to dld.h for more information regarding the purpose and usage
1895  * of this capability.
1896  */
1897 static void
1898 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1899 {
1900 	dl_capab_dld_t		*dld_ic, dld;
1901 	uint_t			sub_dl_cap = isub->dl_cap;
1902 	uint8_t			*capend;
1903 	ill_dld_capab_t		*idc;
1904 
1905 	ASSERT(IAM_WRITER_ILL(ill));
1906 	ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1907 
1908 	/*
1909 	 * Note: range checks here are not absolutely sufficient to
1910 	 * make us robust against malformed messages sent by drivers;
1911 	 * this is in keeping with the rest of IP's dlpi handling.
1912 	 * (Remember, it's coming from something else in the kernel
1913 	 * address space)
1914 	 */
1915 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1916 	if (capend > mp->b_wptr) {
1917 		cmn_err(CE_WARN, "ill_capability_dld_ack: "
1918 		    "malformed sub-capability too long for mblk");
1919 		return;
1920 	}
1921 	dld_ic = (dl_capab_dld_t *)(isub + 1);
1922 	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1923 		cmn_err(CE_CONT, "ill_capability_dld_ack: "
1924 		    "unsupported DLD sub-capability (version %d, "
1925 		    "expected %d)", dld_ic->dld_version,
1926 		    DLD_CURRENT_VERSION);
1927 		return;
1928 	}
1929 	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1930 		ip1dbg(("ill_capability_dld_ack: mid token for dld "
1931 		    "capability isn't as expected; pass-thru module(s) "
1932 		    "detected, discarding capability\n"));
1933 		return;
1934 	}
1935 
1936 	/*
1937 	 * Copy locally to ensure alignment.
1938 	 */
1939 	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1940 
1941 	if ((idc = ill->ill_dld_capab) == NULL) {
1942 		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1943 		if (idc == NULL) {
1944 			cmn_err(CE_WARN, "ill_capability_dld_ack: "
1945 			    "could not enable DLD version %d "
1946 			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1947 			    ill->ill_name);
1948 			return;
1949 		}
1950 		ill->ill_dld_capab = idc;
1951 	}
1952 	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1953 	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1954 	ip1dbg(("ill_capability_dld_ack: interface %s "
1955 	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1956 
1957 	ill_capability_dld_enable(ill);
1958 }
1959 
1960 /*
1961  * Typically capability negotiation between IP and the driver happens via
1962  * DLPI message exchange. However, GLD also offers a direct function call
1963  * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
1964  * But arbitrary function calls into IP or GLD are not permitted, since both
1965  * of them are protected by their own perimeter mechanism. The perimeter can
1966  * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1967  * these perimeters is IP -> MAC. Thus, for example, to enable squeue
1968  * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1969  * to enter the mac perimeter and then do the direct function calls into
1970  * GLD to enable squeue polling. The ring related callbacks from the mac into
1971  * the stack to add, bind, quiesce, restart or cleanup a ring are all
1972  * protected by the mac perimeter.
1973  */
1974 static void
1975 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1976 {
1977 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
1978 	int			err;
1979 
1980 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1981 	    DLD_ENABLE);
1982 	ASSERT(err == 0);
1983 }
1984 
1985 static void
1986 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1987 {
1988 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
1989 	int			err;
1990 
1991 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1992 	    DLD_DISABLE);
1993 	ASSERT(err == 0);
1994 }
1995 
1996 boolean_t
1997 ill_mac_perim_held(ill_t *ill)
1998 {
1999 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
2000 
2001 	return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
2002 	    DLD_QUERY));
2003 }
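
/*
 * Illustrative usage of the perimeter helpers above; this is the pattern
 * used by ill_capability_dld_enable() and ill_capability_dld_disable()
 * below:
 *
 *	mac_perim_handle_t mph;
 *
 *	ill_mac_perim_enter(ill, &mph);
 *	... direct function calls into GLD ...
 *	ill_mac_perim_exit(ill, mph);
 */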
2004 
2005 static void
2006 ill_capability_direct_enable(ill_t *ill)
2007 {
2008 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
2009 	ill_dld_direct_t	*idd = &idc->idc_direct;
2010 	dld_capab_direct_t	direct;
2011 	int			rc;
2012 
2013 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2014 
2015 	bzero(&direct, sizeof (direct));
2016 	direct.di_rx_cf = (uintptr_t)ip_input;
2017 	direct.di_rx_ch = ill;
2018 
2019 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2020 	    DLD_ENABLE);
2021 	if (rc == 0) {
2022 		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2023 		idd->idd_tx_dh = direct.di_tx_dh;
2024 		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2025 		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2026 		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2027 		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2028 		ASSERT(idd->idd_tx_cb_df != NULL);
2029 		ASSERT(idd->idd_tx_fctl_df != NULL);
2030 		ASSERT(idd->idd_tx_df != NULL);
2031 		/*
2032 		 * One-time registration of the flow enable callback function.
2033 		 */
2034 		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2035 		    ill_flow_enable, ill);
2036 		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2037 		DTRACE_PROBE1(direct_on, (ill_t *), ill);
2038 	} else {
2039 		cmn_err(CE_WARN, "warning: could not enable DIRECT "
2040 		    "capability, rc = %d\n", rc);
2041 		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
2042 	}
2043 }
2044 
2045 static void
2046 ill_capability_poll_enable(ill_t *ill)
2047 {
2048 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
2049 	dld_capab_poll_t	poll;
2050 	int			rc;
2051 
2052 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2053 
2054 	bzero(&poll, sizeof (poll));
2055 	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
2056 	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
2057 	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
2058 	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
2059 	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
2060 	poll.poll_ring_ch = ill;
2061 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
2062 	    DLD_ENABLE);
2063 	if (rc == 0) {
2064 		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
2065 		DTRACE_PROBE1(poll_on, (ill_t *), ill);
2066 	} else {
2067 		ip1dbg(("warning: could not enable POLL "
2068 		    "capability, rc = %d\n", rc));
2069 		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
2070 	}
2071 }
2072 
2073 /*
2074  * Enable the LSO capability.
2075  */
2076 static void
2077 ill_capability_lso_enable(ill_t *ill)
2078 {
2079 	ill_dld_capab_t	*idc = ill->ill_dld_capab;
2080 	dld_capab_lso_t	lso;
2081 	int rc;
2082 
2083 	ASSERT(IAM_WRITER_ILL(ill));
2084 
2085 	if (ill->ill_lso_capab == NULL) {
2086 		ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2087 		    KM_NOSLEEP);
2088 		if (ill->ill_lso_capab == NULL) {
2089 			cmn_err(CE_WARN, "ill_capability_lso_enable: "
2090 			    "could not enable LSO for %s (ENOMEM)\n",
2091 			    ill->ill_name);
2092 			return;
2093 		}
2094 	}
2095 
2096 	bzero(&lso, sizeof (lso));
2097 	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2098 	    DLD_ENABLE)) == 0) {
2099 		ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2100 		ill->ill_lso_capab->ill_lso_max_tcpv4 = lso.lso_max_tcpv4;
2101 		ill->ill_lso_capab->ill_lso_max_tcpv6 = lso.lso_max_tcpv6;
2102 		ill->ill_capabilities |= ILL_CAPAB_LSO;
2103 		ip1dbg(("ill_capability_lso_enable: interface %s "
2104 		    "has enabled LSO\n", ill->ill_name));
2105 	} else {
2106 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2107 		ill->ill_lso_capab = NULL;
2108 		DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2109 	}
2110 }
2111 
2112 static void
2113 ill_capability_dld_enable(ill_t *ill)
2114 {
2115 	mac_perim_handle_t mph;
2116 
2117 	ASSERT(IAM_WRITER_ILL(ill));
2118 
2119 	ill_mac_perim_enter(ill, &mph);
2120 	if (!ill->ill_isv6) {
2121 		ill_capability_direct_enable(ill);
2122 		ill_capability_poll_enable(ill);
2123 	}
2124 	ill_capability_lso_enable(ill);
2125 	ill->ill_capabilities |= ILL_CAPAB_DLD;
2126 	ill_mac_perim_exit(ill, mph);
2127 }
2128 
2129 static void
2130 ill_capability_dld_disable(ill_t *ill)
2131 {
2132 	ill_dld_capab_t	*idc;
2133 	ill_dld_direct_t *idd;
2134 	mac_perim_handle_t	mph;
2135 
2136 	ASSERT(IAM_WRITER_ILL(ill));
2137 
2138 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
2139 		return;
2140 
2141 	ill_mac_perim_enter(ill, &mph);
2142 
2143 	idc = ill->ill_dld_capab;
2144 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
2145 		/*
2146 		 * For performance we avoid locks in the transmit data path
2147 		 * and don't maintain a count of the number of threads using
2148 		 * direct calls. Thus some threads could be using direct
2149 		 * transmit calls to GLD, even after the capability mechanism
2150 		 * turns it off. This is still safe since the handles used in
2151 		 * the direct calls continue to be valid until the unplumb is
2152 		 * completed. Remove the callback that was added (1-time) at
2153 		 * capab enable time.
2154 		 */
2155 		mutex_enter(&ill->ill_lock);
2156 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2157 		mutex_exit(&ill->ill_lock);
2158 		if (ill->ill_flownotify_mh != NULL) {
2159 			idd = &idc->idc_direct;
2160 			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2161 			    ill->ill_flownotify_mh);
2162 			ill->ill_flownotify_mh = NULL;
2163 		}
2164 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2165 		    NULL, DLD_DISABLE);
2166 	}
2167 
2168 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2169 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2170 		ip_squeue_clean_all(ill);
2171 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2172 		    NULL, DLD_DISABLE);
2173 	}
2174 
2175 	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2176 		ASSERT(ill->ill_lso_capab != NULL);
2177 		/*
2178 		 * Clear the capability flag for LSO but retain the
2179 		 * ill_lso_capab structure since it's possible that another
2180 		 * thread is still referring to it.  The structure only gets
2181 		 * deallocated when we destroy the ill.
2182 		 */
2183 
2184 		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2185 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2186 		    NULL, DLD_DISABLE);
2187 	}
2188 
2189 	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2190 	ill_mac_perim_exit(ill, mph);
2191 }
2192 
2193 /*
2194  * Capability Negotiation protocol
2195  *
2196  * We don't wait for DLPI capability operations to finish during interface
2197  * bringup or teardown. Doing so would introduce more asynchrony and the
2198  * interface up/down operations would need multiple returns and restarts.
2199  * Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2200  * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2201  * exclusive operation won't start until the DLPI operations of the previous
2202  * exclusive operation complete.
2203  *
2204  * The capability state machine is shown below.
2205  *
2206  * state		next state		event, action
2207  *
2208  * IDCS_UNKNOWN		IDCS_PROBE_SENT		ill_capability_probe
2209  * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
2210  * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
2211  * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
2212  * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
2213  * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
2214  * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
2215  *						    ill_capability_probe.
2216  */
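
/*
 * Illustrative sketch only, not part of the original source: the state
 * table above expressed as a transition function.  The idcs_event_t enum
 * and idcs_next() helper are hypothetical names; the real transitions are
 * performed inline by the routines named in the table.
 *
 *	typedef enum {
 *		IDCS_EV_PROBE,		probe request sent
 *		IDCS_EV_ACK,		DL_CAPABILITY_ACK processed
 *		IDCS_EV_NACK,		nack seen by ip_rput_dlpi_writer
 *		IDCS_EV_RENEG,		DL_NOTE_CAPAB_RENEG received
 *		IDCS_EV_RESET		reset request sent
 *	} idcs_event_t;
 *
 *	static uint_t
 *	idcs_next(uint_t cur, idcs_event_t ev)
 *	{
 *		switch (cur) {
 *		case IDCS_UNKNOWN:
 *			return (ev == IDCS_EV_PROBE ? IDCS_PROBE_SENT : cur);
 *		case IDCS_PROBE_SENT:
 *			if (ev == IDCS_EV_ACK)
 *				return (IDCS_OK);
 *			return (ev == IDCS_EV_NACK ? IDCS_FAILED : cur);
 *		case IDCS_OK:
 *			if (ev == IDCS_EV_RENEG)
 *				return (IDCS_RENEG);
 *			return (ev == IDCS_EV_RESET ? IDCS_RESET_SENT : cur);
 *		case IDCS_RESET_SENT:
 *			return (ev == IDCS_EV_ACK ? IDCS_UNKNOWN : cur);
 *		case IDCS_RENEG:
 *			return (ev == IDCS_EV_ACK ? IDCS_PROBE_SENT : cur);
 *		default:
 *			return (cur);
 *		}
 *	}
 */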
2217 
2218 /*
2219  * Dedicated thread started from ip_stack_init that handles capability
2220  * disable. This thread ensures the taskq dispatch does not fail by waiting
2221  * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2222  * that direct calls to DLD are done in a cv_waitable context.
2223  */
2224 void
2225 ill_taskq_dispatch(ip_stack_t *ipst)
2226 {
2227 	callb_cpr_t cprinfo;
2228 	char	name[64];
2229 	mblk_t	*mp;
2230 
2231 	(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2232 	    ipst->ips_netstack->netstack_stackid);
2233 	CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2234 	    name);
2235 	mutex_enter(&ipst->ips_capab_taskq_lock);
2236 
2237 	for (;;) {
2238 		mp = ipst->ips_capab_taskq_head;
2239 		while (mp != NULL) {
2240 			ipst->ips_capab_taskq_head = mp->b_next;
2241 			if (ipst->ips_capab_taskq_head == NULL)
2242 				ipst->ips_capab_taskq_tail = NULL;
2243 			mutex_exit(&ipst->ips_capab_taskq_lock);
2244 			mp->b_next = NULL;
2245 
2246 			VERIFY(taskq_dispatch(system_taskq,
2247 			    ill_capability_ack_thr, mp, TQ_SLEEP) !=
2248 			    TASKQID_INVALID);
2249 			mutex_enter(&ipst->ips_capab_taskq_lock);
2250 			mp = ipst->ips_capab_taskq_head;
2251 		}
2252 
2253 		if (ipst->ips_capab_taskq_quit)
2254 			break;
2255 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2256 		cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2257 		CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2258 	}
2259 	VERIFY(ipst->ips_capab_taskq_head == NULL);
2260 	VERIFY(ipst->ips_capab_taskq_tail == NULL);
2261 	CALLB_CPR_EXIT(&cprinfo);
2262 	thread_exit();
2263 }
2264 
2265 /*
2266  * Consume a new-style hardware capabilities negotiation ack.
2267  * Called via taskq on receipt of DL_CAPABILITY_ACK.
2268  */
2269 static void
2270 ill_capability_ack_thr(void *arg)
2271 {
2272 	mblk_t	*mp = arg;
2273 	dl_capability_ack_t *capp;
2274 	dl_capability_sub_t *subp, *endp;
2275 	ill_t	*ill;
2276 	boolean_t reneg;
2277 
2278 	ill = (ill_t *)mp->b_prev;
2279 	mp->b_prev = NULL;
2280 
2281 	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2282 
2283 	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2284 	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
2285 		/*
2286 		 * We have received the ack for our DL_CAPAB reset request.
2287 		 * There isn't anything in the message that needs processing.
2288 		 * All message-based capabilities have been disabled; now
2289 		 * do the function-call-based capability disable.
2290 		 */
2291 		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2292 		ill_capability_dld_disable(ill);
2293 		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2294 		if (reneg)
2295 			ill_capability_probe(ill);
2296 		goto done;
2297 	}
2298 
2299 	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2300 		ill->ill_dlpi_capab_state = IDCS_OK;
2301 
2302 	capp = (dl_capability_ack_t *)mp->b_rptr;
2303 
2304 	if (capp->dl_sub_length == 0) {
2305 		/* no new-style capabilities */
2306 		goto done;
2307 	}
2308 
2309 	/* make sure the driver supplied a correct dl_sub_length */
2310 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2311 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2312 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2313 		goto done;
2314 	}
2315 
2316 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2317 	/*
2318 	 * There are sub-capabilities. Process the ones we know about.
2319 	 * Loop until we don't have room for another sub-cap header.
2320 	 */
2321 	for (subp = SC(capp, capp->dl_sub_offset),
2322 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2323 	    subp <= endp;
2324 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2325 
2326 		switch (subp->dl_cap) {
2327 		case DL_CAPAB_ID_WRAPPER:
2328 			ill_capability_id_ack(ill, mp, subp);
2329 			break;
2330 		default:
2331 			ill_capability_dispatch(ill, mp, subp);
2332 			break;
2333 		}
2334 	}
2335 #undef SC
2336 done:
2337 	inet_freemsg(mp);
2338 	ill_capability_done(ill);
2339 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
2340 }
2341 
2342 /*
2343  * This needs to be started in a taskq thread to provide a cv_waitable
2344  * context.
2345  */
2346 void
2347 ill_capability_ack(ill_t *ill, mblk_t *mp)
2348 {
2349 	ip_stack_t	*ipst = ill->ill_ipst;
2350 
2351 	mp->b_prev = (mblk_t *)ill;
2352 	ASSERT(mp->b_next == NULL);
2353 
2354 	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2355 	    TQ_NOSLEEP) != TASKQID_INVALID)
2356 		return;
2357 
2358 	/*
2359 	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2360 	 * which will do the dispatch using TQ_SLEEP to guarantee success.
2361 	 */
2362 	mutex_enter(&ipst->ips_capab_taskq_lock);
2363 	if (ipst->ips_capab_taskq_head == NULL) {
2364 		ASSERT(ipst->ips_capab_taskq_tail == NULL);
2365 		ipst->ips_capab_taskq_head = mp;
2366 	} else {
2367 		ipst->ips_capab_taskq_tail->b_next = mp;
2368 	}
2369 	ipst->ips_capab_taskq_tail = mp;
2370 
2371 	cv_signal(&ipst->ips_capab_taskq_cv);
2372 	mutex_exit(&ipst->ips_capab_taskq_lock);
2373 }
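
/*
 * Note on the fallback above: the common case is a successful TQ_NOSLEEP
 * dispatch; only under allocation pressure is the mblk queued on
 * ips_capab_taskq_head for ill_taskq_dispatch(), whose TQ_SLEEP dispatch
 * cannot fail.  While queued, b_prev carries the ill and b_next the queue
 * linkage, as set up above.
 */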
2374 
2375 /*
2376  * This routine is called to scan the fragmentation reassembly table for
2377  * the specified ILL for any packets that are starting to smell.
2378  * dead_interval is the maximum time in seconds that will be tolerated.  It
2379  * will either be the value specified in ip_g_frag_timeout, or zero if the
2380  * ILL is shutting down and it is time to blow everything off.
2381  *
2382  * It returns the number of seconds (as a time_t) that the next frag timer
2383  * should be scheduled for, 0 meaning that the timer doesn't need to be
2384  * re-started.  Note that the method of calculating next_timeout isn't
2385  * entirely accurate since time will flow between the time we grab
2386  * current_time and the time we schedule the next timeout.  This isn't a
2387  * big problem since this is the timer for sending ICMP reassembly time
2388  * exceeded messages, and it doesn't have to be exactly accurate.
2389  *
2390  * This function is sometimes called as writer, although this is not
2391  * required.
2392  */
2393 time_t
2394 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2395 {
2396 	ipfb_t	*ipfb;
2397 	ipfb_t	*endp;
2398 	ipf_t	*ipf;
2399 	ipf_t	*ipfnext;
2400 	mblk_t	*mp;
2401 	time_t	current_time = gethrestime_sec();
2402 	time_t	next_timeout = 0;
2403 	uint32_t	hdr_length;
2404 	mblk_t	*send_icmp_head;
2405 	mblk_t	*send_icmp_head_v6;
2406 	ip_stack_t *ipst = ill->ill_ipst;
2407 	ip_recv_attr_t iras;
2408 
2409 	bzero(&iras, sizeof (iras));
2410 	iras.ira_flags = 0;
2411 	iras.ira_ill = iras.ira_rill = ill;
2412 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2413 	iras.ira_rifindex = iras.ira_ruifindex;
2414 
2415 	ipfb = ill->ill_frag_hash_tbl;
2416 	if (ipfb == NULL)
2417 		return (0);
2418 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2419 	/* Walk the frag hash table. */
2420 	for (; ipfb < endp; ipfb++) {
2421 		send_icmp_head = NULL;
2422 		send_icmp_head_v6 = NULL;
2423 		mutex_enter(&ipfb->ipfb_lock);
2424 		while ((ipf = ipfb->ipfb_ipf) != NULL) {
2425 			time_t frag_time = current_time - ipf->ipf_timestamp;
2426 			time_t frag_timeout;
2427 
2428 			if (frag_time < dead_interval) {
2429 				/*
2430 				 * There are some outstanding fragments
2431 				 * that will timeout later.  Make note of
2432 				 * the time so that we can reschedule the
2433 				 * next timeout appropriately.
2434 				 */
2435 				frag_timeout = dead_interval - frag_time;
2436 				if (next_timeout == 0 ||
2437 				    frag_timeout < next_timeout) {
2438 					next_timeout = frag_timeout;
2439 				}
2440 				break;
2441 			}
2442 			/* Time's up.  Get it out of here. */
2443 			hdr_length = ipf->ipf_nf_hdr_len;
2444 			ipfnext = ipf->ipf_hash_next;
2445 			if (ipfnext)
2446 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2447 			*ipf->ipf_ptphn = ipfnext;
2448 			mp = ipf->ipf_mp->b_cont;
2449 			for (; mp; mp = mp->b_cont) {
2450 				/* Extra points for neatness. */
2451 				IP_REASS_SET_START(mp, 0);
2452 				IP_REASS_SET_END(mp, 0);
2453 			}
2454 			mp = ipf->ipf_mp->b_cont;
2455 			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2456 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2457 			ipfb->ipfb_count -= ipf->ipf_count;
2458 			ASSERT(ipfb->ipfb_frag_pkts > 0);
2459 			ipfb->ipfb_frag_pkts--;
2460 			/*
2461 			 * We do not send any icmp message from here because
2462 			 * we currently are holding the ipfb_lock for this
2463 			 * hash chain. If we try and send any icmp messages
2464 			 * from here we may end up via a put back into ip
2465 			 * trying to get the same lock, causing a recursive
2466 			 * mutex panic. Instead we build a list and send all
2467 			 * the icmp messages after we have dropped the lock.
2468 			 */
2469 			if (ill->ill_isv6) {
2470 				if (hdr_length != 0) {
2471 					mp->b_next = send_icmp_head_v6;
2472 					send_icmp_head_v6 = mp;
2473 				} else {
2474 					freemsg(mp);
2475 				}
2476 			} else {
2477 				if (hdr_length != 0) {
2478 					mp->b_next = send_icmp_head;
2479 					send_icmp_head = mp;
2480 				} else {
2481 					freemsg(mp);
2482 				}
2483 			}
2484 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2485 			ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2486 			freeb(ipf->ipf_mp);
2487 		}
2488 		mutex_exit(&ipfb->ipfb_lock);
2489 		/*
2490 		 * Now need to send any icmp messages that we delayed from
2491 		 * above.
2492 		 */
2493 		while (send_icmp_head_v6 != NULL) {
2494 			ip6_t *ip6h;
2495 
2496 			mp = send_icmp_head_v6;
2497 			send_icmp_head_v6 = send_icmp_head_v6->b_next;
2498 			mp->b_next = NULL;
2499 			ip6h = (ip6_t *)mp->b_rptr;
2500 			iras.ira_flags = 0;
2501 			/*
2502 			 * This will result in an incorrect ALL_ZONES zoneid
2503 			 * for multicast packets, but we
2504 			 * don't send ICMP errors for those in any case.
2505 			 */
2506 			iras.ira_zoneid =
2507 			    ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2508 			    ill, ipst);
2509 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2510 			icmp_time_exceeded_v6(mp,
2511 			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2512 			    &iras);
2513 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2514 		}
2515 		while (send_icmp_head != NULL) {
2516 			ipaddr_t dst;
2517 
2518 			mp = send_icmp_head;
2519 			send_icmp_head = send_icmp_head->b_next;
2520 			mp->b_next = NULL;
2521 
2522 			dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2523 
2524 			iras.ira_flags = IRAF_IS_IPV4;
2525 			/*
2526 			 * This will result in an incorrect ALL_ZONES zoneid
2527 			 * for broadcast and multicast packets, but we
2528 			 * don't send ICMP errors for those in any case.
2529 			 */
2530 			iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2531 			    ill, ipst);
2532 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2533 			icmp_time_exceeded(mp,
2534 			    ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2535 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2536 		}
2537 	}
2538 	/*
2539 	 * A non-dying ILL will use the return value to decide whether to
2540 	 * restart the frag timer, and for how long.
2541 	 */
2542 	return (next_timeout);
2543 }
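
/*
 * Illustrative caller sketch (hypothetical names, not the actual timer
 * code): a non-dying ILL would rearm along these lines, where frag_timer()
 * is the scheduled callback and frag_timeout_secs the configured interval.
 *
 *	time_t next;
 *
 *	next = ill_frag_timeout(ill, frag_timeout_secs);
 *	if (next != 0)
 *		(void) timeout(frag_timer, ill, SEC_TO_TICK(next));
 */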
2544 
2545 /*
2546  * This routine is called when the approximate count of mblk memory used
2547  * for the specified ILL has exceeded max_count.
2548  */
2549 void
2550 ill_frag_prune(ill_t *ill, uint_t max_count)
2551 {
2552 	ipfb_t	*ipfb;
2553 	ipf_t	*ipf;
2554 	size_t	count;
2555 	clock_t now;
2556 
2557 	/*
2558 	 * If we are here within ip_min_frag_prune_time msecs of the last
2559 	 * prune, increment ill_frag_free_num_pkts (otherwise reset it); that
2560 	 * many of the oldest packets are then removed from each bucket below.
2561 	 */
2562 	mutex_enter(&ill->ill_lock);
2563 	now = ddi_get_lbolt();
2564 	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2565 	    (ip_min_frag_prune_time != 0 ?
2566 	    ip_min_frag_prune_time : msec_per_tick)) {
2567 
2568 		ill->ill_frag_free_num_pkts++;
2569 
2570 	} else {
2571 		ill->ill_frag_free_num_pkts = 0;
2572 	}
2573 	ill->ill_last_frag_clean_time = now;
2574 	mutex_exit(&ill->ill_lock);
2575 
2576 	/*
2577 	 * free ill_frag_free_num_pkts oldest packets from each bucket.
2578 	 */
2579 	if (ill->ill_frag_free_num_pkts != 0) {
2580 		int ix;
2581 
2582 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2583 			ipfb = &ill->ill_frag_hash_tbl[ix];
2584 			mutex_enter(&ipfb->ipfb_lock);
2585 			if (ipfb->ipfb_ipf != NULL) {
2586 				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2587 				    ill->ill_frag_free_num_pkts);
2588 			}
2589 			mutex_exit(&ipfb->ipfb_lock);
2590 		}
2591 	}
2592 	/*
2593 	 * While the reassembly list for this ILL is too big, prune a fragment
2594 	 * queue by age, oldest first.
2595 	 */
2596 	while (ill->ill_frag_count > max_count) {
2597 		int	ix;
2598 		ipfb_t	*oipfb = NULL;
2599 		uint_t	oldest = UINT_MAX;
2600 
2601 		count = 0;
2602 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2603 			ipfb = &ill->ill_frag_hash_tbl[ix];
2604 			mutex_enter(&ipfb->ipfb_lock);
2605 			ipf = ipfb->ipfb_ipf;
2606 			if (ipf != NULL && ipf->ipf_gen < oldest) {
2607 				oldest = ipf->ipf_gen;
2608 				oipfb = ipfb;
2609 			}
2610 			count += ipfb->ipfb_count;
2611 			mutex_exit(&ipfb->ipfb_lock);
2612 		}
2613 		if (oipfb == NULL)
2614 			break;
2615 
2616 		if (count <= max_count)
2617 			return;	/* Somebody beat us to it, nothing to do */
2618 		mutex_enter(&oipfb->ipfb_lock);
2619 		ipf = oipfb->ipfb_ipf;
2620 		if (ipf != NULL) {
2621 			ill_frag_free_pkts(ill, oipfb, ipf, 1);
2622 		}
2623 		mutex_exit(&oipfb->ipfb_lock);
2624 	}
2625 }
2626 
2627 /*
2628  * free 'free_cnt' fragmented packets starting at ipf.
2629  */
2630 void
2631 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
2632 {
2633 	size_t	count;
2634 	mblk_t	*mp;
2635 	mblk_t	*tmp;
2636 	ipf_t **ipfp = ipf->ipf_ptphn;
2637 
2638 	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
2639 	ASSERT(ipfp != NULL);
2640 	ASSERT(ipf != NULL);
2641 
2642 	while (ipf != NULL && free_cnt-- > 0) {
2643 		count = ipf->ipf_count;
2644 		mp = ipf->ipf_mp;
2645 		ipf = ipf->ipf_hash_next;
2646 		for (tmp = mp; tmp; tmp = tmp->b_cont) {
2647 			IP_REASS_SET_START(tmp, 0);
2648 			IP_REASS_SET_END(tmp, 0);
2649 		}
2650 		atomic_add_32(&ill->ill_frag_count, -count);
2651 		ASSERT(ipfb->ipfb_count >= count);
2652 		ipfb->ipfb_count -= count;
2653 		ASSERT(ipfb->ipfb_frag_pkts > 0);
2654 		ipfb->ipfb_frag_pkts--;
2655 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2656 		ip_drop_input("ipIfStatsReasmFails", mp, ill);
2657 		freemsg(mp);
2658 	}
2659 
2660 	if (ipf)
2661 		ipf->ipf_ptphn = ipfp;
2662 	ipfp[0] = ipf;
2663 }
2664 
2665 /*
2666  * Helper function for ill_forward_set().
2667  */
2668 static void
2669 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2670 {
2671 	ip_stack_t	*ipst = ill->ill_ipst;
2672 
2673 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2674 
2675 	ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2676 	    (enable ? "Enabling" : "Disabling"),
2677 	    (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2678 	mutex_enter(&ill->ill_lock);
2679 	if (enable)
2680 		ill->ill_flags |= ILLF_ROUTER;
2681 	else
2682 		ill->ill_flags &= ~ILLF_ROUTER;
2683 	mutex_exit(&ill->ill_lock);
2684 	if (ill->ill_isv6)
2685 		ill_set_nce_router_flags(ill, enable);
2686 	/* Notify routing socket listeners of this change. */
2687 	if (ill->ill_ipif != NULL)
2688 		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2689 }
2690 
2691 /*
2692  * Set an ill's ILLF_ROUTER flag appropriately.  Send up RTS_IFINFO routing
2693  * socket messages for each interface whose flags we change.
2694  */
2695 int
2696 ill_forward_set(ill_t *ill, boolean_t enable)
2697 {
2698 	ipmp_illgrp_t *illg;
2699 	ip_stack_t *ipst = ill->ill_ipst;
2700 
2701 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2702 
2703 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2704 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2705 		return (0);
2706 
2707 	if (IS_LOOPBACK(ill))
2708 		return (EINVAL);
2709 
2710 	if (enable && ill->ill_allowed_ips_cnt > 0)
2711 		return (EPERM);
2712 
2713 	if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2714 		/*
2715 		 * Update all of the interfaces in the group.
2716 		 */
2717 		illg = ill->ill_grp;
2718 		ill = list_head(&illg->ig_if);
2719 		for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2720 			ill_forward_set_on_ill(ill, enable);
2721 
2722 		/*
2723 		 * Update the IPMP meta-interface.
2724 		 */
2725 		ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2726 		return (0);
2727 	}
2728 
2729 	ill_forward_set_on_ill(ill, enable);
2730 	return (0);
2731 }
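
/*
 * Illustrative caller sketch: per the checks above, a caller toggling
 * forwarding must be prepared for EINVAL (loopback ill) and EPERM (an
 * allowed-ips list is configured on the ill).
 *
 *	int err;
 *
 *	if ((err = ill_forward_set(ill, B_TRUE)) != 0)
 *		return (err);
 */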
2732 
2733 /*
2734  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2735  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2736  * set or cleared.
2737  */
2738 static void
2739 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2740 {
2741 	ipif_t *ipif;
2742 	ncec_t *ncec;
2743 	nce_t *nce;
2744 
2745 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2746 		/*
2747 		 * NOTE: we match across the illgrp because nce's for
2748 		 * addresses on IPMP interfaces have an nce_ill that points to
2749 		 * the bound underlying ill.
2750 		 */
2751 		nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2752 		if (nce != NULL) {
2753 			ncec = nce->nce_common;
2754 			mutex_enter(&ncec->ncec_lock);
2755 			if (enable)
2756 				ncec->ncec_flags |= NCE_F_ISROUTER;
2757 			else
2758 				ncec->ncec_flags &= ~NCE_F_ISROUTER;
2759 			mutex_exit(&ncec->ncec_lock);
2760 			nce_refrele(nce);
2761 		}
2762 	}
2763 }
2764 
2765 /*
2766  * Initializes the context structure and returns the first ill in the list.
2767  * Currently start_list and end_list can have the values:
2768  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
2769  * IP_V4_G_HEAD		Traverse IPV4 list only.
2770  * IP_V6_G_HEAD		Traverse IPV6 list only.
2771  */
2772 
2773 /*
2774  * We don't check for CONDEMNED ills here. Caller must do that if
2775  * necessary under the ill lock.
2776  */
2777 ill_t *
2778 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2779     ip_stack_t *ipst)
2780 {
2781 	ill_if_t *ifp;
2782 	ill_t *ill;
2783 	avl_tree_t *avl_tree;
2784 
2785 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2786 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2787 
2788 	/*
2789 	 * set up the lists to search
2790 	 */
2791 	if (end_list != MAX_G_HEADS) {
2792 		ctx->ctx_current_list = start_list;
2793 		ctx->ctx_last_list = end_list;
2794 	} else {
2795 		ctx->ctx_last_list = MAX_G_HEADS - 1;
2796 		ctx->ctx_current_list = 0;
2797 	}
2798 
2799 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2800 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2801 		if (ifp != (ill_if_t *)
2802 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2803 			avl_tree = &ifp->illif_avl_by_ppa;
2804 			ill = avl_first(avl_tree);
2805 			/*
2806 			 * ill is guaranteed to be non-NULL, or ifp would not
2807 			 * have existed.
2808 			 */
2809 			ASSERT(ill != NULL);
2810 			return (ill);
2811 		}
2812 		ctx->ctx_current_list++;
2813 	}
2814 
2815 	return (NULL);
2816 }
2817 
2818 /*
2819  * returns the next ill in the list. ill_first() must have been called
2820  * before calling ill_next() or bad things will happen.
2821  */
2822 
2823 /*
2824  * We don't check for CONDEMNED ills here. Caller must do that if
2825  * necessary under the ill lock.
2826  */
2827 ill_t *
2828 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2829 {
2830 	ill_if_t *ifp;
2831 	ill_t *ill;
2832 	ip_stack_t	*ipst = lastill->ill_ipst;
2833 
2834 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
2835 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2836 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2837 	    AVL_AFTER)) != NULL) {
2838 		return (ill);
2839 	}
2840 
2841 	/* goto next ill_ifp in the list. */
2842 	ifp = lastill->ill_ifptr->illif_next;
2843 
2844 	/* make sure not at end of circular list */
2845 	while (ifp ==
2846 	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2847 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
2848 			return (NULL);
2849 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2850 	}
2851 
2852 	return (avl_first(&ifp->illif_avl_by_ppa));
2853 }
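
/*
 * Typical walker usage (illustrative sketch): callers hold ill_g_lock
 * across the walk, per the comments above.  Passing MAX_G_HEADS for both
 * list arguments traverses the IPv4 and IPv6 lists.
 *
 *	ill_walk_context_t ctx;
 *	ill_t *ill;
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	for (ill = ill_first(MAX_G_HEADS, MAX_G_HEADS, &ctx, ipst);
 *	    ill != NULL; ill = ill_next(&ctx, ill)) {
 *		... per-ill work ...
 *	}
 *	rw_exit(&ipst->ips_ill_g_lock);
 */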
2854 
2855 /*
2856  * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2857  * The final number (PPA) must not have any leading zeros.  Upon success, a
2858  * pointer to the start of the PPA is returned; otherwise NULL is returned.
2859  */
2860 static char *
2861 ill_get_ppa_ptr(char *name)
2862 {
2863 	int namelen = strlen(name);
2864 	int end_ndx = namelen - 1;
2865 	int ppa_ndx, i;
2866 
2867 	/*
2868 	 * Check that the first character is [a-zA-Z], and that the last
2869 	 * character is [0-9].
2870 	 */
2871 	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2872 		return (NULL);
2873 
2874 	/*
2875 	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2876 	 */
2877 	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2878 		if (!isdigit(name[ppa_ndx - 1]))
2879 			break;
2880 
2881 	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2882 		return (NULL);
2883 
2884 	/*
2885 	 * Check that the intermediate characters are [a-zA-Z0-9._]
2886 	 */
2887 	for (i = 1; i < ppa_ndx; i++) {
2888 		if (!isalpha(name[i]) && !isdigit(name[i]) &&
2889 		    name[i] != '.' && name[i] != '_') {
2890 			return (NULL);
2891 		}
2892 	}
2893 
2894 	return (name + ppa_ndx);
2895 }
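
/*
 * Worked examples for the format above (illustrative):
 *
 *	"e1000g0"	-> pointer to "0"	valid, PPA 0
 *	"bge11"		-> pointer to "11"	valid, PPA 11
 *	"hme01"		-> NULL			leading zero in the PPA
 *	"0net0"		-> NULL			must start with a letter
 *	"lo"		-> NULL			no trailing PPA digits
 */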
2896 
2897 /*
2898  * use avl tree to locate the ill.
2899  */
2900 static ill_t *
2901 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2902 {
2903 	char *ppa_ptr = NULL;
2904 	int len;
2905 	uint_t ppa;
2906 	ill_t *ill = NULL;
2907 	ill_if_t *ifp;
2908 	int list;
2909 
2910 	/*
2911 	 * get ppa ptr
2912 	 */
2913 	if (isv6)
2914 		list = IP_V6_G_HEAD;
2915 	else
2916 		list = IP_V4_G_HEAD;
2917 
2918 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2919 		return (NULL);
2920 	}
2921 
2922 	len = ppa_ptr - name + 1;
2923 
2924 	ppa = stoi(&ppa_ptr);
2925 
2926 	ifp = IP_VX_ILL_G_LIST(list, ipst);
2927 
2928 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2929 		 * The match is done on len - 1 because the name is not
2930 		 * null-terminated; it contains the ppa in addition to the
2931 		 * interface name.
2932 		 * name.
2933 		 */
2934 		if ((ifp->illif_name_len == len) &&
2935 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
2936 			break;
2937 		} else {
2938 			ifp = ifp->illif_next;
2939 		}
2940 	}
2941 
2942 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2943 		/*
2944 		 * Even the interface type does not exist.
2945 		 */
2946 		return (NULL);
2947 	}
2948 
2949 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2950 	if (ill != NULL) {
2951 		mutex_enter(&ill->ill_lock);
2952 		if (ILL_CAN_LOOKUP(ill)) {
2953 			ill_refhold_locked(ill);
2954 			mutex_exit(&ill->ill_lock);
2955 			return (ill);
2956 		}
2957 		mutex_exit(&ill->ill_lock);
2958 	}
2959 	return (NULL);
2960 }
2961 
2962 /*
2963  * comparison function for use with avl.
2964  */
2965 static int
2966 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2967 {
2968 	uint_t ppa;
2969 	uint_t ill_ppa;
2970 
2971 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2972 
2973 	ppa = *((uint_t *)ppa_ptr);
2974 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2975 	/*
2976 	 * We want the ill with the lowest ppa to be on the
2977 	 * top.
2978 	 */
2979 	if (ill_ppa < ppa)
2980 		return (1);
2981 	if (ill_ppa > ppa)
2982 		return (-1);
2983 	return (0);
2984 }
2985 
2986 /*
2987  * remove an interface type from the global list.
2988  */
2989 static void
2990 ill_delete_interface_type(ill_if_t *interface)
2991 {
2992 	ASSERT(interface != NULL);
2993 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2994 
2995 	avl_destroy(&interface->illif_avl_by_ppa);
2996 	if (interface->illif_ppa_arena != NULL)
2997 		vmem_destroy(interface->illif_ppa_arena);
2998 
2999 	remque(interface);
3000 
3001 	mi_free(interface);
3002 }
3003 
3004 /*
3005  * remove ill from the global list.
3006  */
3007 static void
3008 ill_glist_delete(ill_t *ill)
3009 {
3010 	ip_stack_t	*ipst;
3011 	phyint_t	*phyi;
3012 
3013 	if (ill == NULL)
3014 		return;
3015 	ipst = ill->ill_ipst;
3016 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3017 
3018 	/*
3019 	 * If the ill was never inserted into the AVL tree
3020 	 * we skip the if branch.
3021 	 */
3022 	if (ill->ill_ifptr != NULL) {
3023 		/*
3024 		 * remove from AVL tree and free ppa number
3025 		 */
3026 		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3027 
3028 		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3029 			vmem_free(ill->ill_ifptr->illif_ppa_arena,
3030 			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3031 		}
3032 		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3033 			ill_delete_interface_type(ill->ill_ifptr);
3034 		}
3035 
3036 		/*
3037 		 * Indicate ill is no longer in the list.
3038 		 */
3039 		ill->ill_ifptr = NULL;
3040 		ill->ill_name_length = 0;
3041 		ill->ill_name[0] = '\0';
3042 		ill->ill_ppa = UINT_MAX;
3043 	}
3044 
3045 	/* Generate one last event for this ill. */
3046 	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3047 	    ill->ill_name_length);
3048 
3049 	ASSERT(ill->ill_phyint != NULL);
3050 	phyi = ill->ill_phyint;
3051 	ill->ill_phyint = NULL;
3052 
3053 	/*
3054 	 * ill_init always allocates a phyint to store the copy
3055 	 * of flags relevant to the phyint. At that point in time, we could
3056 	 * not assign the name and hence phyint_illv4/v6 could not be
3057 	 * initialized. Later, in ipif_set_values, we assign the name to
3058 	 * the ill, at which point we assign phyint_illv4/v6.
3059 	 * Thus we don't rely on phyint_illv6 always being initialized.
3060 	 */
3061 	if (ill->ill_flags & ILLF_IPV6)
3062 		phyi->phyint_illv6 = NULL;
3063 	else
3064 		phyi->phyint_illv4 = NULL;
3065 
3066 	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3067 		rw_exit(&ipst->ips_ill_g_lock);
3068 		return;
3069 	}
3070 
3071 	/*
3072 	 * There are no ills left on this phyint; pull it out of the phyint
3073 	 * avl trees, and free it.
3074 	 */
3075 	if (phyi->phyint_ifindex > 0) {
3076 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3077 		    phyi);
3078 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3079 		    phyi);
3080 	}
3081 	rw_exit(&ipst->ips_ill_g_lock);
3082 
3083 	phyint_free(phyi);
3084 }
3085 
3086 /*
3087  * allocate a ppa. If the number of plumbed interfaces of this type is
3088  * less than ill_no_arena, do a linear search to find an unused ppa.
3089  * When the number goes beyond ill_no_arena, switch to using an arena.
3090  * Note: a ppa value of zero cannot be allocated from the vmem arena, as
3091  * zero is the return value for an error condition; so values are allocated
3092  * starting at one and then decremented by one to get the ppa.
3093  */
3094 static int
3095 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3096 {
3097 	ill_t *tmp_ill;
3098 	uint_t start, end;
3099 	int ppa;
3100 
3101 	if (ifp->illif_ppa_arena == NULL &&
3102 	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3103 		/*
3104 		 * Create an arena.
3105 		 */
3106 		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3107 		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3108 		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3109 			/* allocate what has already been assigned */
3110 		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3111 		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3112 		    tmp_ill, AVL_AFTER)) {
3113 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3114 			    1,		/* size */
3115 			    1,		/* align/quantum */
3116 			    0,		/* phase */
3117 			    0,		/* nocross */
3118 			    /* minaddr */
3119 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3120 			    /* maxaddr */
3121 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3122 			    VM_NOSLEEP|VM_FIRSTFIT);
3123 			if (ppa == 0) {
3124 				ip1dbg(("ill_alloc_ppa: ppa allocation"
3125 				    " failed while switching"));
3126 				vmem_destroy(ifp->illif_ppa_arena);
3127 				ifp->illif_ppa_arena = NULL;
3128 				break;
3129 			}
3130 		}
3131 	}
3132 
3133 	if (ifp->illif_ppa_arena != NULL) {
3134 		if (ill->ill_ppa == UINT_MAX) {
3135 			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3136 			    1, VM_NOSLEEP|VM_FIRSTFIT);
3137 			if (ppa == 0)
3138 				return (EAGAIN);
3139 			ill->ill_ppa = --ppa;
3140 		} else {
3141 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3142 			    1,		/* size */
3143 			    1,		/* align/quantum */
3144 			    0,		/* phase */
3145 			    0,		/* nocross */
3146 			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3147 			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3148 			    VM_NOSLEEP|VM_FIRSTFIT);
3149 			/*
3150 			 * Most likely the allocation failed because
3151 			 * the requested ppa was in use.
3152 			 */
3153 			if (ppa == 0)
3154 				return (EEXIST);
3155 		}
3156 		return (0);
3157 	}
3158 
3159 	/*
3160 	 * No arena is in use and not enough (>ill_no_arena) interfaces have
3161  * been plumbed to create one. Do a linear search to get an unused ppa.
3162 	 */
3163 	if (ill->ill_ppa == UINT_MAX) {
3164 		end = UINT_MAX - 1;
3165 		start = 0;
3166 	} else {
3167 		end = start = ill->ill_ppa;
3168 	}
3169 
3170 	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3171 	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3172 		if (start++ >= end) {
3173 			if (ill->ill_ppa == UINT_MAX)
3174 				return (EAGAIN);
3175 			else
3176 				return (EEXIST);
3177 		}
3178 		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3179 	}
3180 	ill->ill_ppa = start;
3181 	return (0);
3182 }
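
/*
 * Worked example of the ppa <-> arena mapping above (illustrative): a
 * request for the specific ppa N is satisfied by vmem_xalloc() over the
 * address range [N + 1, N + 2), and the ppa is the returned address minus
 * one.  E.g. a request for ppa 5 asks for [6, 7); on success vmem returns
 * 6, so the ppa is 6 - 1 = 5.  A return of 0, never a valid address here,
 * signals failure (EEXIST: that ppa is already in use).
 */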
3183 
3184 /*
3185  * Insert ill into the list of configured ills. Once this function completes,
3186  * the ill is globally visible and is available through lookups. More
3187  * precisely, this happens after the caller drops the ill_g_lock.
3188  */
3189 static int
3190 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3191 {
3192 	ill_if_t *ill_interface;
3193 	avl_index_t where = 0;
3194 	int error;
3195 	int name_length;
3196 	int index;
3197 	boolean_t check_length = B_FALSE;
3198 	ip_stack_t	*ipst = ill->ill_ipst;
3199 
3200 	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3201 
3202 	name_length = mi_strlen(name) + 1;
3203 
3204 	if (isv6)
3205 		index = IP_V6_G_HEAD;
3206 	else
3207 		index = IP_V4_G_HEAD;
3208 
3209 	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3210 	/*
3211 	 * Search for interface type based on name
3212 	 */
3213 	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3214 		if ((ill_interface->illif_name_len == name_length) &&
3215 		    (strcmp(ill_interface->illif_name, name) == 0)) {
3216 			break;
3217 		}
3218 		ill_interface = ill_interface->illif_next;
3219 	}
3220 
3221 	/*
3222 	 * Interface type not found, create one.
3223 	 */
3224 	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3225 		ill_g_head_t ghead;
3226 
3227 		/*
3228 		 * allocate ill_if_t structure
3229 		 */
3230 		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3231 		if (ill_interface == NULL) {
3232 			return (ENOMEM);
3233 		}
3234 
3235 		(void) strcpy(ill_interface->illif_name, name);
3236 		ill_interface->illif_name_len = name_length;
3237 
3238 		avl_create(&ill_interface->illif_avl_by_ppa,
3239 		    ill_compare_ppa, sizeof (ill_t),
3240 		    offsetof(struct ill_s, ill_avl_byppa));
3241 
3242 		/*
3243 		 * link the structure in the back to maintain order
3244 		 * of configuration for ifconfig output.
3245 		 */
3246 		ghead = ipst->ips_ill_g_heads[index];
3247 		insque(ill_interface, ghead.ill_g_list_tail);
3248 	}
3249 
3250 	if (ill->ill_ppa == UINT_MAX)
3251 		check_length = B_TRUE;
3252 
3253 	error = ill_alloc_ppa(ill_interface, ill);
3254 	if (error != 0) {
3255 		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3256 			ill_delete_interface_type(ill->ill_ifptr);
3257 		return (error);
3258 	}
3259 
3260 	 * When the ppa is chosen by the system, check that there is
3261 	 * enough space to insert the ppa. If a specific ppa was passed in,
3262 	 * this check is not required, as the interface name passed in will
3263 	 * have the right ppa in it.
3264 	 * the right ppa in it.
3265 	 */
3266 	if (check_length) {
3267 		/*
3268 		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3269 		 */
3270 		char buf[sizeof (uint_t) * 3];
3271 
3272 		/*
3273 		 * convert ppa to string to calculate the amount of space
3274 		 * required for it in the name.
3275 		 */
3276 		numtos(ill->ill_ppa, buf);
3277 
3278 		/* Do we have enough space to insert the ppa? */
3279 
3280 		if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3281 			/* Free ppa and interface type struct */
3282 			if (ill_interface->illif_ppa_arena != NULL) {
3283 				vmem_free(ill_interface->illif_ppa_arena,
3284 				    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3285 			}
3286 			if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3287 				ill_delete_interface_type(ill->ill_ifptr);
3288 
3289 			return (EINVAL);
3290 		}
3291 	}
3292 
3293 	(void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3294 	ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3295 
3296 	(void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3297 	    &where);
3298 	ill->ill_ifptr = ill_interface;
3299 	avl_insert(&