xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_output.c (revision b22a70ab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2018 Joyent, Inc.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/dlpi.h>
32 #include <sys/strsun.h>
33 #include <sys/zone.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/atomic.h>
39 
40 #include <sys/systm.h>
41 #include <sys/param.h>
42 #include <sys/kmem.h>
43 #include <sys/sdt.h>
44 #include <sys/socket.h>
45 #include <sys/mac.h>
46 #include <net/if.h>
47 #include <net/if_arp.h>
48 #include <net/route.h>
49 #include <sys/sockio.h>
50 #include <netinet/in.h>
51 #include <net/if_dl.h>
52 
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/nd.h>
57 #include <inet/arp.h>
58 #include <inet/snmpcom.h>
59 #include <inet/kstatcom.h>
60 
61 #include <netinet/igmp_var.h>
62 #include <netinet/ip6.h>
63 #include <netinet/icmp6.h>
64 #include <netinet/sctp.h>
65 
66 #include <inet/ip.h>
67 #include <inet/ip_impl.h>
68 #include <inet/ip6.h>
69 #include <inet/ip6_asp.h>
70 #include <inet/tcp.h>
71 #include <inet/ip_multi.h>
72 #include <inet/ip_if.h>
73 #include <inet/ip_ire.h>
74 #include <inet/ip_ftable.h>
75 #include <inet/ip_rts.h>
76 #include <inet/optcom.h>
77 #include <inet/ip_ndp.h>
78 #include <inet/ip_listutils.h>
79 #include <netinet/igmp.h>
80 #include <netinet/ip_mroute.h>
81 #include <inet/ipp_common.h>
82 
83 #include <net/pfkeyv2.h>
84 #include <inet/sadb.h>
85 #include <inet/ipsec_impl.h>
86 #include <inet/ipdrop.h>
87 #include <inet/ip_netinfo.h>
88 
89 #include <sys/pattr.h>
90 #include <inet/ipclassifier.h>
91 #include <inet/sctp_ip.h>
92 #include <inet/sctp/sctp_impl.h>
93 #include <inet/udp_impl.h>
94 #include <sys/sunddi.h>
95 
96 #include <sys/tsol/label.h>
97 #include <sys/tsol/tnet.h>
98 
99 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
100 
101 #ifdef	DEBUG
102 extern boolean_t skip_sctp_cksum;
103 #endif
104 
105 static int	ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
106 static int	ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
107 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
108 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
109 static void	ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
110 
/*
 * There are two types of output functions for IP, used for different
 * purposes:
 *  - ip_output_simple() is used when sending ICMP errors, TCP resets, etc.,
 *    when there is no context in the form of a conn_t. However, there is an
 *    ip_xmit_attr_t that the callers use to influence interface selection
 *    (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
 *
 *  - conn_ip_output() is used when sending packets with a conn_t and
 *    ip_set_destination has been called to cache information. In that case
 *    various socket options are recorded in the ip_xmit_attr_t and should
 *    be taken into account.
 */
124 
125 /*
126  * The caller *must* have called conn_connect() or ip_attr_connect()
127  * before calling conn_ip_output(). The caller needs to redo that each time
128  * the destination IP address or port changes, as well as each time there is
129  * a change to any socket option that would modify how packets are routed out
130  * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
131  *
132  * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
133  * We assert for that here.
134  */
135 int
conn_ip_output(mblk_t * mp,ip_xmit_attr_t * ixa)136 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
137 {
138 	iaflags_t	ixaflags = ixa->ixa_flags;
139 	ire_t		*ire;
140 	nce_t		*nce;
141 	dce_t		*dce;
142 	ill_t		*ill;
143 	ip_stack_t	*ipst = ixa->ixa_ipst;
144 	int		error;
145 
146 	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */
147 
148 	ASSERT(ixa->ixa_ire != NULL);
149 	/* Note there is no ixa_nce when reject and blackhole routes */
150 	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */
151 
152 #ifdef DEBUG
153 	ASSERT(ixa->ixa_curthread == NULL);
154 	ixa->ixa_curthread = curthread;
155 #endif
156 
157 	/*
158 	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
159 	 * for IGMP/MLD traffic.
160 	 */
161 
162 	ire = ixa->ixa_ire;
163 
164 	/*
165 	 * If the ULP says the (old) IRE resulted in reachability we
166 	 * record this before determine whether to use a new IRE.
167 	 * No locking for performance reasons.
168 	 */
169 	if (ixaflags & IXAF_REACH_CONF)
170 		ire->ire_badcnt = 0;
171 
172 	/*
173 	 * Has routing changed since we cached the results of the lookup?
174 	 *
175 	 * This check captures all of:
176 	 *  - the cached ire being deleted (by means of the special
177 	 *    IRE_GENERATION_CONDEMNED)
178 	 *  - A potentially better ire being added (ire_generation being
179 	 *    increased)
180 	 *  - A deletion of the nexthop ire that was used when we did the
181 	 *    lookup.
182 	 *  - An addition of a potentially better nexthop ire.
183 	 * The last two are handled by walking and increasing the generation
184 	 * number on all dependant IREs in ire_flush_cache().
185 	 *
186 	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
187 	 * since we ensure that each time we set ixa_ire to such an IRE we
188 	 * make sure the ixa_ire_generation does not match (by using
189 	 * IRE_GENERATION_VERIFY).
190 	 */
191 	if (ire->ire_generation != ixa->ixa_ire_generation) {
192 		error = ip_verify_ire(mp, ixa);
193 		if (error != 0) {
194 			ip_drop_output("ipIfStatsOutDiscards - verify ire",
195 			    mp, NULL);
196 			goto drop;
197 		}
198 		ire = ixa->ixa_ire;
199 		ASSERT(ire != NULL);
200 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
201 #ifdef DEBUG
202 			ASSERT(ixa->ixa_curthread == curthread);
203 			ixa->ixa_curthread = NULL;
204 #endif
205 			ire->ire_ob_pkt_count++;
206 			/* ixa_dce might be condemned; use default one */
207 			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
208 			    &ipst->ips_dce_default->dce_ident));
209 		}
210 		/*
211 		 * If the ncec changed then ip_verify_ire already set
212 		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
213 		 * so we can recheck the interface mtu.
214 		 */
215 
216 		/*
217 		 * Note that ire->ire_generation could already have changed.
218 		 * We catch that next time we send a packet.
219 		 */
220 	}
221 
222 	/*
223 	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
224 	 * is single threaded.
225 	 */
226 	ASSERT(ixa->ixa_nce != NULL);
227 	nce = ixa->ixa_nce;
228 	if (nce->nce_is_condemned) {
229 		error = ip_verify_nce(mp, ixa);
230 		/*
231 		 * In case ZEROCOPY capability become not available, we
232 		 * copy the message and free the original one. We might
233 		 * be copying more data than needed but it doesn't hurt
234 		 * since such change rarely happens.
235 		 */
236 		switch (error) {
237 		case 0:
238 			break;
239 		case ENOTSUP: { /* ZEROCOPY */
240 			mblk_t *nmp;
241 
242 			if ((nmp = copymsg(mp)) != NULL) {
243 				freemsg(mp);
244 				mp = nmp;
245 
246 				break;
247 			}
248 		}
249 		/* FALLTHROUGH */
250 		default:
251 			ip_drop_output("ipIfStatsOutDiscards - verify nce",
252 			    mp, NULL);
253 			goto drop;
254 		}
255 		ire = ixa->ixa_ire;
256 		ASSERT(ire != NULL);
257 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
258 #ifdef DEBUG
259 			ASSERT(ixa->ixa_curthread == curthread);
260 			ixa->ixa_curthread = NULL;
261 #endif
262 			ire->ire_ob_pkt_count++;
263 			/* ixa_dce might be condemned; use default one */
264 			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
265 			    ixa, &ipst->ips_dce_default->dce_ident));
266 		}
267 		ASSERT(ixa->ixa_nce != NULL);
268 		nce = ixa->ixa_nce;
269 
270 		/*
271 		 * Note that some other event could already have made
272 		 * the new nce condemned. We catch that next time we
273 		 * try to send a packet.
274 		 */
275 	}
276 	/*
277 	 * If there is no per-destination dce_t then we have a reference to
278 	 * the default dce_t (which merely contains the dce_ipid).
279 	 * The generation check captures both the introduction of a
280 	 * per-destination dce_t (e.g., due to ICMP packet too big) and
281 	 * any change to the per-destination dce (including it becoming
282 	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
283 	 */
284 	dce = ixa->ixa_dce;
285 
286 	/*
287 	 * To avoid a periodic timer to increase the path MTU we
288 	 * look at dce_last_change_time each time we send a packet.
289 	 */
290 	if (dce->dce_flags & DCEF_PMTU) {
291 		int64_t		now = LBOLT_FASTPATH64;
292 
293 		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
294 		    ipst->ips_ip_pathmtu_interval)) {
295 			/*
296 			 * Older than 20 minutes. Drop the path MTU information.
297 			 * Since the path MTU changes as a result of this,
298 			 * twiddle ixa_dce_generation to make us go through the
299 			 * dce verification code in conn_ip_output.
300 			 */
301 			mutex_enter(&dce->dce_lock);
302 			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
303 			dce->dce_last_change_time = TICK_TO_SEC(now);
304 			mutex_exit(&dce->dce_lock);
305 			dce_increment_generation(dce);
306 		}
307 	}
308 
309 	if (dce->dce_generation != ixa->ixa_dce_generation) {
310 		error = ip_verify_dce(mp, ixa);
311 		if (error != 0) {
312 			ip_drop_output("ipIfStatsOutDiscards - verify dce",
313 			    mp, NULL);
314 			goto drop;
315 		}
316 		dce = ixa->ixa_dce;
317 
318 		/*
319 		 * Note that some other event could already have made the
320 		 * new dce's generation number change.
321 		 * We catch that next time we try to send a packet.
322 		 */
323 	}
324 
325 	ill = nce->nce_ill;
326 
327 	/*
328 	 * An initial ixa_fragsize was set in ip_set_destination
329 	 * and we update it if any routing changes above.
330 	 * A change to ill_mtu with ifconfig will increase all dce_generation
331 	 * so that we will detect that with the generation check. Ditto for
332 	 * ill_mc_mtu.
333 	 */
334 
335 	/*
336 	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
337 	 * conn_unspec_src.
338 	 */
339 	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
340 	    ixa->ixa_src_generation != ipst->ips_src_generation) {
341 		/* Check if the IP source is still assigned to the host. */
342 		uint_t gen;
343 
344 		if (!ip_verify_src(mp, ixa, &gen)) {
345 			/* Don't send a packet with a source that isn't ours */
346 			error = EADDRNOTAVAIL;
347 			ip_drop_output("ipIfStatsOutDiscards - invalid src",
348 			    mp, NULL);
349 			goto drop;
350 		}
351 		/* The source is still valid - update the generation number */
352 		ixa->ixa_src_generation = gen;
353 	}
354 
355 	/*
356 	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
357 	 * can only count the use prior to fragmentation. However the MIB
358 	 * counters on the ill will be incremented in post fragmentation.
359 	 */
360 	ire->ire_ob_pkt_count++;
361 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
362 
363 	/*
364 	 * Based on ire_type and ire_flags call one of:
365 	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
366 	 *	ire_send_multirt_v* - if RTF_MULTIRT
367 	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE
368 	 *	ire_send_multicast_v* - for IRE_MULTICAST
369 	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
370 	 *	ire_send_wire_v* - for the rest.
371 	 */
372 #ifdef DEBUG
373 	ASSERT(ixa->ixa_curthread == curthread);
374 	ixa->ixa_curthread = NULL;
375 #endif
376 	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
377 
378 drop:
379 	if (ixaflags & IXAF_IS_IPV4) {
380 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
381 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
382 	} else {
383 		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
384 		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
385 	}
386 	freemsg(mp);
387 #ifdef DEBUG
388 	ASSERT(ixa->ixa_curthread == curthread);
389 	ixa->ixa_curthread = NULL;
390 #endif
391 	return (error);
392 }
393 
394 /*
395  * Handle both IPv4 and IPv6. Sets the generation number
396  * to allow the caller to know when to call us again.
397  * Returns true if the source address in the packet is a valid source.
398  * We handle callers which try to send with a zero address (since we only
399  * get here if UNSPEC_SRC is not set).
400  */
401 boolean_t
ip_verify_src(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp)402 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
403 {
404 	ip_stack_t	*ipst = ixa->ixa_ipst;
405 
406 	/*
407 	 * Need to grab the generation number before we check to
408 	 * avoid a race with a change to the set of local addresses.
409 	 * No lock needed since the thread which updates the set of local
410 	 * addresses use ipif/ill locks and exit those (hence a store memory
411 	 * barrier) before doing the atomic increase of ips_src_generation.
412 	 */
413 	if (generationp != NULL)
414 		*generationp = ipst->ips_src_generation;
415 
416 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
417 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
418 
419 		if (ipha->ipha_src == INADDR_ANY)
420 			return (B_FALSE);
421 
422 		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
423 		    ipst, B_FALSE) != IPVL_BAD);
424 	} else {
425 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
426 		uint_t	scopeid;
427 
428 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
429 			return (B_FALSE);
430 
431 		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
432 			scopeid = ixa->ixa_scopeid;
433 		else
434 			scopeid = 0;
435 
436 		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
437 		    ipst, B_FALSE, scopeid) != IPVL_BAD);
438 	}
439 }
440 
441 /*
442  * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
443  */
444 int
ip_verify_ire(mblk_t * mp,ip_xmit_attr_t * ixa)445 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
446 {
447 	uint_t		gen;
448 	ire_t		*ire;
449 	nce_t		*nce;
450 	int		error;
451 	boolean_t	multirt = B_FALSE;
452 
453 	/*
454 	 * Redo ip_select_route.
455 	 * Need to grab generation number as part of the lookup to
456 	 * avoid race.
457 	 */
458 	error = 0;
459 	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
460 	ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
461 	if (error != 0) {
462 		ire_refrele(ire);
463 		return (error);
464 	}
465 
466 	if (ixa->ixa_ire != NULL)
467 		ire_refrele_notr(ixa->ixa_ire);
468 #ifdef DEBUG
469 	ire_refhold_notr(ire);
470 	ire_refrele(ire);
471 #endif
472 	ixa->ixa_ire = ire;
473 	ixa->ixa_ire_generation = gen;
474 	if (multirt) {
475 		if (ixa->ixa_flags & IXAF_IS_IPV4)
476 			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
477 		else
478 			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
479 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
480 	} else {
481 		ixa->ixa_postfragfn = ire->ire_postfragfn;
482 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
483 	}
484 
485 	/*
486 	 * Don't look for an nce for reject or blackhole.
487 	 * They have ire_generation set to IRE_GENERATION_VERIFY which
488 	 * makes conn_ip_output avoid references to ixa_nce.
489 	 */
490 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
491 		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
492 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
493 		return (0);
494 	}
495 
496 	/* The NCE could now be different */
497 	nce = ire_to_nce_pkt(ire, mp);
498 	if (nce == NULL) {
499 		/*
500 		 * Allocation failure. Make sure we redo ire/nce selection
501 		 * next time we send.
502 		 */
503 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
504 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
505 		return (ENOBUFS);
506 	}
507 	if (nce == ixa->ixa_nce) {
508 		/* No change */
509 		nce_refrele(nce);
510 		return (0);
511 	}
512 
513 	/*
514 	 * Since the path MTU might change as a result of this
515 	 * route change, we twiddle ixa_dce_generation to
516 	 * make conn_ip_output go through the ip_verify_dce code.
517 	 */
518 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
519 
520 	if (ixa->ixa_nce != NULL)
521 		nce_refrele(ixa->ixa_nce);
522 	ixa->ixa_nce = nce;
523 	return (0);
524 }
525 
526 /*
527  * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
528  */
529 static int
ip_verify_nce(mblk_t * mp,ip_xmit_attr_t * ixa)530 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
531 {
532 	ire_t		*ire = ixa->ixa_ire;
533 	nce_t		*nce;
534 	int		error = 0;
535 	ipha_t		*ipha = NULL;
536 	ip6_t		*ip6h = NULL;
537 
538 	if (ire->ire_ipversion == IPV4_VERSION)
539 		ipha = (ipha_t *)mp->b_rptr;
540 	else
541 		ip6h = (ip6_t *)mp->b_rptr;
542 
543 	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
544 	if (nce == NULL) {
545 		/* Try to find a better ire */
546 		return (ip_verify_ire(mp, ixa));
547 	}
548 
549 	/*
550 	 * The hardware offloading capabilities, for example LSO, of the
551 	 * interface might have changed, so do sanity verification here.
552 	 */
553 	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
554 		if (!ip_verify_lso(nce->nce_ill, ixa)) {
555 			ASSERT(ixa->ixa_notify != NULL);
556 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
557 			    IXAN_LSO, 0);
558 			error = ENOTSUP;
559 		}
560 	}
561 
562 	/*
563 	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
564 	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
565 	 * any more, return error so that conn_ip_output() can take care of
566 	 * the ZEROCOPY message properly. It's safe to continue send the
567 	 * message when ZEROCOPY newly become available.
568 	 */
569 	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
570 		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
571 			ASSERT(ixa->ixa_notify != NULL);
572 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
573 			    IXAN_ZCOPY, 0);
574 			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
575 				error = ENOTSUP;
576 		}
577 	}
578 
579 	/*
580 	 * Since the path MTU might change as a result of this
581 	 * change, we twiddle ixa_dce_generation to
582 	 * make conn_ip_output go through the ip_verify_dce code.
583 	 */
584 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
585 
586 	nce_refrele(ixa->ixa_nce);
587 	ixa->ixa_nce = nce;
588 	return (error);
589 }
590 
591 /*
592  * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
593  */
594 static int
ip_verify_dce(mblk_t * mp,ip_xmit_attr_t * ixa)595 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
596 {
597 	dce_t		*dce;
598 	uint_t		gen;
599 	uint_t		pmtu;
600 
601 	dce = dce_lookup_pkt(mp, ixa, &gen);
602 	ASSERT(dce != NULL);
603 
604 	dce_refrele_notr(ixa->ixa_dce);
605 #ifdef DEBUG
606 	dce_refhold_notr(dce);
607 	dce_refrele(dce);
608 #endif
609 	ixa->ixa_dce = dce;
610 	ixa->ixa_dce_generation = gen;
611 
612 	/* Extract the (path) mtu from the dce, ncec_ill etc */
613 	pmtu = ip_get_pmtu(ixa);
614 
615 	/*
616 	 * Tell ULP about PMTU changes - increase or decrease - by returning
617 	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
618 	 * both ixa_pmtu and ixa_fragsize appropriately.
619 	 *
620 	 * If ULP doesn't set that flag then we need to update ixa_fragsize
621 	 * since routing could have changed the ill after after ixa_fragsize
622 	 * was set previously in the conn_ip_output path or in
623 	 * ip_set_destination.
624 	 *
625 	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
626 	 *
627 	 * In the case of a path MTU increase we send the packet after the
628 	 * notify to the ULP.
629 	 */
630 	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
631 		if (ixa->ixa_pmtu != pmtu) {
632 			uint_t oldmtu = ixa->ixa_pmtu;
633 
634 			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
635 			    uint32_t, ixa->ixa_pmtu);
636 			ASSERT(ixa->ixa_notify != NULL);
637 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
638 			    IXAN_PMTU, pmtu);
639 			if (pmtu < oldmtu)
640 				return (EMSGSIZE);
641 		}
642 	} else {
643 		ixa->ixa_fragsize = pmtu;
644 	}
645 	return (0);
646 }
647 
648 /*
649  * Verify LSO usability. Keep the return value simple to indicate whether
650  * the LSO capability has changed. Handle both IPv4 and IPv6.
651  */
652 static boolean_t
ip_verify_lso(ill_t * ill,ip_xmit_attr_t * ixa)653 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
654 {
655 	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
656 	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;
657 
658 	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
659 		/*
660 		 * Not unsable any more.
661 		 */
662 		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
663 		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
664 		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
665 		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
666 		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
667 		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
668 			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
669 
670 			return (B_FALSE);
671 		}
672 
673 		/*
674 		 * Capability has changed, refresh the copy in ixa.
675 		 */
676 		if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 ||
677 		    lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) {
678 			*lsoc = *new_lsoc;
679 
680 			return (B_FALSE);
681 		}
682 	} else { /* Was not usable */
683 		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
684 		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
685 		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
686 		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
687 		    ILL_LSO_TCP_IPV4_USABLE(ill) :
688 		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
689 			*lsoc = *new_lsoc;
690 			ixa->ixa_flags |= IXAF_LSO_CAPAB;
691 
692 			return (B_FALSE);
693 		}
694 	}
695 
696 	return (B_TRUE);
697 }
698 
699 /*
700  * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
701  * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
702  */
703 static boolean_t
ip_verify_zcopy(ill_t * ill,ip_xmit_attr_t * ixa)704 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
705 {
706 	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
707 		/*
708 		 * Not unsable any more.
709 		 */
710 		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
711 		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
712 		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
713 		    !ILL_ZCOPY_USABLE(ill)) {
714 			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
715 
716 			return (B_FALSE);
717 		}
718 	} else { /* Was not usable */
719 		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
720 		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
721 		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
722 		    ILL_ZCOPY_USABLE(ill)) {
723 			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
724 
725 			return (B_FALSE);
726 		}
727 	}
728 
729 	return (B_TRUE);
730 }
731 
732 
733 /*
734  * When there is no conn_t context, this will send a packet.
735  * The caller must *not* have called conn_connect() or ip_attr_connect()
736  * before calling ip_output_simple().
737  * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
738  * Honors IXAF_SET_SOURCE.
739  *
740  * We acquire the ire and after calling ire_sendfn we release
741  * the hold on the ire. Ditto for the nce and dce.
742  *
743  * This assumes that the caller has set the following in ip_xmit_attr_t:
744  *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
745  *	If ixa_ifindex is non-zero it means send out that ill. (If it is
746  *	an upper IPMP ill we load balance across the group; if a lower we send
747  *	on that lower ill without load balancing.)
748  *	IXAF_IS_IPV4 must be set correctly.
749  *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
750  *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
751  *	If neither of those two are set we do an IPsec policy lookup.
752  *
753  * We handle setting things like
754  *	ixa_pktlen
755  *	ixa_ip_hdr_length
756  *	ixa->ixa_protocol
757  *
758  * The caller may set ixa_xmit_hint, which is used for ECMP selection and
759  * transmit ring selecting in GLD.
760  *
761  * The caller must do an ixa_cleanup() to release any IPsec references
762  * after we return.
763  */
764 int
ip_output_simple(mblk_t * mp,ip_xmit_attr_t * ixa)765 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
766 {
767 	ts_label_t	*effective_tsl = NULL;
768 	int		err;
769 
770 	ASSERT(ixa->ixa_ipst != NULL);
771 
772 	if (is_system_labeled()) {
773 		ip_stack_t *ipst = ixa->ixa_ipst;
774 
775 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
776 			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
777 			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
778 			    &effective_tsl);
779 		} else {
780 			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
781 			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
782 			    &effective_tsl);
783 		}
784 		if (err != 0) {
785 			ip2dbg(("tsol_check: label check failed (%d)\n", err));
786 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
787 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
788 			ip_drop_output("tsol_check_label", mp, NULL);
789 			freemsg(mp);
790 			return (err);
791 		}
792 		if (effective_tsl != NULL) {
793 			/* Update the label */
794 			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
795 		}
796 	}
797 
798 	if (ixa->ixa_flags & IXAF_IS_IPV4)
799 		return (ip_output_simple_v4(mp, ixa));
800 	else
801 		return (ip_output_simple_v6(mp, ixa));
802 }
803 
804 int
ip_output_simple_v4(mblk_t * mp,ip_xmit_attr_t * ixa)805 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
806 {
807 	ipha_t		*ipha;
808 	ipaddr_t	firsthop; /* In IP header */
809 	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
810 	ire_t		*ire;
811 	ipaddr_t	setsrc;	/* RTF_SETSRC */
812 	int		error;
813 	ill_t		*ill = NULL;
814 	dce_t		*dce = NULL;
815 	nce_t		*nce;
816 	iaflags_t	ixaflags = ixa->ixa_flags;
817 	ip_stack_t	*ipst = ixa->ixa_ipst;
818 	boolean_t	repeat = B_FALSE;
819 	boolean_t	multirt = B_FALSE;
820 	int64_t		now;
821 
822 	ipha = (ipha_t *)mp->b_rptr;
823 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
824 
825 	/*
826 	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
827 	 * for IGMP/MLD traffic.
828 	 */
829 
830 	/* Caller already set flags */
831 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
832 
833 	ASSERT(ixa->ixa_nce == NULL);
834 
835 	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
836 	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
837 	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
838 	ixa->ixa_protocol = ipha->ipha_protocol;
839 
840 	/*
841 	 * Assumes that source routed packets have already been massaged by
842 	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
843 	 * hop in the source route. The final destination is used for IPsec
844 	 * policy and DCE lookup.
845 	 */
846 	firsthop = ipha->ipha_dst;
847 	dst = ip_get_dst(ipha);
848 
849 repeat_ire:
850 	error = 0;
851 	setsrc = INADDR_ANY;
852 	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
853 	    &setsrc, &error, &multirt);
854 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
855 	if (error != 0) {
856 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
857 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
858 		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
859 		freemsg(mp);
860 		goto done;
861 	}
862 
863 	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
864 		/* ire_ill might be NULL hence need to skip some code */
865 		if (ixaflags & IXAF_SET_SOURCE)
866 			ipha->ipha_src = htonl(INADDR_LOOPBACK);
867 		ixa->ixa_fragsize = IP_MAXPACKET;
868 		ill = NULL;
869 		nce = NULL;
870 		ire->ire_ob_pkt_count++;
871 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
872 		/* No dce yet; use default one */
873 		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
874 		    &ipst->ips_dce_default->dce_ident);
875 		goto done;
876 	}
877 
878 	/* Note that ipha_dst is only used for IRE_MULTICAST */
879 	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
880 	if (nce == NULL) {
881 		/* Allocation failure? */
882 		ip_drop_output("ire_to_nce", mp, ill);
883 		freemsg(mp);
884 		error = ENOBUFS;
885 		goto done;
886 	}
887 	if (nce->nce_is_condemned) {
888 		nce_t *nce1;
889 
890 		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
891 		nce_refrele(nce);
892 		if (nce1 == NULL) {
893 			if (!repeat) {
894 				/* Try finding a better IRE */
895 				repeat = B_TRUE;
896 				ire_refrele(ire);
897 				goto repeat_ire;
898 			}
899 			/* Tried twice - drop packet */
900 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
901 			ip_drop_output("No nce", mp, ill);
902 			freemsg(mp);
903 			error = ENOBUFS;
904 			goto done;
905 		}
906 		nce = nce1;
907 	}
908 
909 	/*
910 	 * For multicast with multirt we have a flag passed back from
911 	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
912 	 * possible multicast address.
913 	 * We also need a flag for multicast since we can't check
914 	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
915 	 */
916 	if (multirt) {
917 		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
918 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
919 	} else {
920 		ixa->ixa_postfragfn = ire->ire_postfragfn;
921 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
922 	}
923 	ASSERT(ixa->ixa_nce == NULL);
924 	ixa->ixa_nce = nce;
925 
926 	/*
927 	 * Check for a dce_t with a path mtu.
928 	 */
929 	dce = dce_lookup_v4(dst, ipst, NULL);
930 	ASSERT(dce != NULL);
931 
932 	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
933 		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
934 	} else if (dce->dce_flags & DCEF_PMTU) {
935 		/*
936 		 * To avoid a periodic timer to increase the path MTU we
937 		 * look at dce_last_change_time each time we send a packet.
938 		 */
939 		now = ddi_get_lbolt64();
940 		if (TICK_TO_SEC(now) - dce->dce_last_change_time >
941 		    ipst->ips_ip_pathmtu_interval) {
942 			/*
943 			 * Older than 20 minutes. Drop the path MTU information.
944 			 */
945 			mutex_enter(&dce->dce_lock);
946 			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
947 			dce->dce_last_change_time = TICK_TO_SEC(now);
948 			mutex_exit(&dce->dce_lock);
949 			dce_increment_generation(dce);
950 			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
951 		} else {
952 			uint_t fragsize;
953 
954 			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
955 			if (fragsize > dce->dce_pmtu)
956 				fragsize = dce->dce_pmtu;
957 			ixa->ixa_fragsize = fragsize;
958 		}
959 	} else {
960 		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
961 	}
962 
963 	/*
964 	 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
965 	 * interface for source address selection.
966 	 */
967 	ill = ire_nexthop_ill(ire);
968 
969 	if (ixaflags & IXAF_SET_SOURCE) {
970 		ipaddr_t	src;
971 
972 		/*
973 		 * We use the final destination to get
974 		 * correct selection for source routed packets
975 		 */
976 
977 		/* If unreachable we have no ill but need some source */
978 		if (ill == NULL) {
979 			src = htonl(INADDR_LOOPBACK);
980 			error = 0;
981 		} else {
982 			error = ip_select_source_v4(ill, setsrc, dst,
983 			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
984 			    &src, NULL, NULL);
985 		}
986 		if (error != 0) {
987 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
988 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
989 			ip_drop_output("ipIfStatsOutDiscards - no source",
990 			    mp, ill);
991 			freemsg(mp);
992 			goto done;
993 		}
994 		ipha->ipha_src = src;
995 	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
996 		/* Check if the IP source is assigned to the host. */
997 		if (!ip_verify_src(mp, ixa, NULL)) {
998 			/* Don't send a packet with a source that isn't ours */
999 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1000 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1001 			ip_drop_output("ipIfStatsOutDiscards - invalid source",
1002 			    mp, ill);
1003 			freemsg(mp);
1004 			error = EADDRNOTAVAIL;
1005 			goto done;
1006 		}
1007 	}
1008 
1009 
1010 	/*
1011 	 * Check against global IPsec policy to set the AH/ESP attributes.
1012 	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1013 	 */
1014 	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1015 		ASSERT(ixa->ixa_ipsec_policy == NULL);
1016 		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1017 		if (mp == NULL) {
1018 			/* MIB and ip_drop_packet already done */
1019 			return (EHOSTUNREACH);	/* IPsec policy failure */
1020 		}
1021 	}
1022 
1023 	if (ill != NULL) {
1024 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1025 	} else {
1026 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1027 	}
1028 
1029 	/*
1030 	 * We update the statistics on the most specific IRE i.e., the first
1031 	 * one we found.
1032 	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1033 	 * can only count the use prior to fragmentation. However the MIB
1034 	 * counters on the ill will be incremented in post fragmentation.
1035 	 */
1036 	ire->ire_ob_pkt_count++;
1037 
1038 	/*
1039 	 * Based on ire_type and ire_flags call one of:
1040 	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1041 	 *	ire_send_multirt_v4 - if RTF_MULTIRT
1042 	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
1043 	 *	ire_send_multicast_v4 - for IRE_MULTICAST
1044 	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
1045 	 *	ire_send_wire_v4 - for the rest.
1046 	 */
1047 	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1048 done:
1049 	ire_refrele(ire);
1050 	if (dce != NULL)
1051 		dce_refrele(dce);
1052 	if (ill != NULL)
1053 		ill_refrele(ill);
1054 	if (ixa->ixa_nce != NULL)
1055 		nce_refrele(ixa->ixa_nce);
1056 	ixa->ixa_nce = NULL;
1057 	return (error);
1058 }
1059 
1060 /*
1061  * ire_sendfn() functions.
1062  * These functions use the following xmit_attr:
1063  *  - ixa_fragsize - read to determine whether or not to fragment
1064  *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1065  *  - ixa_ipsec_*  are used inside IPsec
1066  *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
1067  *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
1068  */
1069 
1070 
1071 /*
1072  * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1073  *
1074  * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1075  */
1076 /* ARGSUSED4 */
1077 int
ire_send_local_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1078 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1079     ip_xmit_attr_t *ixa, uint32_t *identp)
1080 {
1081 	ipha_t		*ipha = (ipha_t *)iph_arg;
1082 	ip_stack_t	*ipst = ixa->ixa_ipst;
1083 	ill_t		*ill = ire->ire_ill;
1084 	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
1085 	uint_t		pktlen = ixa->ixa_pktlen;
1086 
1087 	/*
1088 	 * No fragmentation, no nce, no application of IPsec,
1089 	 * and no ipha_ident assignment.
1090 	 *
1091 	 * Note different order between IP provider and FW_HOOKS than in
1092 	 * send_wire case.
1093 	 */
1094 
1095 	/*
1096 	 * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
1097 	 * send probe, but not the receive probe.
1098 	 */
1099 	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1100 	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1101 	    int, 1);
1102 
1103 	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
1104 		int error = 0;
1105 
1106 		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1107 		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1108 		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1109 		    ipst->ips_ipv4firewall_loopback_out,
1110 		    NULL, ill, ipha, mp, mp, 0, ipst, error);
1111 		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1112 		if (mp == NULL)
1113 			return (error);
1114 
1115 		/*
1116 		 * Even if the destination was changed by the filter we use the
1117 		 * forwarding decision that was made based on the address
1118 		 * in ip_output/ip_set_destination.
1119 		 */
1120 		/* Length could be different */
1121 		ipha = (ipha_t *)mp->b_rptr;
1122 		pktlen = ntohs(ipha->ipha_length);
1123 	}
1124 
1125 	/*
1126 	 * If a callback is enabled then we need to know the
1127 	 * source and destination zoneids for the packet. We already
1128 	 * have those handy.
1129 	 */
1130 	if (ipst->ips_ip4_observe.he_interested) {
1131 		zoneid_t szone, dzone;
1132 		zoneid_t stackzoneid;
1133 
1134 		stackzoneid = netstackid_to_zoneid(
1135 		    ipst->ips_netstack->netstack_stackid);
1136 
1137 		if (stackzoneid == GLOBAL_ZONEID) {
1138 			/* Shared-IP zone */
1139 			dzone = ire->ire_zoneid;
1140 			szone = ixa->ixa_zoneid;
1141 		} else {
1142 			szone = dzone = stackzoneid;
1143 		}
1144 		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1145 	}
1146 
1147 	/* Handle lo0 stats */
1148 	ipst->ips_loopback_packets++;
1149 
1150 	/* Map ixa to ira including IPsec policies */
1151 	ipsec_out_to_in(ixa, ill, &iras);
1152 	iras.ira_pktlen = pktlen;
1153 
1154 	if (!IS_SIMPLE_IPH(ipha)) {
1155 		ip_output_local_options(ipha, ipst);
1156 		iras.ira_flags |= IRAF_IPV4_OPTIONS;
1157 	}
1158 
1159 	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
1160 		int error = 0;
1161 
1162 		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1163 		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1164 		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1165 		    ipst->ips_ipv4firewall_loopback_in,
1166 		    ill, NULL, ipha, mp, mp, 0, ipst, error);
1167 
1168 		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1169 		if (mp == NULL) {
1170 			ira_cleanup(&iras, B_FALSE);
1171 			return (error);
1172 		}
1173 		/*
1174 		 * Even if the destination was changed by the filter we use the
1175 		 * forwarding decision that was made based on the address
1176 		 * in ip_output/ip_set_destination.
1177 		 */
1178 		/* Length could be different */
1179 		ipha = (ipha_t *)mp->b_rptr;
1180 		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1181 	}
1182 
1183 	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1184 	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1185 	    int, 1);
1186 
1187 	ire->ire_ib_pkt_count++;
1188 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1189 	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1190 
1191 	/* Destined to ire_zoneid - use that for fanout */
1192 	iras.ira_zoneid = ire->ire_zoneid;
1193 
1194 	if (is_system_labeled()) {
1195 		iras.ira_flags |= IRAF_SYSTEM_LABELED;
1196 
1197 		/*
1198 		 * This updates ira_cred, ira_tsl and ira_free_flags based
1199 		 * on the label. We don't expect this to ever fail for
1200 		 * loopback packets, so we silently drop the packet should it
1201 		 * fail.
1202 		 */
1203 		if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1204 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1205 			ip_drop_input("tsol_get_pkt_label", mp, ill);
1206 			freemsg(mp);
1207 			return (0);
1208 		}
1209 		ASSERT(iras.ira_tsl != NULL);
1210 
1211 		/* tsol_get_pkt_label sometimes does pullupmsg */
1212 		ipha = (ipha_t *)mp->b_rptr;
1213 	}
1214 
1215 	ip_fanout_v4(mp, ipha, &iras);
1216 
1217 	/* We moved any IPsec refs from ixa to iras */
1218 	ira_cleanup(&iras, B_FALSE);
1219 	return (0);
1220 }
1221 
1222 /*
1223  * ire_sendfn for IRE_BROADCAST
1224  * If the broadcast address is present on multiple ills and ixa_ifindex
1225  * isn't set, then we generate
1226  * a separate datagram (potentially with different source address) for
1227  * those ills. In any case, only one copy is looped back to ip_input_v4.
1228  */
1229 int
ire_send_broadcast_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1230 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1231     ip_xmit_attr_t *ixa, uint32_t *identp)
1232 {
1233 	ipha_t		*ipha = (ipha_t *)iph_arg;
1234 	ip_stack_t	*ipst = ixa->ixa_ipst;
1235 	irb_t		*irb = ire->ire_bucket;
1236 	ire_t		*ire1;
1237 	mblk_t		*mp1;
1238 	ipha_t		*ipha1;
1239 	iaflags_t	ixaflags = ixa->ixa_flags;
1240 	nce_t		*nce1, *nce_orig;
1241 
1242 	/*
1243 	 * Unless ire_send_multirt_v4 already set a ttl, force the
1244 	 * ttl to a smallish value.
1245 	 */
1246 	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
1247 		/*
1248 		 * To avoid broadcast storms, we usually set the TTL to 1 for
1249 		 * broadcasts.  This can
1250 		 * be overridden stack-wide through the ip_broadcast_ttl
1251 		 * ndd tunable, or on a per-connection basis through the
1252 		 * IP_BROADCAST_TTL socket option.
1253 		 *
1254 		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
1255 		 * will force ttl to one after we've set this.
1256 		 */
1257 		if (ixaflags & IXAF_BROADCAST_TTL_SET)
1258 			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
1259 		else
1260 			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
1261 	}
1262 	/*
1263 	 * Make sure we get a loopback copy (after IPsec and frag)
1264 	 * Skip hardware checksum so that loopback copy is checksumed.
1265 	 */
1266 	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1267 
1268 	/* Do we need to potentially generate multiple copies? */
1269 	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
1270 		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1271 
1272 	/*
1273 	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
1274 	 * Note that everything in the bucket has the same destination address.
1275 	 */
1276 	irb_refhold(irb);
1277 	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1278 		/* We do the main IRE after the end of the loop */
1279 		if (ire1 == ire)
1280 			continue;
1281 
1282 		/*
1283 		 * Only IREs for the same IP address should be in the same
1284 		 * bucket.
1285 		 * But could have IRE_HOSTs in the case of CGTP.
1286 		 * If we find any multirt routes we bail out of the loop
1287 		 * and just do the single packet at the end; ip_postfrag_multirt
1288 		 * will duplicate the packet.
1289 		 */
1290 		ASSERT(ire1->ire_addr == ire->ire_addr);
1291 		if (!(ire1->ire_type & IRE_BROADCAST))
1292 			continue;
1293 
1294 		if (IRE_IS_CONDEMNED(ire1))
1295 			continue;
1296 
1297 		if (ixa->ixa_zoneid != ALL_ZONES &&
1298 		    ire->ire_zoneid != ire1->ire_zoneid)
1299 			continue;
1300 
1301 		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
1302 
1303 		if (ire1->ire_flags & RTF_MULTIRT)
1304 			break;
1305 
1306 		/*
1307 		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
1308 		 * ensure that this goes out on the cast_ill.
1309 		 */
1310 		if (IS_UNDER_IPMP(ire1->ire_ill))
1311 			continue;
1312 
1313 		mp1 = copymsg(mp);
1314 		if (mp1 == NULL) {
1315 			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1316 			    ipIfStatsOutDiscards);
1317 			ip_drop_output("ipIfStatsOutDiscards",
1318 			    mp, ire1->ire_ill);
1319 			continue;
1320 		}
1321 
1322 		ipha1 = (ipha_t *)mp1->b_rptr;
1323 		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
1324 			/*
1325 			 * Need to pick a different source address for each
1326 			 * interface. If we have a global IPsec policy and
1327 			 * no per-socket policy then we punt to
1328 			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
1329 			 */
1330 			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
1331 				ip_output_simple_broadcast(ixa, mp1);
1332 				continue;
1333 			}
1334 			/* Pick a new source address for each interface */
1335 			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
1336 			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
1337 			    &ipha1->ipha_src, NULL, NULL) != 0) {
1338 				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1339 				    ipIfStatsOutDiscards);
1340 				ip_drop_output("ipIfStatsOutDiscards - select "
1341 				    "broadcast source", mp1, ire1->ire_ill);
1342 				freemsg(mp1);
1343 				continue;
1344 			}
1345 			/*
1346 			 * Check against global IPsec policy to set the AH/ESP
1347 			 * attributes. IPsec will set IXAF_IPSEC_* and
1348 			 * ixa_ipsec_* as appropriate.
1349 			 */
1350 			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1351 				ASSERT(ixa->ixa_ipsec_policy == NULL);
1352 				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
1353 				    NULL, ixa);
1354 				if (mp1 == NULL) {
1355 					/*
1356 					 * MIB and ip_drop_packet already
1357 					 * done
1358 					 */
1359 					continue;
1360 				}
1361 			}
1362 		}
1363 		/* Make sure we have an NCE on this ill */
1364 		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
1365 		    ire1->ire_type);
1366 		if (nce1 == NULL) {
1367 			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1368 			    ipIfStatsOutDiscards);
1369 			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
1370 			    mp1, ire1->ire_ill);
1371 			freemsg(mp1);
1372 			continue;
1373 		}
1374 		nce_orig = ixa->ixa_nce;
1375 		ixa->ixa_nce = nce1;
1376 
1377 		ire_refhold(ire1);
1378 		/*
1379 		 * Ignore any errors here. We just collect the errno for
1380 		 * the main ire below
1381 		 */
1382 		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
1383 		ire_refrele(ire1);
1384 
1385 		ixa->ixa_nce = nce_orig;
1386 		nce_refrele(nce1);
1387 
1388 		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
1389 	}
1390 	irb_refrele(irb);
1391 	/* Finally, the main one */
1392 
1393 	/*
1394 	 * For IPMP we only send broadcasts on the ipmp_ill.
1395 	 */
1396 	if (IS_UNDER_IPMP(ire->ire_ill)) {
1397 		freemsg(mp);
1398 		return (0);
1399 	}
1400 
1401 	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1402 }
1403 
1404 /*
1405  * Send a packet using a different source address and different
1406  * IPsec policy.
1407  */
1408 static void
ip_output_simple_broadcast(ip_xmit_attr_t * ixa,mblk_t * mp)1409 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1410 {
1411 	ip_xmit_attr_t ixas;
1412 
1413 	bzero(&ixas, sizeof (ixas));
1414 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1415 	ixas.ixa_zoneid = ixa->ixa_zoneid;
1416 	ixas.ixa_ifindex = 0;
1417 	ixas.ixa_ipst = ixa->ixa_ipst;
1418 	ixas.ixa_cred = ixa->ixa_cred;
1419 	ixas.ixa_cpid = ixa->ixa_cpid;
1420 	ixas.ixa_tsl = ixa->ixa_tsl;
1421 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1422 
1423 	(void) ip_output_simple(mp, &ixas);
1424 	ixa_cleanup(&ixas);
1425 }
1426 
1427 
1428 static void
multirt_check_v4(ire_t * ire,ipha_t * ipha,ip_xmit_attr_t * ixa)1429 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1430 {
1431 	ip_stack_t	*ipst = ixa->ixa_ipst;
1432 
1433 	/* Limit the TTL on multirt packets */
1434 	if (ire->ire_type & IRE_MULTICAST) {
1435 		if (ipha->ipha_ttl > 1) {
1436 			ip2dbg(("ire_send_multirt_v4: forcing multicast "
1437 			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
1438 			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
1439 			ipha->ipha_ttl = 1;
1440 		}
1441 		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1442 	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
1443 	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1444 		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1445 		/*
1446 		 * Need to ensure we don't increase the ttl should we go through
1447 		 * ire_send_broadcast or multicast.
1448 		 */
1449 		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1450 	}
1451 }
1452 
1453 /*
1454  * ire_sendfn for IRE_MULTICAST
1455  */
1456 int
ire_send_multicast_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1457 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1458     ip_xmit_attr_t *ixa, uint32_t *identp)
1459 {
1460 	ipha_t		*ipha = (ipha_t *)iph_arg;
1461 	ip_stack_t	*ipst = ixa->ixa_ipst;
1462 	ill_t		*ill = ire->ire_ill;
1463 	iaflags_t	ixaflags = ixa->ixa_flags;
1464 
1465 	/*
1466 	 * The IRE_MULTICAST is the same whether or not multirt is in use.
1467 	 * Hence we need special-case code.
1468 	 */
1469 	if (ixaflags & IXAF_MULTIRT_MULTICAST)
1470 		multirt_check_v4(ire, ipha, ixa);
1471 
1472 	/*
1473 	 * Check if anything in ip_input_v4 wants a copy of the transmitted
1474 	 * packet (after IPsec and fragmentation)
1475 	 *
1476 	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
1477 	 *    RSVP and the rsvp daemon is an example of a
1478 	 *    protocol and user level process that
1479 	 *    handles it's own routing. Hence, it uses the
1480 	 *    SO_DONTROUTE option to accomplish this.
1481 	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
1482 	 *    check whether there are any receivers for the group on the ill
1483 	 *    (ignoring the zoneid).
1484 	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
1485 	 *    any members in other shared-IP zones.
1486 	 *    If such members exist, then we indicate that the sending zone
1487 	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
1488 	 *    behavior.
1489 	 *
1490 	 * When we loopback we skip hardware checksum to make sure loopback
1491 	 * copy is checksumed.
1492 	 *
1493 	 * Note that ire_ill is the upper in the case of IPMP.
1494 	 */
1495 	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
1496 	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
1497 	    !(ixaflags & IXAF_DONTROUTE)) {
1498 		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1499 	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
1500 		/*
1501 		 * If this zone or any other zone has members then loopback
1502 		 * a copy.
1503 		 */
1504 		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
1505 			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1506 	} else if (ipst->ips_netstack->netstack_numzones > 1) {
1507 		/*
1508 		 * This zone should not have a copy. But there are some other
1509 		 * zones which might have members.
1510 		 */
1511 		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1512 		    ixa->ixa_zoneid)) {
1513 			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
1514 			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
1515 			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1516 		}
1517 	}
1518 
1519 	/*
1520 	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
1521 	 * force the ttl to the IP_MULTICAST_TTL value
1522 	 */
1523 	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
1524 		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
1525 	}
1526 
1527 	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1528 }
1529 
1530 /*
1531  * ire_sendfn for IREs with RTF_MULTIRT
1532  */
1533 int
ire_send_multirt_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1534 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1535     ip_xmit_attr_t *ixa, uint32_t *identp)
1536 {
1537 	ipha_t		*ipha = (ipha_t *)iph_arg;
1538 
1539 	multirt_check_v4(ire, ipha, ixa);
1540 
1541 	if (ire->ire_type & IRE_MULTICAST)
1542 		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1543 	else if (ire->ire_type & IRE_BROADCAST)
1544 		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1545 	else
1546 		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1547 }
1548 
1549 /*
1550  * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1551  */
1552 int
ire_send_noroute_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1553 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1554     ip_xmit_attr_t *ixa, uint32_t *identp)
1555 {
1556 	ip_stack_t	*ipst = ixa->ixa_ipst;
1557 	ipha_t		*ipha = (ipha_t *)iph_arg;
1558 	ill_t		*ill;
1559 	ip_recv_attr_t	iras;
1560 	boolean_t	dummy;
1561 
1562 	/* We assign an IP ident for nice errors */
1563 	ipha->ipha_ident = atomic_inc_32_nv(identp);
1564 
1565 	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1566 
1567 	if (ire->ire_type & IRE_NOROUTE) {
1568 		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1569 		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1570 		    RTA_DST, ipst);
1571 	}
1572 
1573 	if (ire->ire_flags & RTF_BLACKHOLE) {
1574 		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1575 		freemsg(mp);
1576 		/* No error even for local senders - silent blackhole */
1577 		return (0);
1578 	}
1579 	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1580 
1581 	/*
1582 	 * We need an ill_t for the ip_recv_attr_t even though this packet
1583 	 * was never received and icmp_unreachable doesn't currently use
1584 	 * ira_ill.
1585 	 */
1586 	ill = ill_lookup_on_name("lo0", B_FALSE,
1587 	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1588 	if (ill == NULL) {
1589 		freemsg(mp);
1590 		return (EHOSTUNREACH);
1591 	}
1592 
1593 	bzero(&iras, sizeof (iras));
1594 	/* Map ixa to ira including IPsec policies */
1595 	ipsec_out_to_in(ixa, ill, &iras);
1596 
1597 	if (ip_source_routed(ipha, ipst)) {
1598 		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1599 	} else {
1600 		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1601 	}
1602 	/* We moved any IPsec refs from ixa to iras */
1603 	ira_cleanup(&iras, B_FALSE);
1604 	ill_refrele(ill);
1605 	return (EHOSTUNREACH);
1606 }
1607 
1608 /*
1609  * Calculate a checksum ignoring any hardware capabilities
1610  *
1611  * Returns B_FALSE if the packet was too short for the checksum. Caller
1612  * should free and do stats.
1613  */
1614 static boolean_t
ip_output_sw_cksum_v4(mblk_t * mp,ipha_t * ipha,ip_xmit_attr_t * ixa)1615 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
1616 {
1617 	ip_stack_t	*ipst = ixa->ixa_ipst;
1618 	uint_t		pktlen = ixa->ixa_pktlen;
1619 	uint16_t	*cksump;
1620 	uint32_t	cksum;
1621 	uint8_t		protocol = ixa->ixa_protocol;
1622 	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1623 	ipaddr_t	dst = ipha->ipha_dst;
1624 	ipaddr_t	src = ipha->ipha_src;
1625 
1626 	/* Just in case it contained garbage */
1627 	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1628 
1629 	/*
1630 	 * Calculate ULP checksum
1631 	 */
1632 	if (protocol == IPPROTO_TCP) {
1633 		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1634 		cksum = IP_TCP_CSUM_COMP;
1635 	} else if (protocol == IPPROTO_UDP) {
1636 		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1637 		cksum = IP_UDP_CSUM_COMP;
1638 	} else if (protocol == IPPROTO_SCTP) {
1639 		sctp_hdr_t	*sctph;
1640 
1641 		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1642 		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1643 		/*
1644 		 * Zero out the checksum field to ensure proper
1645 		 * checksum calculation.
1646 		 */
1647 		sctph->sh_chksum = 0;
1648 #ifdef	DEBUG
1649 		if (!skip_sctp_cksum)
1650 #endif
1651 			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1652 		goto ip_hdr_cksum;
1653 	} else {
1654 		goto ip_hdr_cksum;
1655 	}
1656 
1657 	/* ULP puts the checksum field is in the first mblk */
1658 	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1659 
1660 	/*
1661 	 * We accumulate the pseudo header checksum in cksum.
1662 	 * This is pretty hairy code, so watch close.  One
1663 	 * thing to keep in mind is that UDP and TCP have
1664 	 * stored their respective datagram lengths in their
1665 	 * checksum fields.  This lines things up real nice.
1666 	 */
1667 	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
1668 
1669 	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
1670 	/*
1671 	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
1672 	 * Change to 0xffff
1673 	 */
1674 	if (protocol == IPPROTO_UDP && cksum == 0)
1675 		*cksump = ~cksum;
1676 	else
1677 		*cksump = cksum;
1678 
1679 	IP_STAT(ipst, ip_out_sw_cksum);
1680 	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
1681 
1682 ip_hdr_cksum:
1683 	/* Calculate IPv4 header checksum */
1684 	ipha->ipha_hdr_checksum = 0;
1685 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1686 	return (B_TRUE);
1687 }
1688 
1689 /*
1690  * Calculate the ULP checksum - try to use hardware.
1691  * In the case of MULTIRT, broadcast or multicast the
1692  * IXAF_NO_HW_CKSUM is set in which case we use software.
1693  *
1694  * If the hardware supports IP header checksum offload; then clear the
1695  * contents of IP header checksum field as expected by NIC.
1696  * Do this only if we offloaded either full or partial sum.
1697  *
1698  * Returns B_FALSE if the packet was too short for the checksum. Caller
1699  * should free and do stats.
1700  */
1701 static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags,mblk_t * mp,ipha_t * ipha,ip_xmit_attr_t * ixa,ill_t * ill)1702 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
1703     ip_xmit_attr_t *ixa, ill_t *ill)
1704 {
1705 	uint_t		pktlen = ixa->ixa_pktlen;
1706 	uint16_t	*cksump;
1707 	uint16_t	hck_flags;
1708 	uint32_t	cksum;
1709 	uint8_t		protocol = ixa->ixa_protocol;
1710 	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1711 
1712 	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
1713 	    !dohwcksum) {
1714 		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1715 	}
1716 
1717 	/*
1718 	 * Calculate ULP checksum. Note that we don't use cksump and cksum
1719 	 * if the ill has FULL support.
1720 	 */
1721 	if (protocol == IPPROTO_TCP) {
1722 		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1723 		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
1724 	} else if (protocol == IPPROTO_UDP) {
1725 		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1726 		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
1727 	} else if (protocol == IPPROTO_SCTP) {
1728 		sctp_hdr_t	*sctph;
1729 
1730 		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1731 		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1732 		/*
1733 		 * Zero out the checksum field to ensure proper
1734 		 * checksum calculation.
1735 		 */
1736 		sctph->sh_chksum = 0;
1737 #ifdef	DEBUG
1738 		if (!skip_sctp_cksum)
1739 #endif
1740 			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1741 		goto ip_hdr_cksum;
1742 	} else if (protocol == IPPROTO_ICMP) {
1743 		/*
1744 		 * Note that we always calculate a SW checksum for ICMP. In the
1745 		 * future, if HW support for ICMP is advertised, we can change
1746 		 * this.
1747 		 */
1748 		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1749 	} else {
1750 	ip_hdr_cksum:
1751 		/* Calculate IPv4 header checksum */
1752 		ipha->ipha_hdr_checksum = 0;
1753 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1754 		return (B_TRUE);
1755 	}
1756 
1757 	/* ULP puts the checksum field is in the first mblk */
1758 	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1759 
1760 	/*
1761 	 * Underlying interface supports hardware checksum offload for
1762 	 * the payload; leave the payload checksum for the hardware to
1763 	 * calculate.  N.B: We only need to set up checksum info on the
1764 	 * first mblk.
1765 	 */
1766 	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1767 
1768 	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1769 	if (hck_flags & HCKSUM_INET_FULL_V4) {
1770 		/*
1771 		 * Hardware calculates pseudo-header, header and the
1772 		 * payload checksums, so clear the checksum field in
1773 		 * the protocol header.
1774 		 */
1775 		*cksump = 0;
1776 		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
1777 
1778 		ipha->ipha_hdr_checksum = 0;
1779 		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1780 			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1781 		} else {
1782 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1783 		}
1784 		return (B_TRUE);
1785 	}
1786 	if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
1787 		ipaddr_t	dst = ipha->ipha_dst;
1788 		ipaddr_t	src = ipha->ipha_src;
1789 		/*
1790 		 * Partial checksum offload has been enabled.  Fill
1791 		 * the checksum field in the protocol header with the
1792 		 * pseudo-header checksum value.
1793 		 *
1794 		 * We accumulate the pseudo header checksum in cksum.
1795 		 * This is pretty hairy code, so watch close.  One
1796 		 * thing to keep in mind is that UDP and TCP have
1797 		 * stored their respective datagram lengths in their
1798 		 * checksum fields.  This lines things up real nice.
1799 		 */
1800 		cksum += (dst >> 16) + (dst & 0xFFFF) +
1801 		    (src >> 16) + (src & 0xFFFF);
1802 		cksum += *(cksump);
1803 		cksum = (cksum & 0xFFFF) + (cksum >> 16);
1804 		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
1805 
1806 		/*
1807 		 * Offsets are relative to beginning of IP header.
1808 		 */
1809 		DB_CKSUMSTART(mp) = ip_hdr_length;
1810 		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
1811 		DB_CKSUMEND(mp) = pktlen;
1812 		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
1813 
1814 		ipha->ipha_hdr_checksum = 0;
1815 		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1816 			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1817 		} else {
1818 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1819 		}
1820 		return (B_TRUE);
1821 	}
1822 	/* Hardware capabilities include neither full nor partial IPv4 */
1823 	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1824 }
1825 
1826 /*
1827  * ire_sendfn for offlink and onlink destinations.
1828  * Also called from the multicast, broadcast, multirt send functions.
1829  *
1830  * Assumes that the caller has a hold on the ire.
1831  *
1832  * This function doesn't care if the IRE just became condemned since that
1833  * can happen at any time.
1834  */
1835 /* ARGSUSED */
1836 int
ire_send_wire_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1837 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1838     ip_xmit_attr_t *ixa, uint32_t *identp)
1839 {
1840 	ip_stack_t	*ipst = ixa->ixa_ipst;
1841 	ipha_t		*ipha = (ipha_t *)iph_arg;
1842 	iaflags_t	ixaflags = ixa->ixa_flags;
1843 	ill_t		*ill;
1844 
1845 	ASSERT(ixa->ixa_nce != NULL);
1846 	ill = ixa->ixa_nce->nce_ill;
1847 
1848 	if (ixaflags & IXAF_DONTROUTE)
1849 		ipha->ipha_ttl = 1;
1850 
1851 	/*
1852 	 * Assign an ident value for this packet. There could be other
1853 	 * threads targeting the same destination, so we have to arrange
1854 	 * for a atomic increment.  Note that we use a 32-bit atomic add
1855 	 * because it has better performance than its 16-bit sibling.
1856 	 *
1857 	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
1858 	 * be the number of TCP segments  that the driver/hardware will
1859 	 * extraly construct.
1860 	 *
1861 	 * If running in cluster mode and if the source address
1862 	 * belongs to a replicated service then vector through
1863 	 * cl_inet_ipident vector to allocate ip identifier
1864 	 * NOTE: This is a contract private interface with the
1865 	 * clustering group.
1866 	 */
1867 	if (cl_inet_ipident != NULL) {
1868 		ipaddr_t src = ipha->ipha_src;
1869 		ipaddr_t dst = ipha->ipha_dst;
1870 		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1871 
1872 		ASSERT(cl_inet_isclusterwide != NULL);
1873 		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1874 		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1875 			/*
1876 			 * Note: not correct with LSO since we can't allocate
1877 			 * ixa_extra_ident+1 consecutive values.
1878 			 */
1879 			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1880 			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1881 			    (uint8_t *)(uintptr_t)dst, NULL);
1882 		} else {
1883 			ipha->ipha_ident = atomic_add_32_nv(identp,
1884 			    ixa->ixa_extra_ident + 1);
1885 		}
1886 	} else {
1887 		ipha->ipha_ident = atomic_add_32_nv(identp,
1888 		    ixa->ixa_extra_ident + 1);
1889 	}
1890 #ifndef _BIG_ENDIAN
1891 	ipha->ipha_ident = htons(ipha->ipha_ident);
1892 #endif
1893 
1894 	/*
1895 	 * This might set b_band, thus the IPsec and fragmentation
1896 	 * code in IP ensures that b_band is updated in the first mblk.
1897 	 */
1898 	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1899 		/* ip_process translates an IS_UNDER_IPMP */
1900 		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1901 		if (mp == NULL) {
1902 			/* ip_drop_packet and MIB done */
1903 			return (0);	/* Might just be delayed */
1904 		}
1905 	}
1906 
1907 	/*
1908 	 * Verify any IPv4 options.
1909 	 *
1910 	 * The presence of IP options also forces the network stack to
1911 	 * calculate the checksum in software.  This is because:
1912 	 *
1913 	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1914 	 * the size of "start offset" width to 6-bit.  This effectively
1915 	 * sets the largest value of the offset to 64-bytes, starting
1916 	 * from the MAC header.  When the cumulative MAC and IP headers
1917 	 * exceed such limit, the offset will wrap around.  This causes
1918 	 * the checksum to be calculated at the wrong place.
1919 	 *
1920 	 * IPv4 source routing: none of the full-checksum capable NICs
1921 	 * is capable of correctly handling the	IPv4 source-routing
1922 	 * option for purposes of calculating the pseudo-header; the
1923 	 * actual destination is different from the destination in the
1924 	 * header which is that of the next-hop.  (This case may not be
1925 	 * true for NICs which can parse IPv6 extension headers, but
1926 	 * we choose to simplify the implementation by not offloading
1927 	 * checksum when they are present.)
1928 	 */
1929 	if (!IS_SIMPLE_IPH(ipha)) {
1930 		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1931 		/* An IS_UNDER_IPMP ill is ok here */
1932 		if (ip_output_options(mp, ipha, ixa, ill)) {
1933 			/* Packet has been consumed and ICMP error sent */
1934 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1935 			return (EINVAL);
1936 		}
1937 	}
1938 
1939 	/*
1940 	 * To handle IPsec/iptun's labeling needs we need to tag packets
1941 	 * while we still have ixa_tsl
1942 	 */
1943 	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1944 	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1945 	    ill->ill_mactype == DL_IPV6)) {
1946 		cred_t *newcr;
1947 
1948 		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1949 		    KM_NOSLEEP);
1950 		if (newcr == NULL) {
1951 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1952 			ip_drop_output("ipIfStatsOutDiscards - newcr",
1953 			    mp, ill);
1954 			freemsg(mp);
1955 			return (ENOBUFS);
1956 		}
1957 		mblk_setcred(mp, newcr, NOPID);
1958 		crfree(newcr);	/* mblk_setcred did its own crhold */
1959 	}
1960 
1961 	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1962 	    (ixaflags & IXAF_IPSEC_SECURE)) {
1963 		uint32_t pktlen;
1964 
1965 		pktlen = ixa->ixa_pktlen;
1966 		if (ixaflags & IXAF_IPSEC_SECURE)
1967 			pktlen += ipsec_out_extra_length(ixa);
1968 
1969 		if (pktlen > IP_MAXPACKET)
1970 			return (EMSGSIZE);
1971 
1972 		if (ixaflags & IXAF_SET_ULP_CKSUM) {
1973 			/*
1974 			 * Compute ULP checksum and IP header checksum
1975 			 * using software
1976 			 */
1977 			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1978 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1979 				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1980 				freemsg(mp);
1981 				return (EINVAL);
1982 			}
1983 		} else {
1984 			/* Calculate IPv4 header checksum */
1985 			ipha->ipha_hdr_checksum = 0;
1986 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1987 		}
1988 
1989 		/*
1990 		 * If this packet would generate a icmp_frag_needed
1991 		 * message, we need to handle it before we do the IPsec
1992 		 * processing. Otherwise, we need to strip the IPsec
1993 		 * headers before we send up the message to the ULPs
1994 		 * which becomes messy and difficult.
1995 		 *
1996 		 * We check using IXAF_DONTFRAG. The DF bit in the header
1997 		 * is not inspected - it will be copied to any generated
1998 		 * fragments.
1999 		 */
2000 		if ((pktlen > ixa->ixa_fragsize) &&
2001 		    (ixaflags & IXAF_DONTFRAG)) {
2002 			/* Generate ICMP and return error */
2003 			ip_recv_attr_t	iras;
2004 
2005 			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
2006 			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2007 			    uint_t, ixa->ixa_pmtu);
2008 
2009 			bzero(&iras, sizeof (iras));
2010 			/* Map ixa to ira including IPsec policies */
2011 			ipsec_out_to_in(ixa, ill, &iras);
2012 
2013 			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2014 			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2015 			/* We moved any IPsec refs from ixa to iras */
2016 			ira_cleanup(&iras, B_FALSE);
2017 			return (EMSGSIZE);
2018 		}
2019 		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2020 		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2021 		    uint_t, ixa->ixa_pmtu);
2022 
2023 		if (ixaflags & IXAF_IPSEC_SECURE) {
2024 			/*
2025 			 * Pass in sufficient information so that
2026 			 * IPsec can determine whether to fragment, and
2027 			 * which function to call after fragmentation.
2028 			 */
2029 			return (ipsec_out_process(mp, ixa));
2030 		}
2031 		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2032 		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2033 		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2034 		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
2035 	}
2036 	if (ixaflags & IXAF_SET_ULP_CKSUM) {
2037 		/* Compute ULP checksum and IP header checksum */
2038 		/* An IS_UNDER_IPMP ill is ok here */
2039 		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2040 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2041 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2042 			freemsg(mp);
2043 			return (EINVAL);
2044 		}
2045 	} else {
2046 		/* Calculate IPv4 header checksum */
2047 		ipha->ipha_hdr_checksum = 0;
2048 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2049 	}
2050 	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2051 	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2052 	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2053 }
2054 
2055 /*
2056  * Send mp into ip_input
2057  * Common for IPv4 and IPv6
2058  */
2059 void
ip_postfrag_loopback(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,zoneid_t nolzid)2060 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2061     uint_t pkt_len, zoneid_t nolzid)
2062 {
2063 	rtc_t		rtc;
2064 	ill_t		*ill = nce->nce_ill;
2065 	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
2066 	ncec_t		*ncec;
2067 
2068 	ncec = nce->nce_common;
2069 	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2070 	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2071 	if (ncec->ncec_flags & NCE_F_BCAST)
2072 		iras.ira_flags |= IRAF_L2DST_BROADCAST;
2073 	else if (ncec->ncec_flags & NCE_F_MCAST)
2074 		iras.ira_flags |= IRAF_L2DST_MULTICAST;
2075 
2076 	iras.ira_free_flags = 0;
2077 	iras.ira_cred = NULL;
2078 	iras.ira_cpid = NOPID;
2079 	iras.ira_tsl = NULL;
2080 	iras.ira_zoneid = ALL_ZONES;
2081 	iras.ira_pktlen = pkt_len;
2082 	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2083 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2084 
2085 	if (ixaflags & IXAF_IS_IPV4)
2086 		iras.ira_flags |= IRAF_IS_IPV4;
2087 
2088 	iras.ira_ill = iras.ira_rill = ill;
2089 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2090 	iras.ira_rifindex = iras.ira_ruifindex;
2091 	iras.ira_mhip = NULL;
2092 
2093 	iras.ira_flags |= ixaflags & IAF_MASK;
2094 	iras.ira_no_loop_zoneid = nolzid;
2095 
2096 	/* Broadcast and multicast doesn't care about the squeue */
2097 	iras.ira_sqp = NULL;
2098 
2099 	rtc.rtc_ire = NULL;
2100 	if (ixaflags & IXAF_IS_IPV4) {
2101 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2102 
2103 		rtc.rtc_ipaddr = INADDR_ANY;
2104 
2105 		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2106 		if (rtc.rtc_ire != NULL) {
2107 			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2108 			ire_refrele(rtc.rtc_ire);
2109 		}
2110 	} else {
2111 		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
2112 
2113 		rtc.rtc_ip6addr = ipv6_all_zeros;
2114 
2115 		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2116 		if (rtc.rtc_ire != NULL) {
2117 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2118 			ire_refrele(rtc.rtc_ire);
2119 		}
2120 	}
2121 	/* Any references to clean up? No hold on ira */
2122 	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2123 		ira_cleanup(&iras, B_FALSE);
2124 }
2125 
2126 /*
2127  * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2128  * looks at the IXAF_LOOPBACK_COPY flag.
2129  * Common for IPv4 and IPv6.
2130  *
2131  * If the loopback copy fails (due to no memory) but we send the packet out
2132  * on the wire we return no failure. Only in the case we supress the wire
2133  * sending do we take the loopback failure into account.
2134  *
2135  * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2136  * Those operations are performed on this packet in ip_xmit() and it would
2137  * be odd to do it twice for the same packet.
2138  */
2139 int
ip_postfrag_loopcheck(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,uint32_t xmit_hint,zoneid_t szone,zoneid_t nolzid,uintptr_t * ixacookie)2140 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2141     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2142     uintptr_t *ixacookie)
2143 {
2144 	ill_t		*ill = nce->nce_ill;
2145 	int		error = 0;
2146 
2147 	/*
2148 	 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2149 	 * had looped it back
2150 	 */
2151 	if (ixaflags & IXAF_LOOPBACK_COPY) {
2152 		mblk_t		*mp1;
2153 
2154 		mp1 = copymsg(mp);
2155 		if (mp1 == NULL) {
2156 			/* Failed to deliver the loopback copy. */
2157 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2158 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2159 			error = ENOBUFS;
2160 		} else {
2161 			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2162 			    nolzid);
2163 		}
2164 	}
2165 
2166 	/*
2167 	 * If TTL = 0 then only do the loopback to this host i.e. we are
2168 	 * done. We are also done if this was the
2169 	 * loopback interface since it is sufficient
2170 	 * to loopback one copy of a multicast packet.
2171 	 */
2172 	if (ixaflags & IXAF_IS_IPV4) {
2173 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
2174 
2175 		if (ipha->ipha_ttl == 0) {
2176 			ip_drop_output("multicast ipha_ttl not sent to wire",
2177 			    mp, ill);
2178 			freemsg(mp);
2179 			return (error);
2180 		}
2181 	} else {
2182 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2183 
2184 		if (ip6h->ip6_hops == 0) {
2185 			ip_drop_output("multicast ipha_ttl not sent to wire",
2186 			    mp, ill);
2187 			freemsg(mp);
2188 			return (error);
2189 		}
2190 	}
2191 	if (nce->nce_ill->ill_wq == NULL) {
2192 		/* Loopback interface */
2193 		ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2194 		freemsg(mp);
2195 		return (error);
2196 	}
2197 
2198 	return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2199 	    ixacookie));
2200 }
2201 
2202 /*
2203  * Post fragmentation function for RTF_MULTIRT routes.
2204  * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2205  * checks IXAF_LOOPBACK_COPY.
2206  *
2207  * If no packet is sent due to failures then we return an errno, but if at
2208  * least one succeeded we return zero.
2209  */
2210 int
ip_postfrag_multirt_v4(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,uint32_t xmit_hint,zoneid_t szone,zoneid_t nolzid,uintptr_t * ixacookie)2211 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2212     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2213     uintptr_t *ixacookie)
2214 {
2215 	irb_t		*irb;
2216 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2217 	ire_t		*ire;
2218 	ire_t		*ire1;
2219 	mblk_t		*mp1;
2220 	nce_t		*nce1;
2221 	ill_t		*ill = nce->nce_ill;
2222 	ill_t		*ill1;
2223 	ip_stack_t	*ipst = ill->ill_ipst;
2224 	int		error = 0;
2225 	int		num_sent = 0;
2226 	int		err;
2227 	uint_t		ire_type;
2228 	ipaddr_t	nexthop;
2229 
2230 	ASSERT(ixaflags & IXAF_IS_IPV4);
2231 
2232 	/* Check for IXAF_LOOPBACK_COPY */
2233 	if (ixaflags & IXAF_LOOPBACK_COPY) {
2234 		mblk_t *mp1;
2235 
2236 		mp1 = copymsg(mp);
2237 		if (mp1 == NULL) {
2238 			/* Failed to deliver the loopback copy. */
2239 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2240 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2241 			error = ENOBUFS;
2242 		} else {
2243 			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2244 			    nolzid);
2245 		}
2246 	}
2247 
2248 	/*
2249 	 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2250 	 * a copy to each one.
2251 	 * Use the nce (nexthop) and ipha_dst to find the ire.
2252 	 *
2253 	 * MULTIRT is not designed to work with shared-IP zones thus we don't
2254 	 * need to pass a zoneid or a label to the IRE lookup.
2255 	 */
2256 	if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2257 		/* Broadcast and multicast case */
2258 		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2259 		    NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2260 	} else {
2261 		ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2262 
2263 		/* Unicast case */
2264 		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2265 		    NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2266 	}
2267 
2268 	if (ire == NULL ||
2269 	    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2270 	    !(ire->ire_flags & RTF_MULTIRT)) {
2271 		/* Drop */
2272 		ip_drop_output("ip_postfrag_multirt didn't find route",
2273 		    mp, nce->nce_ill);
2274 		if (ire != NULL)
2275 			ire_refrele(ire);
2276 		return (ENETUNREACH);
2277 	}
2278 
2279 	irb = ire->ire_bucket;
2280 	irb_refhold(irb);
2281 	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2282 		/*
2283 		 * For broadcast we can have a mixture of IRE_BROADCAST and
2284 		 * IRE_HOST due to the manually added IRE_HOSTs that are used
2285 		 * to trigger the creation of the special CGTP broadcast routes.
2286 		 * Thus we have to skip if ire_type doesn't match the original.
2287 		 */
2288 		if (IRE_IS_CONDEMNED(ire1) ||
2289 		    !(ire1->ire_flags & RTF_MULTIRT) ||
2290 		    ire1->ire_type != ire->ire_type)
2291 			continue;
2292 
2293 		/* Do the ire argument one after the loop */
2294 		if (ire1 == ire)
2295 			continue;
2296 
2297 		ill1 = ire_nexthop_ill(ire1);
2298 		if (ill1 == NULL) {
2299 			/*
2300 			 * This ire might not have been picked by
2301 			 * ire_route_recursive, in which case ire_dep might
2302 			 * not have been setup yet.
2303 			 * We kick ire_route_recursive to try to resolve
2304 			 * starting at ire1.
2305 			 */
2306 			ire_t *ire2;
2307 			uint_t	match_flags = MATCH_IRE_DSTONLY;
2308 
2309 			if (ire1->ire_ill != NULL)
2310 				match_flags |= MATCH_IRE_ILL;
2311 			ire2 = ire_route_recursive_impl_v4(ire1,
2312 			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2313 			    ire1->ire_zoneid, NULL, match_flags,
2314 			    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2315 			if (ire2 != NULL)
2316 				ire_refrele(ire2);
2317 			ill1 = ire_nexthop_ill(ire1);
2318 		}
2319 
2320 		if (ill1 == NULL) {
2321 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2322 			ip_drop_output("ipIfStatsOutDiscards - no ill",
2323 			    mp, ill);
2324 			error = ENETUNREACH;
2325 			continue;
2326 		}
2327 
2328 		/* Pick the addr and type to use for arp_nce_init */
2329 		if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2330 			ire_type = IRE_BROADCAST;
2331 			nexthop = ire1->ire_gateway_addr;
2332 		} else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2333 			ire_type = IRE_MULTICAST;
2334 			nexthop = ipha->ipha_dst;
2335 		} else {
2336 			ire_type = ire1->ire_type;	/* Doesn't matter */
2337 			nexthop = ire1->ire_gateway_addr;
2338 		}
2339 
2340 		/* If IPMP meta or under, then we just drop */
2341 		if (ill1->ill_grp != NULL) {
2342 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2343 			ip_drop_output("ipIfStatsOutDiscards - IPMP",
2344 			    mp, ill1);
2345 			ill_refrele(ill1);
2346 			error = ENETUNREACH;
2347 			continue;
2348 		}
2349 
2350 		nce1 = arp_nce_init(ill1, nexthop, ire_type);
2351 		if (nce1 == NULL) {
2352 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2353 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2354 			    mp, ill1);
2355 			ill_refrele(ill1);
2356 			error = ENETUNREACH;
2357 			continue;
2358 		}
2359 		mp1 = copymsg(mp);
2360 		if (mp1 == NULL) {
2361 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2362 			ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2363 			nce_refrele(nce1);
2364 			ill_refrele(ill1);
2365 			error = ENOBUFS;
2366 			continue;
2367 		}
2368 		/* Preserve HW checksum for this copy */
2369 		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2370 		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2371 		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2372 		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2373 		DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2374 
2375 		ire1->ire_ob_pkt_count++;
2376 		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2377 		    0, ixacookie);
2378 		if (err == 0)
2379 			num_sent++;
2380 		else
2381 			error = err;
2382 		nce_refrele(nce1);
2383 		ill_refrele(ill1);
2384 	}
2385 	irb_refrele(irb);
2386 	ire_refrele(ire);
2387 	/* Finally, the main one */
2388 	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2389 	    ixacookie);
2390 	if (err == 0)
2391 		num_sent++;
2392 	else
2393 		error = err;
2394 	if (num_sent > 0)
2395 		return (0);
2396 	else
2397 		return (error);
2398 }
2399 
2400 /*
2401  * Verify local connectivity. This check is called by ULP fusion code.
2402  * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2403  * the interface is brought down and back up. So we simply fail the local
2404  * process. The caller, TCP Fusion, should unfuse the connection.
2405  */
2406 boolean_t
ip_output_verify_local(ip_xmit_attr_t * ixa)2407 ip_output_verify_local(ip_xmit_attr_t *ixa)
2408 {
2409 	ire_t		*ire = ixa->ixa_ire;
2410 
2411 	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2412 		return (B_FALSE);
2413 
2414 	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2415 }
2416 
2417 /*
2418  * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2419  *
2420  * The caller must call ip_output_verify_local() first. This function handles
2421  * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2422  */
2423 mblk_t *
ip_output_process_local(mblk_t * mp,ip_xmit_attr_t * ixa,boolean_t hooks_out,boolean_t hooks_in,conn_t * peer_connp)2424 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2425     boolean_t hooks_in, conn_t *peer_connp)
2426 {
2427 	ill_t		*ill = ixa->ixa_ire->ire_ill;
2428 	ipha_t		*ipha = NULL;
2429 	ip6_t		*ip6h = NULL;
2430 	ip_stack_t	*ipst = ixa->ixa_ipst;
2431 	iaflags_t	ixaflags = ixa->ixa_flags;
2432 	ip_recv_attr_t	iras;
2433 	int		error;
2434 
2435 	ASSERT(mp != NULL);
2436 
2437 	if (ixaflags & IXAF_IS_IPV4) {
2438 		ipha = (ipha_t *)mp->b_rptr;
2439 
2440 		/*
2441 		 * If a callback is enabled then we need to know the
2442 		 * source and destination zoneids for the packet. We already
2443 		 * have those handy.
2444 		 */
2445 		if (ipst->ips_ip4_observe.he_interested) {
2446 			zoneid_t szone, dzone;
2447 			zoneid_t stackzoneid;
2448 
2449 			stackzoneid = netstackid_to_zoneid(
2450 			    ipst->ips_netstack->netstack_stackid);
2451 
2452 			if (stackzoneid == GLOBAL_ZONEID) {
2453 				/* Shared-IP zone */
2454 				dzone = ixa->ixa_ire->ire_zoneid;
2455 				szone = ixa->ixa_zoneid;
2456 			} else {
2457 				szone = dzone = stackzoneid;
2458 			}
2459 			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2460 			    ipst);
2461 		}
2462 		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2463 		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2464 		    NULL, int, 1);
2465 
2466 		/* FW_HOOKS: LOOPBACK_OUT */
2467 		if (hooks_out) {
2468 			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2469 			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2470 			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2471 			    ipst->ips_ipv4firewall_loopback_out,
2472 			    NULL, ill, ipha, mp, mp, 0, ipst, error);
2473 			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2474 		}
2475 		if (mp == NULL)
2476 			return (NULL);
2477 
2478 		/* FW_HOOKS: LOOPBACK_IN */
2479 		if (hooks_in) {
2480 			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2481 			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2482 			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2483 			    ipst->ips_ipv4firewall_loopback_in,
2484 			    ill, NULL, ipha, mp, mp, 0, ipst, error);
2485 			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2486 		}
2487 		if (mp == NULL)
2488 			return (NULL);
2489 
2490 		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2491 		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2492 		    NULL, int, 1);
2493 
2494 		/* Inbound IPsec polocies */
2495 		if (peer_connp != NULL) {
2496 			/* Map ixa to ira including IPsec policies. */
2497 			ipsec_out_to_in(ixa, ill, &iras);
2498 			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2499 			    NULL, &iras);
2500 		}
2501 	} else {
2502 		ip6h = (ip6_t *)mp->b_rptr;
2503 
2504 		/*
2505 		 * If a callback is enabled then we need to know the
2506 		 * source and destination zoneids for the packet. We already
2507 		 * have those handy.
2508 		 */
2509 		if (ipst->ips_ip6_observe.he_interested) {
2510 			zoneid_t szone, dzone;
2511 			zoneid_t stackzoneid;
2512 
2513 			stackzoneid = netstackid_to_zoneid(
2514 			    ipst->ips_netstack->netstack_stackid);
2515 
2516 			if (stackzoneid == GLOBAL_ZONEID) {
2517 				/* Shared-IP zone */
2518 				dzone = ixa->ixa_ire->ire_zoneid;
2519 				szone = ixa->ixa_zoneid;
2520 			} else {
2521 				szone = dzone = stackzoneid;
2522 			}
2523 			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2524 			    ipst);
2525 		}
2526 		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2527 		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2528 		    ip6h, int, 1);
2529 
2530 		/* FW_HOOKS: LOOPBACK_OUT */
2531 		if (hooks_out) {
2532 			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2533 			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2534 			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2535 			    ipst->ips_ipv6firewall_loopback_out,
2536 			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
2537 			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2538 		}
2539 		if (mp == NULL)
2540 			return (NULL);
2541 
2542 		/* FW_HOOKS: LOOPBACK_IN */
2543 		if (hooks_in) {
2544 			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2545 			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2546 			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2547 			    ipst->ips_ipv6firewall_loopback_in,
2548 			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
2549 			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2550 		}
2551 		if (mp == NULL)
2552 			return (NULL);
2553 
2554 		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2555 		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2556 		    ip6h, int, 1);
2557 
2558 		/* Inbound IPsec polocies */
2559 		if (peer_connp != NULL) {
2560 			/* Map ixa to ira including IPsec policies. */
2561 			ipsec_out_to_in(ixa, ill, &iras);
2562 			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2563 			    ip6h, &iras);
2564 		}
2565 	}
2566 
2567 	if (mp == NULL) {
2568 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2569 		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2570 	}
2571 
2572 	return (mp);
2573 }
2574