1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 1990 Mentat Inc.
24 * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
25 */
26
27#include <sys/types.h>
28#include <sys/stream.h>
29#include <sys/dlpi.h>
30#include <sys/stropts.h>
31#include <sys/sysmacros.h>
32#include <sys/strsun.h>
33#include <sys/strlog.h>
34#include <sys/strsubr.h>
35#define	_SUN_TPI_VERSION	2
36#include <sys/tihdr.h>
37#include <sys/ddi.h>
38#include <sys/sunddi.h>
39#include <sys/cmn_err.h>
40#include <sys/debug.h>
41#include <sys/sdt.h>
42#include <sys/kobj.h>
43#include <sys/zone.h>
44#include <sys/neti.h>
45#include <sys/hook.h>
46
47#include <sys/kmem.h>
48#include <sys/systm.h>
49#include <sys/param.h>
50#include <sys/socket.h>
51#include <sys/vtrace.h>
52#include <sys/isa_defs.h>
53#include <sys/atomic.h>
54#include <sys/policy.h>
55#include <sys/mac.h>
56#include <net/if.h>
57#include <net/if_types.h>
58#include <net/route.h>
59#include <net/if_dl.h>
60#include <sys/sockio.h>
61#include <netinet/in.h>
62#include <netinet/ip6.h>
63#include <netinet/icmp6.h>
64#include <netinet/sctp.h>
65
66#include <inet/common.h>
67#include <inet/mi.h>
68#include <inet/optcom.h>
69#include <inet/mib2.h>
70#include <inet/nd.h>
71#include <inet/arp.h>
72
73#include <inet/ip.h>
74#include <inet/ip_impl.h>
75#include <inet/ip6.h>
76#include <inet/ip6_asp.h>
77#include <inet/tcp.h>
78#include <inet/tcp_impl.h>
79#include <inet/udp_impl.h>
80#include <inet/ipp_common.h>
81
82#include <inet/ip_multi.h>
83#include <inet/ip_if.h>
84#include <inet/ip_ire.h>
85#include <inet/ip_rts.h>
86#include <inet/ip_ndp.h>
87#include <net/pfkeyv2.h>
88#include <inet/sadb.h>
89#include <inet/ipsec_impl.h>
90#include <inet/iptun/iptun_impl.h>
91#include <inet/sctp_ip.h>
92#include <sys/pattr.h>
93#include <inet/ipclassifier.h>
94#include <inet/ipsecah.h>
95#include <inet/rawip_impl.h>
96#include <inet/rts_impl.h>
97#include <sys/squeue_impl.h>
98#include <sys/squeue.h>
99
100#include <sys/tsol/label.h>
101#include <sys/tsol/tnet.h>
102
103/* Temporary; for CR 6451644 work-around */
104#include <sys/ethernet.h>
105
106/*
107 * Naming conventions:
108 *      These rules should be judiciously applied
109 *	if there is a need to identify something as IPv6 versus IPv4
 *	IPv6 functions will end with _v6 in the ip module.
 *	IPv6 functions will end with _ipv6 in the transport modules.
112 *	IPv6 macros:
113 *		Some macros end with _V6; e.g. ILL_FRAG_HASH_V6
114 *		Some macros start with V6_; e.g. V6_OR_V4_INADDR_ANY
115 *		And then there are ..V4_PART_OF_V6.
116 *		The intent is that macros in the ip module end with _V6.
117 *	IPv6 global variables will start with ipv6_
118 *	IPv6 structures will start with ipv6
119 *	IPv6 defined constants should start with IPV6_
120 *		(but then there are NDP_DEFAULT_VERS_PRI_AND_FLOW, etc)
121 */
122
123/*
124 * ip6opt_ls is used to enable IPv6 (via /etc/system on TX systems).
125 * We need to do this because we didn't obtain the IP6OPT_LS (0x0a)
126 * from IANA. This mechanism will remain in effect until an official
127 * number is obtained.
128 */
129uchar_t ip6opt_ls;
130
131const in6_addr_t ipv6_all_ones =
132	{ 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };
133const in6_addr_t ipv6_all_zeros = { 0, 0, 0, 0 };
134
135#ifdef	_BIG_ENDIAN
136const in6_addr_t ipv6_unspecified_group = { 0xff000000U, 0, 0, 0 };
137#else	/* _BIG_ENDIAN */
138const in6_addr_t ipv6_unspecified_group = { 0x000000ffU, 0, 0, 0 };
139#endif	/* _BIG_ENDIAN */
140
141#ifdef	_BIG_ENDIAN
142const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x00000001U };
143#else  /* _BIG_ENDIAN */
144const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x01000000U };
145#endif /* _BIG_ENDIAN */
146
147#ifdef _BIG_ENDIAN
148const in6_addr_t ipv6_all_hosts_mcast = { 0xff020000U, 0, 0, 0x00000001U };
149#else  /* _BIG_ENDIAN */
150const in6_addr_t ipv6_all_hosts_mcast = { 0x000002ffU, 0, 0, 0x01000000U };
151#endif /* _BIG_ENDIAN */
152
153#ifdef _BIG_ENDIAN
154const in6_addr_t ipv6_all_rtrs_mcast = { 0xff020000U, 0, 0, 0x00000002U };
155#else  /* _BIG_ENDIAN */
156const in6_addr_t ipv6_all_rtrs_mcast = { 0x000002ffU, 0, 0, 0x02000000U };
157#endif /* _BIG_ENDIAN */
158
159#ifdef _BIG_ENDIAN
160const in6_addr_t ipv6_all_v2rtrs_mcast = { 0xff020000U, 0, 0, 0x00000016U };
161#else  /* _BIG_ENDIAN */
162const in6_addr_t ipv6_all_v2rtrs_mcast = { 0x000002ffU, 0, 0, 0x16000000U };
163#endif /* _BIG_ENDIAN */
164
165#ifdef _BIG_ENDIAN
166const in6_addr_t ipv6_solicited_node_mcast =
167			{ 0xff020000U, 0, 0x00000001U, 0xff000000U };
168#else  /* _BIG_ENDIAN */
169const in6_addr_t ipv6_solicited_node_mcast =
170			{ 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
171#endif /* _BIG_ENDIAN */
172
173static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *);
174static void	icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
175static void	icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
176    ip_recv_attr_t *);
177static void	icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
178    ip_recv_attr_t *);
179static void	icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
180    in6_addr_t *, ip_recv_attr_t *);
181static void	icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
182    ip_recv_attr_t *);
183static boolean_t	ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
184
185/*
186 * icmp_inbound_v6 deals with ICMP messages that are handled by IP.
187 * If the ICMP message is consumed by IP, i.e., it should not be delivered
188 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
189 * Likewise, if the ICMP error is misformed (too short, etc), then it
190 * returns NULL. The caller uses this to determine whether or not to send
191 * to raw sockets.
192 *
193 * All error messages are passed to the matching transport stream.
194 *
195 * See comment for icmp_inbound_v4() on how IPsec is handled.
196 */
197mblk_t *
198icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
199{
200	icmp6_t		*icmp6;
201	ip6_t		*ip6h;		/* Outer header */
202	int		ip_hdr_length;	/* Outer header length */
203	boolean_t	interested;
204	ill_t		*ill = ira->ira_ill;
205	ip_stack_t	*ipst = ill->ill_ipst;
206	mblk_t		*mp_ret = NULL;
207
208	ip6h = (ip6_t *)mp->b_rptr;
209
210	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
211
212	/* Check for Martian packets  */
213	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
214		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
215		ip_drop_input("ipIfStatsInAddrErrors: mcast src", mp, ill);
216		freemsg(mp);
217		return (NULL);
218	}
219
220	/* Make sure ira_l2src is set for ndp_input */
221	if (!(ira->ira_flags & IRAF_L2SRC_SET))
222		ip_setl2src(mp, ira, ira->ira_rill);
223
224	ip_hdr_length = ira->ira_ip_hdr_length;
225	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
226		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
227			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
228			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
229			freemsg(mp);
230			return (NULL);
231		}
232		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
233		if (ip6h == NULL) {
234			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
235			freemsg(mp);
236			return (NULL);
237		}
238	}
239
240	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
241	DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
242	ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
243	    icmp6->icmp6_code));
244
245	/*
246	 * We will set "interested" to "true" if we should pass a copy to
247	 * the transport i.e., if it is an error message.
248	 */
249	interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);
250
251	switch (icmp6->icmp6_type) {
252	case ICMP6_DST_UNREACH:
253		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInDestUnreachs);
254		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
255			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInAdminProhibs);
256		break;
257
258	case ICMP6_TIME_EXCEEDED:
259		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInTimeExcds);
260		break;
261
262	case ICMP6_PARAM_PROB:
263		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInParmProblems);
264		break;
265
266	case ICMP6_PACKET_TOO_BIG:
267		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
268		break;
269
270	case ICMP6_ECHO_REQUEST:
271		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
272		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
273		    !ipst->ips_ipv6_resp_echo_mcast)
274			break;
275
276		/*
277		 * We must have exclusive use of the mblk to convert it to
278		 * a response.
279		 * If not, we copy it.
280		 */
281		if (mp->b_datap->db_ref > 1) {
282			mblk_t	*mp1;
283
284			mp1 = copymsg(mp);
285			if (mp1 == NULL) {
286				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
287				ip_drop_input("ipIfStatsInDiscards - copymsg",
288				    mp, ill);
289				freemsg(mp);
290				return (NULL);
291			}
292			freemsg(mp);
293			mp = mp1;
294			ip6h = (ip6_t *)mp->b_rptr;
295			icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
296		}
297
298		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
299		icmp_send_reply_v6(mp, ip6h, icmp6, ira);
300		return (NULL);
301
302	case ICMP6_ECHO_REPLY:
303		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
304		break;
305
306	case ND_ROUTER_SOLICIT:
307		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterSolicits);
308		break;
309
310	case ND_ROUTER_ADVERT:
311		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterAdvertisements);
312		break;
313
314	case ND_NEIGHBOR_SOLICIT:
315		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
316		ndp_input(mp, ira);
317		return (NULL);
318
319	case ND_NEIGHBOR_ADVERT:
320		BUMP_MIB(ill->ill_icmp6_mib,
321		    ipv6IfIcmpInNeighborAdvertisements);
322		ndp_input(mp, ira);
323		return (NULL);
324
325	case ND_REDIRECT:
326		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);
327
328		if (ipst->ips_ipv6_ignore_redirect)
329			break;
330
331		/* We now allow a RAW socket to receive this. */
332		interested = B_TRUE;
333		break;
334
335	/*
336	 * The next three icmp messages will be handled by MLD.
337	 * Pass all valid MLD packets up to any process(es)
338	 * listening on a raw ICMP socket.
339	 */
340	case MLD_LISTENER_QUERY:
341	case MLD_LISTENER_REPORT:
342	case MLD_LISTENER_REDUCTION:
343		mp = mld_input(mp, ira);
344		return (mp);
345	default:
346		break;
347	}
348	/*
349	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
350	 * if there isn't one.
351	 */
352	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
353		/* If there is an ICMP client and we want one too, copy it. */
354
355		if (!interested) {
356			/* Caller will deliver to RAW sockets */
357			return (mp);
358		}
359		mp_ret = copymsg(mp);
360		if (mp_ret == NULL) {
361			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
362			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
363		}
364	} else if (!interested) {
365		/* Neither we nor raw sockets are interested. Drop packet now */
366		freemsg(mp);
367		return (NULL);
368	}
369
370	/*
371	 * ICMP error or redirect packet. Make sure we have enough of
372	 * the header and that db_ref == 1 since we might end up modifying
373	 * the packet.
374	 */
375	if (mp->b_cont != NULL) {
376		if (ip_pullup(mp, -1, ira) == NULL) {
377			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
378			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
379			    mp, ill);
380			freemsg(mp);
381			return (mp_ret);
382		}
383	}
384
385	if (mp->b_datap->db_ref > 1) {
386		mblk_t	*mp1;
387
388		mp1 = copymsg(mp);
389		if (mp1 == NULL) {
390			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
391			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
392			freemsg(mp);
393			return (mp_ret);
394		}
395		freemsg(mp);
396		mp = mp1;
397	}
398
399	/*
400	 * In case mp has changed, verify the message before any further
401	 * processes.
402	 */
403	ip6h = (ip6_t *)mp->b_rptr;
404	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
405	if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
406		freemsg(mp);
407		return (mp_ret);
408	}
409
410	switch (icmp6->icmp6_type) {
411	case ND_REDIRECT:
412		icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
413		break;
414	case ICMP6_PACKET_TOO_BIG:
415		/* Update DCE and adjust MTU is icmp header if needed */
416		icmp_inbound_too_big_v6(icmp6, ira);
417		/* FALLTHROUGH */
418	default:
419		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
420		break;
421	}
422
423	return (mp_ret);
424}
425
426/*
427 * Send an ICMP echo reply.
428 * The caller has already updated the payload part of the packet.
429 * We handle the ICMP checksum, IP source address selection and feed
430 * the packet into ip_output_simple.
431 */
432static void
433icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
434    ip_recv_attr_t *ira)
435{
436	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
437	ill_t		*ill = ira->ira_ill;
438	ip_stack_t	*ipst = ill->ill_ipst;
439	ip_xmit_attr_t	ixas;
440	in6_addr_t	origsrc;
441
442	/*
443	 * Remove any extension headers (do not reverse a source route)
444	 * and clear the flow id (keep traffic class for now).
445	 */
446	if (ip_hdr_length != IPV6_HDR_LEN) {
447		int	i;
448
449		for (i = 0; i < IPV6_HDR_LEN; i++) {
450			mp->b_rptr[ip_hdr_length - i - 1] =
451			    mp->b_rptr[IPV6_HDR_LEN - i - 1];
452		}
453		mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
454		ip6h = (ip6_t *)mp->b_rptr;
455		ip6h->ip6_nxt = IPPROTO_ICMPV6;
456		i = ntohs(ip6h->ip6_plen);
457		i -= (ip_hdr_length - IPV6_HDR_LEN);
458		ip6h->ip6_plen = htons(i);
459		ip_hdr_length = IPV6_HDR_LEN;
460		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
461	}
462	ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
463
464	/* Reverse the source and destination addresses. */
465	origsrc = ip6h->ip6_src;
466	ip6h->ip6_src = ip6h->ip6_dst;
467	ip6h->ip6_dst = origsrc;
468
469	/* set the hop limit */
470	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
471
472	/*
473	 * Prepare for checksum by putting icmp length in the icmp
474	 * checksum field. The checksum is calculated in ip_output
475	 */
476	icmp6->icmp6_cksum = ip6h->ip6_plen;
477
478	bzero(&ixas, sizeof (ixas));
479	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
480	ixas.ixa_zoneid = ira->ira_zoneid;
481	ixas.ixa_cred = kcred;
482	ixas.ixa_cpid = NOPID;
483	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
484	ixas.ixa_ifindex = 0;
485	ixas.ixa_ipst = ipst;
486	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
487
488	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
489		/*
490		 * This packet should go out the same way as it
491		 * came in i.e in clear, independent of the IPsec
492		 * policy for transmitting packets.
493		 */
494		ixas.ixa_flags |= IXAF_NO_IPSEC;
495	} else {
496		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
497			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
498			/* Note: mp already consumed and ip_drop_packet done */
499			return;
500		}
501	}
502
503	/* Was the destination (now source) link-local? Send out same group */
504	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
505		ixas.ixa_flags |= IXAF_SCOPEID_SET;
506		if (IS_UNDER_IPMP(ill))
507			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
508		else
509			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
510	}
511
512	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
513		/*
514		 * Not one or our addresses (IRE_LOCALs), thus we let
515		 * ip_output_simple pick the source.
516		 */
517		ip6h->ip6_src = ipv6_all_zeros;
518		ixas.ixa_flags |= IXAF_SET_SOURCE;
519	}
520
521	/* Should we send using dce_pmtu? */
522	if (ipst->ips_ipv6_icmp_return_pmtu)
523		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
524
525	(void) ip_output_simple(mp, &ixas);
526	ixa_cleanup(&ixas);
527
528}
529
530/*
531 * Verify the ICMP messages for either for ICMP error or redirect packet.
532 * The caller should have fully pulled up the message. If it's a redirect
533 * packet, only basic checks on IP header will be done; otherwise, verify
534 * the packet by looking at the included ULP header.
535 *
536 * Called before icmp_inbound_error_fanout_v6 is called.
537 */
538static boolean_t
539icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
540{
541	ill_t		*ill = ira->ira_ill;
542	uint16_t	hdr_length;
543	uint8_t		*nexthdrp;
544	uint8_t		nexthdr;
545	ip_stack_t	*ipst = ill->ill_ipst;
546	conn_t		*connp;
547	ip6_t		*ip6h;	/* Inner header */
548
549	ip6h = (ip6_t *)&icmp6[1];
550	if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
551		goto truncated;
552
553	if (icmp6->icmp6_type == ND_REDIRECT) {
554		hdr_length = sizeof (nd_redirect_t);
555	} else {
556		if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
557			goto discard_pkt;
558		hdr_length = IPV6_HDR_LEN;
559	}
560
561	if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
562		goto truncated;
563
564	/*
565	 * Stop here for ICMP_REDIRECT.
566	 */
567	if (icmp6->icmp6_type == ND_REDIRECT)
568		return (B_TRUE);
569
570	/*
571	 * ICMP errors only.
572	 */
573	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
574		goto discard_pkt;
575	nexthdr = *nexthdrp;
576
577	/* Try to pass the ICMP message to clients who need it */
578	switch (nexthdr) {
579	case IPPROTO_UDP:
580		/*
581		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
582		 * transport header.
583		 */
584		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
585		    mp->b_wptr)
586			goto truncated;
587		break;
588	case IPPROTO_TCP: {
589		tcpha_t		*tcpha;
590
591		/*
592		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
593		 * transport header.
594		 */
595		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
596		    mp->b_wptr)
597			goto truncated;
598
599		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
600		/*
601		 * With IPMP we need to match across group, which we do
602		 * since we have the upper ill from ira_ill.
603		 */
604		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
605		    ill->ill_phyint->phyint_ifindex, ipst);
606		if (connp == NULL)
607			goto discard_pkt;
608
609		if ((connp->conn_verifyicmp != NULL) &&
610		    !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
611			CONN_DEC_REF(connp);
612			goto discard_pkt;
613		}
614		CONN_DEC_REF(connp);
615		break;
616	}
617	case IPPROTO_SCTP:
618		/*
619		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
620		 * transport header.
621		 */
622		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
623		    mp->b_wptr)
624			goto truncated;
625		break;
626	case IPPROTO_ESP:
627	case IPPROTO_AH:
628		break;
629	case IPPROTO_ENCAP:
630	case IPPROTO_IPV6: {
631		/* Look for self-encapsulated packets that caused an error */
632		ip6_t *in_ip6h;
633
634		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
635		if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
636		    sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
637			goto truncated;
638		break;
639	}
640	default:
641		break;
642	}
643
644	return (B_TRUE);
645
646discard_pkt:
647	/* Bogus ICMP error. */
648	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
649	return (B_FALSE);
650
651truncated:
652	/* We pulled up everthing already. Must be truncated */
653	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
654	return (B_FALSE);
655}
656
657/*
658 * Process received IPv6 ICMP Packet too big.
659 * The caller is responsible for validating the packet before passing it in
660 * and also to fanout the ICMP error to any matching transport conns. Assumes
661 * the message has been fully pulled up.
662 *
663 * Before getting here, the caller has called icmp_inbound_verify_v6()
664 * that should have verified with ULP to prevent undoing the changes we're
665 * going to make to DCE. For example, TCP might have verified that the packet
666 * which generated error is in the send window.
667 *
668 * In some cases modified this MTU in the ICMP header packet; the caller
669 * should pass to the matching ULP after this returns.
670 */
671static void
672icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira)
673{
674	uint32_t	mtu;
675	dce_t		*dce;
676	ill_t		*ill = ira->ira_ill;	/* Upper ill if IPMP */
677	ip_stack_t	*ipst = ill->ill_ipst;
678	int		old_max_frag;
679	in6_addr_t	final_dst;
680	ip6_t		*ip6h;	/* Inner IP header */
681
682	/* Caller has already pulled up everything. */
683	ip6h = (ip6_t *)&icmp6[1];
684	final_dst = ip_get_dst_v6(ip6h, NULL, NULL);
685
686	mtu = ntohl(icmp6->icmp6_mtu);
687	if (mtu < IPV6_MIN_MTU) {
688		/*
689		 * RFC 8021 suggests to ignore messages where mtu is
690		 * less than the IPv6 minimum.
691		 */
692		ip1dbg(("Received mtu less than IPv6 "
693		    "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
694		DTRACE_PROBE1(icmp6__too__small__mtu, uint32_t, mtu);
695		return;
696	}
697
698	/*
699	 * For link local destinations matching simply on address is not
700	 * sufficient. Same link local addresses for different ILL's is
701	 * possible.
702	 */
703	if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) {
704		dce = dce_lookup_and_add_v6(&final_dst,
705		    ill->ill_phyint->phyint_ifindex, ipst);
706	} else {
707		dce = dce_lookup_and_add_v6(&final_dst, 0, ipst);
708	}
709	if (dce == NULL) {
710		/* Couldn't add a unique one - ENOMEM */
711		if (ip_debug > 2) {
712			/* ip1dbg */
713			pr_addr_dbg("icmp_inbound_too_big_v6:"
714			    "no dce for dst %s\n", AF_INET6,
715			    &final_dst);
716		}
717		return;
718	}
719
720	mutex_enter(&dce->dce_lock);
721	if (dce->dce_flags & DCEF_PMTU)
722		old_max_frag = dce->dce_pmtu;
723	else if (IN6_IS_ADDR_MULTICAST(&final_dst))
724		old_max_frag = ill->ill_mc_mtu;
725	else
726		old_max_frag = ill->ill_mtu;
727
728	ip1dbg(("Received mtu from router: %d\n", mtu));
729	DTRACE_PROBE1(icmp6__received__mtu, uint32_t, mtu);
730	dce->dce_pmtu = MIN(old_max_frag, mtu);
731	icmp6->icmp6_mtu = htonl(dce->dce_pmtu);
732
733	/* We now have a PMTU for sure */
734	dce->dce_flags |= DCEF_PMTU;
735	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
736
737	mutex_exit(&dce->dce_lock);
738	/*
739	 * After dropping the lock the new value is visible to everyone.
740	 * Then we bump the generation number so any cached values reinspect
741	 * the dce_t.
742	 */
743	dce_increment_generation(dce);
744	dce_refrele(dce);
745}
746
747/*
748 * Fanout received ICMPv6 error packets to the transports.
749 * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
750 *
751 * The caller must have called icmp_inbound_verify_v6.
752 */
753void
754icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
755{
756	uint16_t	*up;	/* Pointer to ports in ULP header */
757	uint32_t	ports;	/* reversed ports for fanout */
758	ip6_t		rip6h;	/* With reversed addresses */
759	ip6_t		*ip6h;	/* Inner IP header */
760	uint16_t	hdr_length; /* Inner IP header length */
761	uint8_t		*nexthdrp;
762	uint8_t		nexthdr;
763	tcpha_t		*tcpha;
764	conn_t		*connp;
765	ill_t		*ill = ira->ira_ill;	/* Upper in the case of IPMP */
766	ip_stack_t	*ipst = ill->ill_ipst;
767	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
768
769	/* Caller has already pulled up everything. */
770	ip6h = (ip6_t *)&icmp6[1];
771	ASSERT(mp->b_cont == NULL);
772	ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
773
774	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
775		goto drop_pkt;
776	nexthdr = *nexthdrp;
777	ira->ira_protocol = nexthdr;
778
779	/*
780	 * We need a separate IP header with the source and destination
781	 * addresses reversed to do fanout/classification because the ip6h in
782	 * the ICMPv6 error is in the form we sent it out.
783	 */
784	rip6h.ip6_src = ip6h->ip6_dst;
785	rip6h.ip6_dst = ip6h->ip6_src;
786	rip6h.ip6_nxt = nexthdr;
787
788	/* Try to pass the ICMP message to clients who need it */
789	switch (nexthdr) {
790	case IPPROTO_UDP: {
791		/* Attempt to find a client stream based on port. */
792		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
793
794		/* Note that we send error to all matches. */
795		ira->ira_flags |= IRAF_ICMP_ERROR;
796		ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira);
797		ira->ira_flags &= ~IRAF_ICMP_ERROR;
798		return;
799	}
800	case IPPROTO_TCP: {
801		/*
802		 * Attempt to find a client stream based on port.
803		 * Note that we do a reverse lookup since the header is
804		 * in the form we sent it out.
805		 */
806		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
807		/*
808		 * With IPMP we need to match across group, which we do
809		 * since we have the upper ill from ira_ill.
810		 */
811		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha,
812		    TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst);
813		if (connp == NULL) {
814			goto drop_pkt;
815		}
816
817		if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
818		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
819			mp = ipsec_check_inbound_policy(mp, connp,
820			    NULL, ip6h, ira);
821			if (mp == NULL) {
822				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
823				/* Note that mp is NULL */
824				ip_drop_input("ipIfStatsInDiscards", mp, ill);
825				CONN_DEC_REF(connp);
826				return;
827			}
828		}
829
830		ira->ira_flags |= IRAF_ICMP_ERROR;
831		if (IPCL_IS_TCP(connp)) {
832			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
833			    connp->conn_recvicmp, connp, ira, SQ_FILL,
834			    SQTAG_TCP6_INPUT_ICMP_ERR);
835		} else {
836			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
837			ill_t *rill = ira->ira_rill;
838
839			ira->ira_ill = ira->ira_rill = NULL;
840			(connp->conn_recv)(connp, mp, NULL, ira);
841			CONN_DEC_REF(connp);
842			ira->ira_ill = ill;
843			ira->ira_rill = rill;
844		}
845		ira->ira_flags &= ~IRAF_ICMP_ERROR;
846		return;
847
848	}
849	case IPPROTO_SCTP:
850		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
851		/* Find a SCTP client stream for this packet. */
852		((uint16_t *)&ports)[0] = up[1];
853		((uint16_t *)&ports)[1] = up[0];
854
855		ira->ira_flags |= IRAF_ICMP_ERROR;
856		ip_fanout_sctp(mp, NULL, &rip6h, ports, ira);
857		ira->ira_flags &= ~IRAF_ICMP_ERROR;
858		return;
859
860	case IPPROTO_ESP:
861	case IPPROTO_AH:
862		if (!ipsec_loaded(ipss)) {
863			ip_proto_not_sup(mp, ira);
864			return;
865		}
866
867		if (nexthdr == IPPROTO_ESP)
868			mp = ipsecesp_icmp_error(mp, ira);
869		else
870			mp = ipsecah_icmp_error(mp, ira);
871		if (mp == NULL)
872			return;
873
874		/* Just in case ipsec didn't preserve the NULL b_cont */
875		if (mp->b_cont != NULL) {
876			if (!pullupmsg(mp, -1))
877				goto drop_pkt;
878		}
879
880		/*
881		 * If succesful, the mp has been modified to not include
882		 * the ESP/AH header so we can fanout to the ULP's icmp
883		 * error handler.
884		 */
885		if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN)
886			goto drop_pkt;
887
888		ip6h = (ip6_t *)mp->b_rptr;
889		/* Don't call hdr_length_v6() unless you have to. */
890		if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
891			hdr_length = ip_hdr_length_v6(mp, ip6h);
892		else
893			hdr_length = IPV6_HDR_LEN;
894
895		/* Verify the modified message before any further processes. */
896		icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
897		if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
898			freemsg(mp);
899			return;
900		}
901
902		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
903		return;
904
905	case IPPROTO_IPV6: {
906		/* Look for self-encapsulated packets that caused an error */
907		ip6_t *in_ip6h;
908
909		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
910
911		if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) &&
912		    IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) {
913			/*
914			 * Self-encapsulated case. As in the ipv4 case,
915			 * we need to strip the 2nd IP header. Since mp
916			 * is already pulled-up, we can simply bcopy
917			 * the 3rd header + data over the 2nd header.
918			 */
919			uint16_t unused_len;
920
921			/*
922			 * Make sure we don't do recursion more than once.
923			 */
924			if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h,
925			    &unused_len, &nexthdrp) ||
926			    *nexthdrp == IPPROTO_IPV6) {
927				goto drop_pkt;
928			}
929
930			/*
931			 * Copy the 3rd header + remaining data on top
932			 * of the 2nd header.
933			 */
934			bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h);
935
936			/*
937			 * Subtract length of the 2nd header.
938			 */
939			mp->b_wptr -= hdr_length;
940
941			ip6h = (ip6_t *)mp->b_rptr;
942			/* Don't call hdr_length_v6() unless you have to. */
943			if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
944				hdr_length = ip_hdr_length_v6(mp, ip6h);
945			else
946				hdr_length = IPV6_HDR_LEN;
947
948			/*
949			 * Verify the modified message before any further
950			 * processes.
951			 */
952			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
953			if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
954				freemsg(mp);
955				return;
956			}
957
958			/*
959			 * Now recurse, and see what I _really_ should be
960			 * doing here.
961			 */
962			icmp_inbound_error_fanout_v6(mp, icmp6, ira);
963			return;
964		}
965	}
966	/* FALLTHROUGH */
967	case IPPROTO_ENCAP:
968		if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src,
969		    &rip6h.ip6_dst, ipst)) != NULL) {
970			ira->ira_flags |= IRAF_ICMP_ERROR;
971			connp->conn_recvicmp(connp, mp, NULL, ira);
972			CONN_DEC_REF(connp);
973			ira->ira_flags &= ~IRAF_ICMP_ERROR;
974			return;
975		}
976		/*
977		 * No IP tunnel is interested, fallthrough and see
978		 * if a raw socket will want it.
979		 */
980		/* FALLTHROUGH */
981	default:
982		ira->ira_flags |= IRAF_ICMP_ERROR;
983		ASSERT(ira->ira_protocol == nexthdr);
984		ip_fanout_proto_v6(mp, &rip6h, ira);
985		ira->ira_flags &= ~IRAF_ICMP_ERROR;
986		return;
987	}
988	/* NOTREACHED */
989drop_pkt:
990	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
991	ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n"));
992	freemsg(mp);
993}
994
995/*
996 * Process received IPv6 ICMP Redirect messages.
997 * Assumes the caller has verified that the headers are in the pulled up mblk.
998 * Consumes mp.
999 */
1000/* ARGSUSED */
1001static void
1002icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd,
1003    ip_recv_attr_t *ira)
1004{
1005	ire_t		*ire, *nire;
1006	ire_t		*prev_ire = NULL;
1007	ire_t		*redir_ire;
1008	in6_addr_t	*src, *dst, *gateway;
1009	nd_opt_hdr_t	*opt;
1010	nce_t		*nce;
1011	int		ncec_flags = 0;
1012	int		err = 0;
1013	boolean_t	redirect_to_router = B_FALSE;
1014	int		len;
1015	int		optlen;
1016	ill_t		*ill = ira->ira_rill;
1017	ill_t		*rill = ira->ira_rill;
1018	ip_stack_t	*ipst = ill->ill_ipst;
1019
1020	/*
1021	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
1022	 * and make it be the IPMP upper so avoid being confused by a packet
1023	 * addressed to a unicast address on a different ill.
1024	 */
1025	if (IS_UNDER_IPMP(rill)) {
1026		rill = ipmp_ill_hold_ipmp_ill(rill);
1027		if (rill == NULL) {
1028			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1029			ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill",
1030			    mp, ill);
1031			freemsg(mp);
1032			return;
1033		}
1034		ASSERT(rill != ira->ira_rill);
1035	}
1036
1037	len = mp->b_wptr - (uchar_t *)rd;
1038	src = &ip6h->ip6_src;
1039	dst = &rd->nd_rd_dst;
1040	gateway = &rd->nd_rd_target;
1041
1042	/* Verify if it is a valid redirect */
1043	if (!IN6_IS_ADDR_LINKLOCAL(src) ||
1044	    (ip6h->ip6_hops != IPV6_MAX_HOPS) ||
1045	    (rd->nd_rd_code != 0) ||
1046	    (len < sizeof (nd_redirect_t)) ||
1047	    (IN6_IS_ADDR_V4MAPPED(dst)) ||
1048	    (IN6_IS_ADDR_MULTICAST(dst))) {
1049		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1050		ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill);
1051		goto fail_redirect;
1052	}
1053
1054	if (!(IN6_IS_ADDR_LINKLOCAL(gateway) ||
1055	    IN6_ARE_ADDR_EQUAL(gateway, dst))) {
1056		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1057		ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway",
1058		    mp, ill);
1059		goto fail_redirect;
1060	}
1061
1062	optlen = len - sizeof (nd_redirect_t);
1063	if (optlen != 0) {
1064		if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) {
1065			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1066			ip_drop_input("ipv6IfIcmpInBadRedirects - options",
1067			    mp, ill);
1068			goto fail_redirect;
1069		}
1070	}
1071
1072	if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) {
1073		redirect_to_router = B_TRUE;
1074		ncec_flags |= NCE_F_ISROUTER;
1075	} else {
1076		gateway = dst;	/* Add nce for dst */
1077	}
1078
1079
1080	/*
1081	 * Verify that the IP source address of the redirect is
1082	 * the same as the current first-hop router for the specified
1083	 * ICMP destination address.
1084	 * Also, Make sure we had a route for the dest in question and
1085	 * that route was pointing to the old gateway (the source of the
1086	 * redirect packet.)
1087	 * We do longest match and then compare ire_gateway_addr_v6 below.
1088	 */
1089	prev_ire = ire_ftable_lookup_v6(dst, 0, 0, 0, rill,
1090	    ALL_ZONES, NULL, MATCH_IRE_ILL, 0, ipst, NULL);
1091
1092	/*
1093	 * Check that
1094	 *	the redirect was not from ourselves
1095	 *	old gateway is still directly reachable
1096	 */
1097	if (prev_ire == NULL ||
1098	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
1099	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1100	    !IN6_ARE_ADDR_EQUAL(src, &prev_ire->ire_gateway_addr_v6)) {
1101		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1102		ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill);
1103		goto fail_redirect;
1104	}
1105
1106	ASSERT(prev_ire->ire_ill != NULL);
1107	if (prev_ire->ire_ill->ill_flags & ILLF_NONUD)
1108		ncec_flags |= NCE_F_NONUD;
1109
1110	opt = (nd_opt_hdr_t *)&rd[1];
1111	opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
1112	if (opt != NULL) {
1113		err = nce_lookup_then_add_v6(rill,
1114		    (uchar_t *)&opt[1],		/* Link layer address */
1115		    rill->ill_phys_addr_length,
1116		    gateway, ncec_flags, ND_STALE, &nce);
1117		switch (err) {
1118		case 0:
1119			nce_refrele(nce);
1120			break;
1121		case EEXIST:
1122			/*
1123			 * Check to see if link layer address has changed and
1124			 * process the ncec_state accordingly.
1125			 */
1126			nce_process(nce->nce_common,
1127			    (uchar_t *)&opt[1], 0, B_FALSE);
1128			nce_refrele(nce);
1129			break;
1130		default:
1131			ip1dbg(("icmp_redirect_v6: NCE create failed %d\n",
1132			    err));
1133			goto fail_redirect;
1134		}
1135	}
1136	if (redirect_to_router) {
1137		ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway));
1138
1139		/*
1140		 * Create a Route Association.  This will allow us to remember
1141		 * a router told us to use the particular gateway.
1142		 */
1143		ire = ire_create_v6(
1144		    dst,
1145		    &ipv6_all_ones,		/* mask */
1146		    gateway,			/* gateway addr */
1147		    IRE_HOST,
1148		    prev_ire->ire_ill,
1149		    ALL_ZONES,
1150		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
1151		    NULL,
1152		    ipst);
1153	} else {
1154		ipif_t *ipif;
1155		in6_addr_t gw;
1156
1157		/*
1158		 * Just create an on link entry, i.e. interface route.
1159		 * The gateway field is our link-local on the ill.
1160		 */
1161		mutex_enter(&rill->ill_lock);
1162		for (ipif = rill->ill_ipif; ipif != NULL;
1163		    ipif = ipif->ipif_next) {
1164			if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1165			    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
1166				break;
1167		}
1168		if (ipif == NULL) {
1169			/* We have no link-local address! */
1170			mutex_exit(&rill->ill_lock);
1171			goto fail_redirect;
1172		}
1173		gw = ipif->ipif_v6lcl_addr;
1174		mutex_exit(&rill->ill_lock);
1175
1176		ire = ire_create_v6(
1177		    dst,				/* gateway == dst */
1178		    &ipv6_all_ones,			/* mask */
1179		    &gw,				/* gateway addr */
1180		    rill->ill_net_type,			/* IF_[NO]RESOLVER */
1181		    prev_ire->ire_ill,
1182		    ALL_ZONES,
1183		    (RTF_DYNAMIC | RTF_HOST),
1184		    NULL,
1185		    ipst);
1186	}
1187
1188	if (ire == NULL)
1189		goto fail_redirect;
1190
1191	nire = ire_add(ire);
1192	/* Check if it was a duplicate entry */
1193	if (nire != NULL && nire != ire) {
1194		ASSERT(nire->ire_identical_ref > 1);
1195		ire_delete(nire);
1196		ire_refrele(nire);
1197		nire = NULL;
1198	}
1199	ire = nire;
1200	if (ire != NULL) {
1201		ire_refrele(ire);		/* Held in ire_add */
1202
1203		/* tell routing sockets that we received a redirect */
1204		ip_rts_change_v6(RTM_REDIRECT,
1205		    &rd->nd_rd_dst,
1206		    &rd->nd_rd_target,
1207		    &ipv6_all_ones, 0, src,
1208		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
1209		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
1210
1211		/*
1212		 * Delete any existing IRE_HOST type ires for this destination.
1213		 * This together with the added IRE has the effect of
1214		 * modifying an existing redirect.
1215		 */
1216		redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
1217		    prev_ire->ire_ill, ALL_ZONES, NULL,
1218		    (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst,
1219		    NULL);
1220
1221		if (redir_ire != NULL) {
1222			if (redir_ire->ire_flags & RTF_DYNAMIC)
1223				ire_delete(redir_ire);
1224			ire_refrele(redir_ire);
1225		}
1226	}
1227
1228	ire_refrele(prev_ire);
1229	prev_ire = NULL;
1230
1231fail_redirect:
1232	if (prev_ire != NULL)
1233		ire_refrele(prev_ire);
1234	freemsg(mp);
1235	if (rill != ira->ira_rill)
1236		ill_refrele(rill);
1237}
1238
1239/*
1240 * Build and ship an IPv6 ICMP message using the packet data in mp,
1241 * and the ICMP header pointed to by "stuff".  (May be called as
1242 * writer.)
1243 * Note: assumes that icmp_pkt_err_ok_v6 has been called to
1244 * verify that an icmp error packet can be sent.
1245 *
1246 * If v6src_ptr is set use it as a source. Otherwise select a reasonable
1247 * source address (see above function).
1248 */
1249static void
1250icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
1251    const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
1252{
1253	ip6_t		*ip6h;
1254	in6_addr_t	v6dst;
1255	size_t		len_needed;
1256	size_t		msg_len;
1257	mblk_t		*mp1;
1258	icmp6_t		*icmp6;
1259	in6_addr_t	v6src;
1260	ill_t		*ill = ira->ira_ill;
1261	ip_stack_t	*ipst = ill->ill_ipst;
1262	ip_xmit_attr_t	ixas;
1263
1264	ip6h = (ip6_t *)mp->b_rptr;
1265
1266	bzero(&ixas, sizeof (ixas));
1267	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
1268	ixas.ixa_zoneid = ira->ira_zoneid;
1269	ixas.ixa_ifindex = 0;
1270	ixas.ixa_ipst = ipst;
1271	ixas.ixa_cred = kcred;
1272	ixas.ixa_cpid = NOPID;
1273	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
1274	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1275
1276	/*
1277	 * If the source of the original packet was link-local, then
1278	 * make sure we send on the same ill (group) as we received it on.
1279	 */
1280	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
1281		ixas.ixa_flags |= IXAF_SCOPEID_SET;
1282		if (IS_UNDER_IPMP(ill))
1283			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
1284		else
1285			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
1286	}
1287
1288	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1289		/*
1290		 * Apply IPsec based on how IPsec was applied to
1291		 * the packet that had the error.
1292		 *
1293		 * If it was an outbound packet that caused the ICMP
1294		 * error, then the caller will have setup the IRA
1295		 * appropriately.
1296		 */
1297		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
1298			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1299			/* Note: mp already consumed and ip_drop_packet done */
1300			return;
1301		}
1302	} else {
1303		/*
1304		 * This is in clear. The icmp message we are building
1305		 * here should go out in clear, independent of our policy.
1306		 */
1307		ixas.ixa_flags |= IXAF_NO_IPSEC;
1308	}
1309
1310	/*
1311	 * If the caller specified the source we use that.
1312	 * Otherwise, if the packet was for one of our unicast addresses, make
1313	 * sure we respond with that as the source. Otherwise
1314	 * have ip_output_simple pick the source address.
1315	 */
1316	if (v6src_ptr != NULL) {
1317		v6src = *v6src_ptr;
1318	} else {
1319		ire_t *ire;
1320		uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
1321
1322		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
1323		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
1324			match_flags |= MATCH_IRE_ILL;
1325
1326		ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
1327		    (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
1328		    match_flags, 0, ipst, NULL);
1329		if (ire != NULL) {
1330			v6src = ip6h->ip6_dst;
1331			ire_refrele(ire);
1332		} else {
1333			v6src = ipv6_all_zeros;
1334			ixas.ixa_flags |= IXAF_SET_SOURCE;
1335		}
1336	}
1337	v6dst = ip6h->ip6_src;
1338	len_needed = ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len;
1339	msg_len = msgdsize(mp);
1340	if (msg_len > len_needed) {
1341		if (!adjmsg(mp, len_needed - msg_len)) {
1342			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1343			freemsg(mp);
1344			return;
1345		}
1346		msg_len = len_needed;
1347	}
1348	mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
1349	if (mp1 == NULL) {
1350		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1351		freemsg(mp);
1352		return;
1353	}
1354	mp1->b_cont = mp;
1355	mp = mp1;
1356
1357	/*
1358	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
1359	 * node generates be accepted in peace by all on-host destinations.
1360	 * If we do NOT assume that all on-host destinations trust
1361	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
1362	 * (Look for IXAF_TRUSTED_ICMP).
1363	 */
1364	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
1365
1366	ip6h = (ip6_t *)mp->b_rptr;
1367	mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);
1368
1369	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1370	ip6h->ip6_nxt = IPPROTO_ICMPV6;
1371	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
1372	ip6h->ip6_dst = v6dst;
1373	ip6h->ip6_src = v6src;
1374	msg_len += IPV6_HDR_LEN + len;
1375	if (msg_len > IP_MAXPACKET + IPV6_HDR_LEN) {
1376		(void) adjmsg(mp, IP_MAXPACKET + IPV6_HDR_LEN - msg_len);
1377		msg_len = IP_MAXPACKET + IPV6_HDR_LEN;
1378	}
1379	ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
1380	icmp6 = (icmp6_t *)&ip6h[1];
1381	bcopy(stuff, (char *)icmp6, len);
1382	/*
1383	 * Prepare for checksum by putting icmp length in the icmp
1384	 * checksum field. The checksum is calculated in ip_output_wire_v6.
1385	 */
1386	icmp6->icmp6_cksum = ip6h->ip6_plen;
1387	if (icmp6->icmp6_type == ND_REDIRECT) {
1388		ip6h->ip6_hops = IPV6_MAX_HOPS;
1389	}
1390
1391	(void) ip_output_simple(mp, &ixas);
1392	ixa_cleanup(&ixas);
1393}
1394
1395/*
1396 * Update the output mib when ICMPv6 packets are sent.
1397 */
1398void
1399icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
1400{
1401	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);
1402
1403	switch (icmp6->icmp6_type) {
1404	case ICMP6_DST_UNREACH:
1405		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutDestUnreachs);
1406		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
1407			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutAdminProhibs);
1408		break;
1409
1410	case ICMP6_TIME_EXCEEDED:
1411		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutTimeExcds);
1412		break;
1413
1414	case ICMP6_PARAM_PROB:
1415		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutParmProblems);
1416		break;
1417
1418	case ICMP6_PACKET_TOO_BIG:
1419		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutPktTooBigs);
1420		break;
1421
1422	case ICMP6_ECHO_REQUEST:
1423		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchos);
1424		break;
1425
1426	case ICMP6_ECHO_REPLY:
1427		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchoReplies);
1428		break;
1429
1430	case ND_ROUTER_SOLICIT:
1431		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterSolicits);
1432		break;
1433
1434	case ND_ROUTER_ADVERT:
1435		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterAdvertisements);
1436		break;
1437
1438	case ND_NEIGHBOR_SOLICIT:
1439		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutNeighborSolicits);
1440		break;
1441
1442	case ND_NEIGHBOR_ADVERT:
1443		BUMP_MIB(ill->ill_icmp6_mib,
1444		    ipv6IfIcmpOutNeighborAdvertisements);
1445		break;
1446
1447	case ND_REDIRECT:
1448		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRedirects);
1449		break;
1450
1451	case MLD_LISTENER_QUERY:
1452		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembQueries);
1453		break;
1454
1455	case MLD_LISTENER_REPORT:
1456	case MLD_V2_LISTENER_REPORT:
1457		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembResponses);
1458		break;
1459
1460	case MLD_LISTENER_REDUCTION:
1461		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembReductions);
1462		break;
1463	}
1464}
1465
1466/*
1467 * Check if it is ok to send an ICMPv6 error packet in
1468 * response to the IP packet in mp.
1469 * Free the message and return null if no
1470 * ICMP error packet should be sent.
1471 */
1472static mblk_t *
1473icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
1474{
1475	ill_t		*ill = ira->ira_ill;
1476	ip_stack_t	*ipst = ill->ill_ipst;
1477	boolean_t	llbcast;
1478	ip6_t		*ip6h;
1479
1480	if (!mp)
1481		return (NULL);
1482
1483	/* We view multicast and broadcast as the same.. */
1484	llbcast = (ira->ira_flags &
1485	    (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;
1486	ip6h = (ip6_t *)mp->b_rptr;
1487
1488	/* Check if source address uniquely identifies the host */
1489
1490	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src) ||
1491	    IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src) ||
1492	    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
1493		freemsg(mp);
1494		return (NULL);
1495	}
1496
1497	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1498		size_t	len_needed = IPV6_HDR_LEN + ICMP6_MINLEN;
1499		icmp6_t		*icmp6;
1500
1501		if (mp->b_wptr - mp->b_rptr < len_needed) {
1502			if (!pullupmsg(mp, len_needed)) {
1503				BUMP_MIB(ill->ill_icmp6_mib,
1504				    ipv6IfIcmpInErrors);
1505				freemsg(mp);
1506				return (NULL);
1507			}
1508			ip6h = (ip6_t *)mp->b_rptr;
1509		}
1510		icmp6 = (icmp6_t *)&ip6h[1];
1511		/* Explicitly do not generate errors in response to redirects */
1512		if (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
1513		    icmp6->icmp6_type == ND_REDIRECT) {
1514			freemsg(mp);
1515			return (NULL);
1516		}
1517	}
1518	/*
1519	 * Check that the destination is not multicast and that the packet
1520	 * was not sent on link layer broadcast or multicast.  (Exception
1521	 * is Packet too big message as per the draft - when mcast_ok is set.)
1522	 */
1523	if (!mcast_ok &&
1524	    (llbcast || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) {
1525		freemsg(mp);
1526		return (NULL);
1527	}
1528	/*
1529	 * If this is a labeled system, then check to see if we're allowed to
1530	 * send a response to this particular sender.  If not, then just drop.
1531	 */
1532	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
1533		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1534		freemsg(mp);
1535		return (NULL);
1536	}
1537
1538	if (icmp_err_rate_limit(ipst)) {
1539		/*
1540		 * Only send ICMP error packets every so often.
1541		 * This should be done on a per port/source basis,
1542		 * but for now this will suffice.
1543		 */
1544		freemsg(mp);
1545		return (NULL);
1546	}
1547	return (mp);
1548}
1549
1550/*
1551 * Called when a packet was sent out the same link that it arrived on.
1552 * Check if it is ok to send a redirect and then send it.
1553 */
1554void
1555ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
1556    ip_recv_attr_t *ira)
1557{
1558	ill_t		*ill = ira->ira_ill;
1559	ip_stack_t	*ipst = ill->ill_ipst;
1560	in6_addr_t	*v6targ;
1561	ire_t		*src_ire_v6 = NULL;
1562	mblk_t		*mp1;
1563	ire_t		*nhop_ire = NULL;
1564
1565	/*
1566	 * Don't send a redirect when forwarding a source
1567	 * routed packet.
1568	 */
1569	if (ip_source_routed_v6(ip6h, mp, ipst))
1570		return;
1571
1572	if (ire->ire_type & IRE_ONLINK) {
1573		/* Target is directly connected */
1574		v6targ = &ip6h->ip6_dst;
1575	} else {
1576		/* Determine the most specific IRE used to send the packets */
1577		nhop_ire = ire_nexthop(ire);
1578		if (nhop_ire == NULL)
1579			return;
1580
1581		/*
1582		 * We won't send redirects to a router
1583		 * that doesn't have a link local
1584		 * address, but will forward.
1585		 */
1586		if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) {
1587			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
1588			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1589			ire_refrele(nhop_ire);
1590			return;
1591		}
1592		v6targ = &nhop_ire->ire_addr_v6;
1593	}
1594	src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
1595	    NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
1596	    MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
1597
1598	if (src_ire_v6 == NULL) {
1599		if (nhop_ire != NULL)
1600			ire_refrele(nhop_ire);
1601		return;
1602	}
1603
1604	/*
1605	 * The source is directly connected.
1606	 */
1607	mp1 = copymsg(mp);
1608	if (mp1 != NULL)
1609		icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira);
1610
1611	if (nhop_ire != NULL)
1612		ire_refrele(nhop_ire);
1613	ire_refrele(src_ire_v6);
1614}
1615
1616/*
1617 * Generate an ICMPv6 redirect message.
1618 * Include target link layer address option if it exits.
1619 * Always include redirect header.
1620 */
1621static void
1622icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest,
1623    ip_recv_attr_t *ira)
1624{
1625	nd_redirect_t	*rd;
1626	nd_opt_rd_hdr_t	*rdh;
1627	uchar_t		*buf;
1628	ncec_t		*ncec = NULL;
1629	nd_opt_hdr_t	*opt;
1630	int		len;
1631	int		ll_opt_len = 0;
1632	int		max_redir_hdr_data_len;
1633	int		pkt_len;
1634	in6_addr_t	*srcp;
1635	ill_t		*ill;
1636	boolean_t	need_refrele;
1637	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
1638
1639	mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira);
1640	if (mp == NULL)
1641		return;
1642
1643	if (IS_UNDER_IPMP(ira->ira_ill)) {
1644		ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill);
1645		if (ill == NULL) {
1646			ill = ira->ira_ill;
1647			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1648			ip_drop_output("no IPMP ill for sending redirect",
1649			    mp, ill);
1650			freemsg(mp);
1651			return;
1652		}
1653		need_refrele = B_TRUE;
1654	} else {
1655		ill = ira->ira_ill;
1656		need_refrele = B_FALSE;
1657	}
1658
1659	ncec = ncec_lookup_illgrp_v6(ill, targetp);
1660	if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE &&
1661	    ncec->ncec_lladdr != NULL) {
1662		ll_opt_len = (sizeof (nd_opt_hdr_t) +
1663		    ill->ill_phys_addr_length + 7)/8 * 8;
1664	}
1665	len = sizeof (nd_redirect_t) + sizeof (nd_opt_rd_hdr_t) + ll_opt_len;
1666	ASSERT(len % 4 == 0);
1667	buf = kmem_alloc(len, KM_NOSLEEP);
1668	if (buf == NULL) {
1669		if (ncec != NULL)
1670			ncec_refrele(ncec);
1671		if (need_refrele)
1672			ill_refrele(ill);
1673		freemsg(mp);
1674		return;
1675	}
1676
1677	rd = (nd_redirect_t *)buf;
1678	rd->nd_rd_type = (uint8_t)ND_REDIRECT;
1679	rd->nd_rd_code = 0;
1680	rd->nd_rd_reserved = 0;
1681	rd->nd_rd_target = *targetp;
1682	rd->nd_rd_dst = *dest;
1683
1684	opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t));
1685	if (ncec != NULL && ll_opt_len != 0) {
1686		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1687		opt->nd_opt_len = ll_opt_len/8;
1688		bcopy((char *)ncec->ncec_lladdr, &opt[1],
1689		    ill->ill_phys_addr_length);
1690	}
1691	if (ncec != NULL)
1692		ncec_refrele(ncec);
1693	rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len);
1694	rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER;
1695	/* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */
1696	max_redir_hdr_data_len =
1697	    (ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len)/8*8;
1698	pkt_len = msgdsize(mp);
1699	/* Make sure mp is 8 byte aligned */
1700	if (pkt_len > max_redir_hdr_data_len) {
1701		rdh->nd_opt_rh_len = (max_redir_hdr_data_len +
1702		    sizeof (nd_opt_rd_hdr_t))/8;
1703		(void) adjmsg(mp, max_redir_hdr_data_len - pkt_len);
1704	} else {
1705		rdh->nd_opt_rh_len = (pkt_len + sizeof (nd_opt_rd_hdr_t))/8;
1706		(void) adjmsg(mp, -(pkt_len % 8));
1707	}
1708	rdh->nd_opt_rh_reserved1 = 0;
1709	rdh->nd_opt_rh_reserved2 = 0;
1710	/* ipif_v6lcl_addr contains the link-local source address */
1711	srcp = &ill->ill_ipif->ipif_v6lcl_addr;
1712
1713	/* Redirects sent by router, and router is global zone */
1714	ASSERT(ira->ira_zoneid == ALL_ZONES);
1715	ira->ira_zoneid = GLOBAL_ZONEID;
1716	icmp_pkt_v6(mp, buf, len, srcp, ira);
1717	kmem_free(buf, len);
1718	if (need_refrele)
1719		ill_refrele(ill);
1720}
1721
1722
1723/* Generate an ICMP time exceeded message.  (May be called as writer.) */
1724void
1725icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1726    ip_recv_attr_t *ira)
1727{
1728	icmp6_t	icmp6;
1729
1730	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1731	if (mp == NULL)
1732		return;
1733
1734	bzero(&icmp6, sizeof (icmp6_t));
1735	icmp6.icmp6_type = ICMP6_TIME_EXCEEDED;
1736	icmp6.icmp6_code = code;
1737	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1738}
1739
1740/*
1741 * Generate an ICMP unreachable message.
1742 * When called from ip_output side a minimal ip_recv_attr_t needs to be
1743 * constructed by the caller.
1744 */
1745void
1746icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1747    ip_recv_attr_t *ira)
1748{
1749	icmp6_t	icmp6;
1750
1751	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1752	if (mp == NULL)
1753		return;
1754
1755	bzero(&icmp6, sizeof (icmp6_t));
1756	icmp6.icmp6_type = ICMP6_DST_UNREACH;
1757	icmp6.icmp6_code = code;
1758	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1759}
1760
1761/*
1762 * Generate an ICMP pkt too big message.
1763 * When called from ip_output side a minimal ip_recv_attr_t needs to be
1764 * constructed by the caller.
1765 */
1766void
1767icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok,
1768    ip_recv_attr_t *ira)
1769{
1770	icmp6_t	icmp6;
1771
1772	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1773	if (mp == NULL)
1774		return;
1775
1776	bzero(&icmp6, sizeof (icmp6_t));
1777	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
1778	icmp6.icmp6_code = 0;
1779	icmp6.icmp6_mtu = htonl(mtu);
1780
1781	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1782}
1783
1784/*
1785 * Generate an ICMP parameter problem message. (May be called as writer.)
1786 * 'offset' is the offset from the beginning of the packet in error.
1787 * When called from ip_output side a minimal ip_recv_attr_t needs to be
1788 * constructed by the caller.
1789 */
1790static void
1791icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset,
1792    boolean_t mcast_ok, ip_recv_attr_t *ira)
1793{
1794	icmp6_t	icmp6;
1795
1796	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1797	if (mp == NULL)
1798		return;
1799
1800	bzero((char *)&icmp6, sizeof (icmp6_t));
1801	icmp6.icmp6_type = ICMP6_PARAM_PROB;
1802	icmp6.icmp6_code = code;
1803	icmp6.icmp6_pptr = htonl(offset);
1804	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1805}
1806
1807void
1808icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok,
1809    ip_recv_attr_t *ira)
1810{
1811	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
1812	uint16_t	hdr_length;
1813	uint8_t		*nexthdrp;
1814	uint32_t	offset;
1815	ill_t		*ill = ira->ira_ill;
1816
1817	/* Determine the offset of the bad nexthdr value */
1818	if (!ip_hdr_length_nexthdr_v6(mp, ip6h,	&hdr_length, &nexthdrp)) {
1819		/* Malformed packet */
1820		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1821		ip_drop_input("ipIfStatsInDiscards", mp, ill);
1822		freemsg(mp);
1823		return;
1824	}
1825
1826	offset = nexthdrp - mp->b_rptr;
1827	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset,
1828	    mcast_ok, ira);
1829}
1830
1831/*
1832 * Verify whether or not the IP address is a valid local address.
1833 * Could be a unicast, including one for a down interface.
1834 * If allow_mcbc then a multicast or broadcast address is also
1835 * acceptable.
1836 *
1837 * In the case of a multicast address, however, the
1838 * upper protocol is expected to reset the src address
1839 * to zero when we return IPVL_MCAST so that
1840 * no packets are emitted with multicast address as
1841 * source address.
1842 * The addresses valid for bind are:
1843 *	(1) - in6addr_any
1844 *	(2) - IP address of an UP interface
1845 *	(3) - IP address of a DOWN interface
1846 *	(4) - a multicast address. In this case
1847 *	the conn will only receive packets destined to
1848 *	the specified multicast address. Note: the
1849 *	application still has to issue an
1850 *	IPV6_JOIN_GROUP socket option.
1851 *
1852 * In all the above cases, the bound address must be valid in the current zone.
1853 * When the address is loopback or multicast, there might be many matching IREs
1854 * so bind has to look up based on the zone.
1855 */
1856ip_laddr_t
1857ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
1858    ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
1859{
1860	ire_t		*src_ire;
1861	uint_t		match_flags;
1862	ill_t		*ill = NULL;
1863
1864	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
1865	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));
1866
1867	match_flags = MATCH_IRE_ZONEONLY;
1868	if (scopeid != 0) {
1869		ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
1870		if (ill == NULL)
1871			return (IPVL_BAD);
1872		match_flags |= MATCH_IRE_ILL;
1873	}
1874
1875	src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
1876	    ill, zoneid, NULL, match_flags, 0, ipst, NULL);
1877	if (ill != NULL)
1878		ill_refrele(ill);
1879
1880	/*
1881	 * If an address other than in6addr_any is requested,
1882	 * we verify that it is a valid address for bind
1883	 * Note: Following code is in if-else-if form for
1884	 * readability compared to a condition check.
1885	 */
1886	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
1887		/*
1888		 * (2) Bind to address of local UP interface
1889		 */
1890		ire_refrele(src_ire);
1891		return (IPVL_UNICAST_UP);
1892	} else if (IN6_IS_ADDR_MULTICAST(v6src)) {
1893		/* (4) bind to multicast address. */
1894		if (src_ire != NULL)
1895			ire_refrele(src_ire);
1896
1897		/*
1898		 * Note: caller should take IPV6_MULTICAST_IF
1899		 * into account when selecting a real source address.
1900		 */
1901		if (allow_mcbc)
1902			return (IPVL_MCAST);
1903		else
1904			return (IPVL_BAD);
1905	} else {
1906		ipif_t *ipif;
1907
1908		/*
1909		 * (3) Bind to address of local DOWN interface?
1910		 * (ipif_lookup_addr() looks up all interfaces
1911		 * but we do not get here for UP interfaces
1912		 * - case (2) above)
1913		 */
1914		if (src_ire != NULL)
1915			ire_refrele(src_ire);
1916
1917		ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
1918		if (ipif == NULL)
1919			return (IPVL_BAD);
1920
1921		/* Not a useful source? */
1922		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
1923			ipif_refrele(ipif);
1924			return (IPVL_BAD);
1925		}
1926		ipif_refrele(ipif);
1927		return (IPVL_UNICAST_DOWN);
1928	}
1929}
1930
1931/*
1932 * Verify that both the source and destination addresses are valid.  If
1933 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
1934 * i.e. have no route to it.  Protocols like TCP want to verify destination
1935 * reachability, while tunnels do not.
1936 *
1937 * Determine the route, the interface, and (optionally) the source address
1938 * to use to reach a given destination.
1939 * Note that we allow connect to broadcast and multicast addresses when
1940 * IPDF_ALLOW_MCBC is set.
 * firsthop and dst_addr are normally the same, but with source routing
 * they differ; in that case firsthop is what we use for the routing
 * lookup, while the dce and label checks are done on dst_addr.
 *
1945 * If uinfo is set, then we fill in the best available information
1946 * we have for the destination. This is based on (in priority order) any
1947 * metrics and path MTU stored in a dce_t, route metrics, and finally the
1948 * ill_mtu/ill_mc_mtu.
1949 *
1950 * Tsol note: If we have a source route then dst_addr != firsthop. But we
1951 * always do the label check on dst_addr.
1952 *
1953 * Assumes that the caller has set ixa_scopeid for link-local communication.
1954 */
1955int
1956ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
1957    const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
1958    uint32_t flags, uint_t mac_mode)
1959{
1960	ire_t		*ire;
1961	int		error = 0;
1962	in6_addr_t	setsrc;				/* RTF_SETSRC */
1963	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
1964	ip_stack_t	*ipst = ixa->ixa_ipst;
1965	dce_t		*dce;
1966	uint_t		pmtu;
1967	uint_t		ifindex;
1968	uint_t		generation;
1969	nce_t		*nce;
1970	ill_t		*ill = NULL;
1971	boolean_t	multirt = B_FALSE;
1972
1973	ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));
1974
1975	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1976
1977	/*
1978	 * We never send to zero; the ULPs map it to the loopback address.
1979	 * We can't allow it since we use zero to mean unitialized in some
1980	 * places.
1981	 */
1982	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));
1983
1984	if (is_system_labeled()) {
1985		ts_label_t *tsl = NULL;
1986
1987		error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
1988		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
1989		if (error != 0)
1990			return (error);
1991		if (tsl != NULL) {
1992			/* Update the label */
1993			ip_xmit_attr_replace_tsl(ixa, tsl);
1994		}
1995	}
1996
1997	setsrc = ipv6_all_zeros;
1998	/*
1999	 * Select a route; For IPMP interfaces, we would only select
2000	 * a "hidden" route (i.e., going through a specific under_ill)
2001	 * if ixa_ifindex has been specified.
2002	 */
2003	ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
2004	    &setsrc, &error, &multirt);
2005	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
2006	if (error != 0)
2007		goto bad_addr;
2008
2009	/*
2010	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
2011	 * If IPDF_VERIFY_DST is set, the destination must be reachable.
2012	 * Otherwise the destination needn't be reachable.
2013	 *
2014	 * If we match on a reject or black hole, then we've got a
2015	 * local failure.  May as well fail out the connect() attempt,
2016	 * since it's never going to succeed.
2017	 */
2018	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2019		/*
2020		 * If we're verifying destination reachability, we always want
2021		 * to complain here.
2022		 *
2023		 * If we're not verifying destination reachability but the
2024		 * destination has a route, we still want to fail on the
2025		 * temporary address and broadcast address tests.
2026		 *
2027		 * In both cases do we let the code continue so some reasonable
2028		 * information is returned to the caller. That enables the
2029		 * caller to use (and even cache) the IRE. conn_ip_ouput will
2030		 * use the generation mismatch path to check for the unreachable
2031		 * case thereby avoiding any specific check in the main path.
2032		 */
2033		ASSERT(generation == IRE_GENERATION_VERIFY);
2034		if (flags & IPDF_VERIFY_DST) {
2035			/*
2036			 * Set errno but continue to set up ixa_ire to be
2037			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
2038			 * That allows callers to use ip_output to get an
2039			 * ICMP error back.
2040			 */
2041			if (!(ire->ire_type & IRE_HOST))
2042				error = ENETUNREACH;
2043			else
2044				error = EHOSTUNREACH;
2045		}
2046	}
2047
2048	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
2049	    !(flags & IPDF_ALLOW_MCBC)) {
2050		ire_refrele(ire);
2051		ire = ire_reject(ipst, B_FALSE);
2052		generation = IRE_GENERATION_VERIFY;
2053		error = ENETUNREACH;
2054	}
2055
2056	/* Cache things */
2057	if (ixa->ixa_ire != NULL)
2058		ire_refrele_notr(ixa->ixa_ire);
2059#ifdef DEBUG
2060	ire_refhold_notr(ire);
2061	ire_refrele(ire);
2062#endif
2063	ixa->ixa_ire = ire;
2064	ixa->ixa_ire_generation = generation;
2065
2066	/*
2067	 * Ensure that ixa_dce is always set any time that ixa_ire is set,
2068	 * since some callers will send a packet to conn_ip_output() even if
2069	 * there's an error.
2070	 */
2071	ifindex = 0;
2072	if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
2073		/* If we are creating a DCE we'd better have an ifindex */
2074		if (ill != NULL)
2075			ifindex = ill->ill_phyint->phyint_ifindex;
2076		else
2077			flags &= ~IPDF_UNIQUE_DCE;
2078	}
2079
2080	if (flags & IPDF_UNIQUE_DCE) {
2081		/* Fallback to the default dce if allocation fails */
2082		dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
2083		if (dce != NULL) {
2084			generation = dce->dce_generation;
2085		} else {
2086			dce = dce_lookup_v6(dst_addr, ifindex, ipst,
2087			    &generation);
2088		}
2089	} else {
2090		dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
2091	}
2092	ASSERT(dce != NULL);
2093	if (ixa->ixa_dce != NULL)
2094		dce_refrele_notr(ixa->ixa_dce);
2095#ifdef DEBUG
2096	dce_refhold_notr(dce);
2097	dce_refrele(dce);
2098#endif
2099	ixa->ixa_dce = dce;
2100	ixa->ixa_dce_generation = generation;
2101
2102
2103	/*
2104	 * For multicast with multirt we have a flag passed back from
2105	 * ire_lookup_multi_ill_v6 since we don't have an IRE for each
2106	 * possible multicast address.
2107	 * We also need a flag for multicast since we can't check
2108	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
2109	 */
2110	if (multirt) {
2111		ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
2112		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
2113	} else {
2114		ixa->ixa_postfragfn = ire->ire_postfragfn;
2115		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
2116	}
2117	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2118		/* Get an nce to cache. */
2119		nce = ire_to_nce(ire, 0, firsthop);
2120		if (nce == NULL) {
2121			/* Allocation failure? */
2122			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2123		} else {
2124			if (ixa->ixa_nce != NULL)
2125				nce_refrele(ixa->ixa_nce);
2126			ixa->ixa_nce = nce;
2127		}
2128	}
2129
2130	/*
2131	 * If the source address is a loopback address, the
2132	 * destination had best be local or multicast.
2133	 * If we are sending to an IRE_LOCAL using a loopback source then
2134	 * it had better be the same zoneid.
2135	 */
2136	if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
2137		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
2138			ire = NULL;	/* Stored in ixa_ire */
2139			error = EADDRNOTAVAIL;
2140			goto bad_addr;
2141		}
2142		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
2143			ire = NULL;	/* Stored in ixa_ire */
2144			error = EADDRNOTAVAIL;
2145			goto bad_addr;
2146		}
2147	}
2148
2149	/*
2150	 * Does the caller want us to pick a source address?
2151	 */
2152	if (flags & IPDF_SELECT_SRC) {
2153		in6_addr_t	src_addr;
2154
2155		/*
2156		 * We use use ire_nexthop_ill to avoid the under ipmp
2157		 * interface for source address selection. Note that for ipmp
2158		 * probe packets, ixa_ifindex would have been specified, and
2159		 * the ip_select_route() invocation would have picked an ire
2160		 * will ire_ill pointing at an under interface.
2161		 */
2162		ill = ire_nexthop_ill(ire);
2163
2164		/* If unreachable we have no ill but need some source */
2165		if (ill == NULL) {
2166			src_addr = ipv6_loopback;
2167			/* Make sure we look for a better source address */
2168			generation = SRC_GENERATION_VERIFY;
2169		} else {
2170			error = ip_select_source_v6(ill, &setsrc, dst_addr,
2171			    zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
2172			    &src_addr, &generation, NULL);
2173			if (error != 0) {
2174				ire = NULL;	/* Stored in ixa_ire */
2175				goto bad_addr;
2176			}
2177		}
2178
2179		/*
2180		 * We allow the source address to to down.
2181		 * However, we check that we don't use the loopback address
2182		 * as a source when sending out on the wire.
2183		 */
2184		if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
2185		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
2186		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2187			ire = NULL;	/* Stored in ixa_ire */
2188			error = EADDRNOTAVAIL;
2189			goto bad_addr;
2190		}
2191
2192		*src_addrp = src_addr;
2193		ixa->ixa_src_generation = generation;
2194	}
2195
2196	/*
2197	 * Make sure we don't leave an unreachable ixa_nce in place
2198	 * since ip_select_route is used when we unplumb i.e., remove
2199	 * references on ixa_ire, ixa_nce, and ixa_dce.
2200	 */
2201	nce = ixa->ixa_nce;
2202	if (nce != NULL && nce->nce_is_condemned) {
2203		nce_refrele(nce);
2204		ixa->ixa_nce = NULL;
2205		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2206	}
2207
2208	/*
2209	 * Note that IPv6 multicast supports PMTU discovery unlike IPv4
2210	 * multicast. But pmtu discovery is only enabled for connected
2211	 * sockets in general.
2212	 */
2213
2214	/*
2215	 * Set initial value for fragmentation limit.  Either conn_ip_output
2216	 * or ULP might updates it when there are routing changes.
2217	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
2218	 */
2219	pmtu = ip_get_pmtu(ixa);
2220	ixa->ixa_fragsize = pmtu;
2221	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
2222	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
2223		ixa->ixa_pmtu = pmtu;
2224
2225	/*
2226	 * Extract information useful for some transports.
2227	 * First we look for DCE metrics. Then we take what we have in
2228	 * the metrics in the route, where the offlink is used if we have
2229	 * one.
2230	 */
2231	if (uinfo != NULL) {
2232		bzero(uinfo, sizeof (*uinfo));
2233
2234		if (dce->dce_flags & DCEF_UINFO)
2235			*uinfo = dce->dce_uinfo;
2236
2237		rts_merge_metrics(uinfo, &ire->ire_metrics);
2238
2239		/* Allow ire_metrics to decrease the path MTU from above */
2240		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
2241			uinfo->iulp_mtu = pmtu;
2242
2243		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
2244		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
2245		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
2246	}
2247
2248	if (ill != NULL)
2249		ill_refrele(ill);
2250
2251	return (error);
2252
2253bad_addr:
2254	if (ire != NULL)
2255		ire_refrele(ire);
2256
2257	if (ill != NULL)
2258		ill_refrele(ill);
2259
2260	/*
2261	 * Make sure we don't leave an unreachable ixa_nce in place
2262	 * since ip_select_route is used when we unplumb i.e., remove
2263	 * references on ixa_ire, ixa_nce, and ixa_dce.
2264	 */
2265	nce = ixa->ixa_nce;
2266	if (nce != NULL && nce->nce_is_condemned) {
2267		nce_refrele(nce);
2268		ixa->ixa_nce = NULL;
2269		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2270	}
2271
2272	return (error);
2273}
2274
2275/*
2276 * Handle protocols with which IP is less intimate.  There
2277 * can be more than one stream bound to a particular
2278 * protocol.  When this is the case, normally each one gets a copy
2279 * of any incoming packets.
2280 *
2281 * Zones notes:
2282 * Packets will be distributed to conns in all zones. This is really only
2283 * useful for ICMPv6 as only applications in the global zone can create raw
2284 * sockets for other protocols.
2285 */
2286void
2287ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
2288{
2289	mblk_t		*mp1;
2290	in6_addr_t	laddr = ip6h->ip6_dst;
2291	conn_t		*connp, *first_connp, *next_connp;
2292	connf_t		*connfp;
2293	ill_t		*ill = ira->ira_ill;
2294	ip_stack_t	*ipst = ill->ill_ipst;
2295
2296	connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol];
2297	mutex_enter(&connfp->connf_lock);
2298	connp = connfp->connf_head;
2299	for (connp = connfp->connf_head; connp != NULL;
2300	    connp = connp->conn_next) {
2301		/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2302		if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2303		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2304		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2305			break;
2306	}
2307
2308	if (connp == NULL) {
2309		/*
2310		 * No one bound to this port.  Is
2311		 * there a client that wants all
2312		 * unclaimed datagrams?
2313		 */
2314		mutex_exit(&connfp->connf_lock);
2315		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
2316		    ICMP6_PARAMPROB_NEXTHEADER, ira);
2317		return;
2318	}
2319
2320	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
2321
2322	CONN_INC_REF(connp);
2323	first_connp = connp;
2324
2325	/*
2326	 * XXX: Fix the multiple protocol listeners case. We should not
2327	 * be walking the conn->conn_next list here.
2328	 */
2329	connp = connp->conn_next;
2330	for (;;) {
2331		while (connp != NULL) {
2332			/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2333			if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2334			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2335			    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2336			    ira, connp)))
2337				break;
2338			connp = connp->conn_next;
2339		}
2340
2341		if (connp == NULL) {
2342			/* No more interested clients */
2343			connp = first_connp;
2344			break;
2345		}
2346		if (((mp1 = dupmsg(mp)) == NULL) &&
2347		    ((mp1 = copymsg(mp)) == NULL)) {
2348			/* Memory allocation failed */
2349			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2350			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2351			connp = first_connp;
2352			break;
2353		}
2354
2355		CONN_INC_REF(connp);
2356		mutex_exit(&connfp->connf_lock);
2357
2358		ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr,
2359		    ira);
2360
2361		mutex_enter(&connfp->connf_lock);
2362		/* Follow the next pointer before releasing the conn. */
2363		next_connp = connp->conn_next;
2364		CONN_DEC_REF(connp);
2365		connp = next_connp;
2366	}
2367
2368	/* Last one.  Send it upstream. */
2369	mutex_exit(&connfp->connf_lock);
2370
2371	ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira);
2372
2373	CONN_DEC_REF(connp);
2374}
2375
2376/*
2377 * Called when it is conceptually a ULP that would sent the packet
2378 * e.g., port unreachable and nexthdr unknown. Check that the packet
2379 * would have passed the IPsec global policy before sending the error.
2380 *
2381 * Send an ICMP error after patching up the packet appropriately.
2382 * Uses ip_drop_input and bumps the appropriate MIB.
2383 * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use.
2384 */
2385void
2386ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code,
2387    ip_recv_attr_t *ira)
2388{
2389	ip6_t		*ip6h;
2390	boolean_t	secure;
2391	ill_t		*ill = ira->ira_ill;
2392	ip_stack_t	*ipst = ill->ill_ipst;
2393	netstack_t	*ns = ipst->ips_netstack;
2394	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2395
2396	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
2397
2398	/*
2399	 * We are generating an icmp error for some inbound packet.
2400	 * Called from all ip_fanout_(udp, tcp, proto) functions.
2401	 * Before we generate an error, check with global policy
2402	 * to see whether this is allowed to enter the system. As
2403	 * there is no "conn", we are checking with global policy.
2404	 */
2405	ip6h = (ip6_t *)mp->b_rptr;
2406	if (secure || ipss->ipsec_inbound_v6_policy_present) {
2407		mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns);
2408		if (mp == NULL)
2409			return;
2410	}
2411
2412	/* We never send errors for protocols that we do implement */
2413	if (ira->ira_protocol == IPPROTO_ICMPV6) {
2414		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2415		ip_drop_input("ip_fanout_send_icmp_v6", mp, ill);
2416		freemsg(mp);
2417		return;
2418	}
2419
2420	switch (icmp_type) {
2421	case ICMP6_DST_UNREACH:
2422		ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT);
2423
2424		BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
2425		ip_drop_input("ipIfStatsNoPorts", mp, ill);
2426
2427		icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira);
2428		break;
2429	case ICMP6_PARAM_PROB:
2430		ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER);
2431
2432		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
2433		ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
2434
2435		/* Let the system determine the offset for this one */
2436		icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
2437		break;
2438	default:
2439#ifdef DEBUG
2440		panic("ip_fanout_send_icmp_v6: wrong type");
2441		/*NOTREACHED*/
2442#else
2443		freemsg(mp);
2444		break;
2445#endif
2446	}
2447}
2448
2449/*
2450 * Fanout for UDP packets that are multicast or ICMP errors.
2451 * (Unicast fanout is handled in ip_input_v6.)
2452 *
2453 * If SO_REUSEADDR is set all multicast packets
2454 * will be delivered to all conns bound to the same port.
2455 *
2456 * Fanout for UDP packets.
2457 * The caller puts <fport, lport> in the ports parameter.
2458 * ire_type must be IRE_BROADCAST for multicast and broadcast packets.
2459 *
2460 * If SO_REUSEADDR is set all multicast and broadcast packets
2461 * will be delivered to all conns bound to the same port.
2462 *
2463 * Zones notes:
2464 * Earlier in ip_input on a system with multiple shared-IP zones we
2465 * duplicate the multicast and broadcast packets and send them up
2466 * with each explicit zoneid that exists on that ill.
2467 * This means that here we can match the zoneid with SO_ALLZONES being special.
2468 */
2469void
2470ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
2471    ip_recv_attr_t *ira)
2472{
2473	in6_addr_t	laddr;
2474	conn_t		*connp;
2475	connf_t		*connfp;
2476	in6_addr_t	faddr;
2477	ill_t		*ill = ira->ira_ill;
2478	ip_stack_t	*ipst = ill->ill_ipst;
2479
2480	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
2481
2482	laddr = ip6h->ip6_dst;
2483	faddr = ip6h->ip6_src;
2484
2485	/* Attempt to find a client stream based on destination port. */
2486	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
2487	mutex_enter(&connfp->connf_lock);
2488	connp = connfp->connf_head;
2489	while (connp != NULL) {
2490		if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
2491		    conn_wantpacket_v6(connp, ira, ip6h) &&
2492		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2493		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2494			break;
2495		connp = connp->conn_next;
2496	}
2497
2498	if (connp == NULL)
2499		goto notfound;
2500
2501	CONN_INC_REF(connp);
2502
2503	if (connp->conn_reuseaddr) {
2504		conn_t		*first_connp = connp;
2505		conn_t		*next_connp;
2506		mblk_t		*mp1;
2507
2508		connp = connp->conn_next;
2509		for (;;) {
2510			while (connp != NULL) {
2511				if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
2512				    fport, faddr) &&
2513				    conn_wantpacket_v6(connp, ira, ip6h) &&
2514				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2515				    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2516				    ira, connp)))
2517					break;
2518				connp = connp->conn_next;
2519			}
2520			if (connp == NULL) {
2521				/* No more interested clients */
2522				connp = first_connp;
2523				break;
2524			}
2525			if (((mp1 = dupmsg(mp)) == NULL) &&
2526			    ((mp1 = copymsg(mp)) == NULL)) {
2527				/* Memory allocation failed */
2528				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2529				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2530				connp = first_connp;
2531				break;
2532			}
2533
2534			CONN_INC_REF(connp);
2535			mutex_exit(&connfp->connf_lock);
2536
2537			IP6_STAT(ipst, ip6_udp_fanmb);
2538			ip_fanout_udp_conn(connp, mp1, NULL,
2539			    (ip6_t *)mp1->b_rptr, ira);
2540
2541			mutex_enter(&connfp->connf_lock);
2542			/* Follow the next pointer before releasing the conn. */
2543			next_connp = connp->conn_next;
2544			IP6_STAT(ipst, ip6_udp_fanmb);
2545			CONN_DEC_REF(connp);
2546			connp = next_connp;
2547		}
2548	}
2549
2550	/* Last one.  Send it upstream. */
2551	mutex_exit(&connfp->connf_lock);
2552
2553	IP6_STAT(ipst, ip6_udp_fanmb);
2554	ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
2555	CONN_DEC_REF(connp);
2556	return;
2557
2558notfound:
2559	mutex_exit(&connfp->connf_lock);
2560	/*
2561	 * No one bound to this port.  Is
2562	 * there a client that wants all
2563	 * unclaimed datagrams?
2564	 */
2565	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
2566		ASSERT(ira->ira_protocol == IPPROTO_UDP);
2567		ip_fanout_proto_v6(mp, ip6h, ira);
2568	} else {
2569		ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
2570		    ICMP6_DST_UNREACH_NOPORT, ira);
2571	}
2572}
2573
2574/*
2575 * int ip_find_hdr_v6()
2576 *
2577 * This routine is used by the upper layer protocols, iptun, and IPsec:
2578 * - Set extension header pointers to appropriate locations
2579 * - Determine IPv6 header length and return it
2580 * - Return a pointer to the last nexthdr value
2581 *
2582 * The caller must initialize ipp_fields.
2583 * The upper layer protocols normally set label_separate which makes the
2584 * routine put the TX label in ipp_label_v6. If this is not set then
2585 * the hop-by-hop options including the label are placed in ipp_hopopts.
2586 *
2587 * NOTE: If multiple extension headers of the same type are present,
2588 * ip_find_hdr_v6() will set the respective extension header pointers
2589 * to the first one that it encounters in the IPv6 header.  It also
2590 * skips fragment headers.  This routine deals with malformed packets
2591 * of various sorts in which case the returned length is up to the
2592 * malformed part.
2593 */
2594int
2595ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp,
2596    uint8_t *nexthdrp)
2597{
2598	uint_t	length, ehdrlen;
2599	uint8_t nexthdr;
2600	uint8_t *whereptr, *endptr;
2601	ip6_dest_t *tmpdstopts;
2602	ip6_rthdr_t *tmprthdr;
2603	ip6_hbh_t *tmphopopts;
2604	ip6_frag_t *tmpfraghdr;
2605
2606	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
2607	ipp->ipp_hoplimit = ip6h->ip6_hops;
2608	ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
2609	ipp->ipp_addr = ip6h->ip6_dst;
2610
2611	length = IPV6_HDR_LEN;
2612	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2613	endptr = mp->b_wptr;
2614
2615	nexthdr = ip6h->ip6_nxt;
2616	while (whereptr < endptr) {
2617		/* Is there enough left for len + nexthdr? */
2618		if (whereptr + MIN_EHDR_LEN > endptr)
2619			goto done;
2620
2621		switch (nexthdr) {
2622		case IPPROTO_HOPOPTS: {
2623			/* We check for any CIPSO */
2624			uchar_t *secopt;
2625			boolean_t hbh_needed;
2626			uchar_t *after_secopt;
2627
2628			tmphopopts = (ip6_hbh_t *)whereptr;
2629			ehdrlen = 8 * (tmphopopts->ip6h_len + 1);
2630			if ((uchar_t *)tmphopopts +  ehdrlen > endptr)
2631				goto done;
2632			nexthdr = tmphopopts->ip6h_nxt;
2633
2634			if (!label_separate) {
2635				secopt = NULL;
2636				after_secopt = whereptr;
2637			} else {
2638				/*
2639				 * We have dropped packets with bad options in
2640				 * ip6_input. No need to check return value
2641				 * here.
2642				 */
2643				(void) tsol_find_secopt_v6(whereptr, ehdrlen,
2644				    &secopt, &after_secopt, &hbh_needed);
2645			}
2646			if (secopt != NULL && after_secopt - whereptr > 0) {
2647				ipp->ipp_fields |= IPPF_LABEL_V6;
2648				ipp->ipp_label_v6 = secopt;
2649				ipp->ipp_label_len_v6 = after_secopt - whereptr;
2650			} else {
2651				ipp->ipp_label_len_v6 = 0;
2652				after_secopt = whereptr;
2653				hbh_needed = B_TRUE;
2654			}
2655			/* return only 1st hbh */
2656			if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) {
2657				ipp->ipp_fields |= IPPF_HOPOPTS;
2658				ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt;
2659				ipp->ipp_hopoptslen = ehdrlen -
2660				    ipp->ipp_label_len_v6;
2661			}
2662			break;
2663		}
2664		case IPPROTO_DSTOPTS:
2665			tmpdstopts = (ip6_dest_t *)whereptr;
2666			ehdrlen = 8 * (tmpdstopts->ip6d_len + 1);
2667			if ((uchar_t *)tmpdstopts +  ehdrlen > endptr)
2668				goto done;
2669			nexthdr = tmpdstopts->ip6d_nxt;
2670			/*
2671			 * ipp_dstopts is set to the destination header after a
2672			 * routing header.
2673			 * Assume it is a post-rthdr destination header
2674			 * and adjust when we find an rthdr.
2675			 */
2676			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2677				ipp->ipp_fields |= IPPF_DSTOPTS;
2678				ipp->ipp_dstopts = tmpdstopts;
2679				ipp->ipp_dstoptslen = ehdrlen;
2680			}
2681			break;
2682		case IPPROTO_ROUTING:
2683			tmprthdr = (ip6_rthdr_t *)whereptr;
2684			ehdrlen = 8 * (tmprthdr->ip6r_len + 1);
2685			if ((uchar_t *)tmprthdr +  ehdrlen > endptr)
2686				goto done;
2687			nexthdr = tmprthdr->ip6r_nxt;
2688			/* return only 1st rthdr */
2689			if (!(ipp->ipp_fields & IPPF_RTHDR)) {
2690				ipp->ipp_fields |= IPPF_RTHDR;
2691				ipp->ipp_rthdr = tmprthdr;
2692				ipp->ipp_rthdrlen = ehdrlen;
2693			}
2694			/*
2695			 * Make any destination header we've seen be a
2696			 * pre-rthdr destination header.
2697			 */
2698			if (ipp->ipp_fields & IPPF_DSTOPTS) {
2699				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2700				ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
2701				ipp->ipp_rthdrdstopts = ipp->ipp_dstopts;
2702				ipp->ipp_dstopts = NULL;
2703				ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen;
2704				ipp->ipp_dstoptslen = 0;
2705			}
2706			break;
2707		case IPPROTO_FRAGMENT:
2708			tmpfraghdr = (ip6_frag_t *)whereptr;
2709			ehdrlen = sizeof (ip6_frag_t);
2710			if ((uchar_t *)tmpfraghdr + ehdrlen > endptr)
2711				goto done;
2712			nexthdr = tmpfraghdr->ip6f_nxt;
2713			if (!(ipp->ipp_fields & IPPF_FRAGHDR)) {
2714				ipp->ipp_fields |= IPPF_FRAGHDR;
2715				ipp->ipp_fraghdr = tmpfraghdr;
2716				ipp->ipp_fraghdrlen = ehdrlen;
2717			}
2718			break;
2719		case IPPROTO_NONE:
2720		default:
2721			goto done;
2722		}
2723		length += ehdrlen;
2724		whereptr += ehdrlen;
2725	}
2726done:
2727	if (nexthdrp != NULL)
2728		*nexthdrp = nexthdr;
2729	return (length);
2730}
2731
2732/*
2733 * Try to determine where and what are the IPv6 header length and
2734 * pointer to nexthdr value for the upper layer protocol (or an
2735 * unknown next hdr).
2736 *
2737 * Parameters returns a pointer to the nexthdr value;
2738 * Must handle malformed packets of various sorts.
2739 * Function returns failure for malformed cases.
2740 */
2741boolean_t
2742ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
2743    uint8_t **nexthdrpp)
2744{
2745	uint16_t length;
2746	uint_t	ehdrlen;
2747	uint8_t	*nexthdrp;
2748	uint8_t *whereptr;
2749	uint8_t *endptr;
2750	ip6_dest_t *desthdr;
2751	ip6_rthdr_t *rthdr;
2752	ip6_frag_t *fraghdr;
2753
2754	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
2755	length = IPV6_HDR_LEN;
2756	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2757	endptr = mp->b_wptr;
2758
2759	nexthdrp = &ip6h->ip6_nxt;
2760	while (whereptr < endptr) {
2761		/* Is there enough left for len + nexthdr? */
2762		if (whereptr + MIN_EHDR_LEN > endptr)
2763			break;
2764
2765		switch (*nexthdrp) {
2766		case IPPROTO_HOPOPTS:
2767		case IPPROTO_DSTOPTS:
2768			/* Assumes the headers are identical for hbh and dst */
2769			desthdr = (ip6_dest_t *)whereptr;
2770			ehdrlen = 8 * (desthdr->ip6d_len + 1);
2771			if ((uchar_t *)desthdr +  ehdrlen > endptr)
2772				return (B_FALSE);
2773			nexthdrp = &desthdr->ip6d_nxt;
2774			break;
2775		case IPPROTO_ROUTING:
2776			rthdr = (ip6_rthdr_t *)whereptr;
2777			ehdrlen =  8 * (rthdr->ip6r_len + 1);
2778			if ((uchar_t *)rthdr +  ehdrlen > endptr)
2779				return (B_FALSE);
2780			nexthdrp = &rthdr->ip6r_nxt;
2781			break;
2782		case IPPROTO_FRAGMENT:
2783			fraghdr = (ip6_frag_t *)whereptr;
2784			ehdrlen = sizeof (ip6_frag_t);
2785			if ((uchar_t *)&fraghdr[1] > endptr)
2786				return (B_FALSE);
2787			nexthdrp = &fraghdr->ip6f_nxt;
2788			break;
2789		case IPPROTO_NONE:
2790			/* No next header means we're finished */
2791		default:
2792			*hdr_length_ptr = length;
2793			*nexthdrpp = nexthdrp;
2794			return (B_TRUE);
2795		}
2796		length += ehdrlen;
2797		whereptr += ehdrlen;
2798		*hdr_length_ptr = length;
2799		*nexthdrpp = nexthdrp;
2800	}
2801	switch (*nexthdrp) {
2802	case IPPROTO_HOPOPTS:
2803	case IPPROTO_DSTOPTS:
2804	case IPPROTO_ROUTING:
2805	case IPPROTO_FRAGMENT:
2806		/*
2807		 * If any know extension headers are still to be processed,
2808		 * the packet's malformed (or at least all the IP header(s) are
2809		 * not in the same mblk - and that should never happen.
2810		 */
2811		return (B_FALSE);
2812
2813	default:
2814		/*
2815		 * If we get here, we know that all of the IP headers were in
2816		 * the same mblk, even if the ULP header is in the next mblk.
2817		 */
2818		*hdr_length_ptr = length;
2819		*nexthdrpp = nexthdrp;
2820		return (B_TRUE);
2821	}
2822}
2823
2824/*
2825 * Return the length of the IPv6 related headers (including extension headers)
2826 * Returns a length even if the packet is malformed.
2827 */
2828int
2829ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
2830{
2831	uint16_t hdr_len;
2832	uint8_t	*nexthdrp;
2833
2834	(void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, &nexthdrp);
2835	return (hdr_len);
2836}
2837
2838/*
2839 * Parse and process any hop-by-hop or destination options.
2840 *
2841 * Assumes that q is an ill read queue so that ICMP errors for link-local
2842 * destinations are sent out the correct interface.
2843 *
2844 * Returns -1 if there was an error and mp has been consumed.
2845 * Returns 0 if no special action is needed.
2846 * Returns 1 if the packet contained a router alert option for this node
2847 * which is verified to be "interesting/known" for our implementation.
2848 *
2849 * XXX Note: In future as more hbh or dest options are defined,
2850 * it may be better to have different routines for hbh and dest
2851 * options as opt_type fields other than IP6OPT_PAD1 and IP6OPT_PADN
2852 * may have same value in different namespaces. Or is it same namespace ??
2853 * Current code checks for each opt_type (other than pads) if it is in
2854 * the expected  nexthdr (hbh or dest)
2855 */
2856int
2857ip_process_options_v6(mblk_t *mp, ip6_t *ip6h,
2858    uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira)
2859{
2860	uint8_t opt_type;
2861	uint_t optused = 0;
2862	int ret = 0;
2863	const char *errtype;
2864	ill_t		*ill = ira->ira_ill;
2865	ip_stack_t	*ipst = ill->ill_ipst;
2866
2867	while (optlen != 0) {
2868		opt_type = *optptr;
2869		if (opt_type == IP6OPT_PAD1) {
2870			optused = 1;
2871		} else {
2872			if (optlen < 2)
2873				goto bad_opt;
2874			errtype = "malformed";
2875			if (opt_type == ip6opt_ls) {
2876				optused = 2 + optptr[1];
2877				if (optused > optlen)
2878					goto bad_opt;
2879			} else switch (opt_type) {
2880			case IP6OPT_PADN:
2881				/*
2882				 * Note:We don't verify that (N-2) pad octets
2883				 * are zero as required by spec. Adhere to
2884				 * "be liberal in what you accept..." part of
2885				 * implementation philosophy (RFC791,RFC1122)
2886				 */
2887				optused = 2 + optptr[1];
2888				if (optused > optlen)
2889					goto bad_opt;
2890				break;
2891
2892			case IP6OPT_JUMBO:
2893				if (hdr_type != IPPROTO_HOPOPTS)
2894					goto opt_error;
2895				goto opt_error; /* XXX Not implemented! */
2896
2897			case IP6OPT_ROUTER_ALERT: {
2898				struct ip6_opt_router *or;
2899
2900				if (hdr_type != IPPROTO_HOPOPTS)
2901					goto opt_error;
2902				optused = 2 + optptr[1];
2903				if (optused > optlen)
2904					goto bad_opt;
2905				or = (struct ip6_opt_router *)optptr;
2906				/* Check total length and alignment */
2907				if (optused != sizeof (*or) ||
2908				    ((uintptr_t)or->ip6or_value & 0x1) != 0)
2909					goto opt_error;
2910				/* Check value */
2911				switch (*((uint16_t *)or->ip6or_value)) {
2912				case IP6_ALERT_MLD:
2913				case IP6_ALERT_RSVP:
2914					ret = 1;
2915				}
2916				break;
2917			}
2918			case IP6OPT_HOME_ADDRESS: {
2919				/*
2920				 * Minimal support for the home address option
2921				 * (which is required by all IPv6 nodes).
2922				 * Implement by just swapping the home address
2923				 * and source address.
2924				 * XXX Note: this has IPsec implications since
2925				 * AH needs to take this into account.
2926				 * Also, when IPsec is used we need to ensure
2927				 * that this is only processed once
2928				 * in the received packet (to avoid swapping
2929				 * back and forth).
2930				 * NOTE:This option processing is considered
2931				 * to be unsafe and prone to a denial of
2932				 * service attack.
2933				 * The current processing is not safe even with
2934				 * IPsec secured IP packets. Since the home
2935				 * address option processing requirement still
2936				 * is in the IETF draft and in the process of
2937				 * being redefined for its usage, it has been
2938				 * decided to turn off the option by default.
2939				 * If this section of code needs to be executed,
2940				 * ndd variable ip6_ignore_home_address_opt
2941				 * should be set to 0 at the user's own risk.
2942				 */
2943				struct ip6_opt_home_address *oh;
2944				in6_addr_t tmp;
2945
2946				if (ipst->ips_ipv6_ignore_home_address_opt)
2947					goto opt_error;
2948
2949				if (hdr_type != IPPROTO_DSTOPTS)
2950					goto opt_error;
2951				optused = 2 + optptr[1];
2952				if (optused > optlen)
2953					goto bad_opt;
2954
2955				/*
2956				 * We did this dest. opt the first time
2957				 * around (i.e. before AH processing).
2958				 * If we've done AH... stop now.
2959				 */
2960				if ((ira->ira_flags & IRAF_IPSEC_SECURE) &&
2961				    ira->ira_ipsec_ah_sa != NULL)
2962					break;
2963
2964				oh = (struct ip6_opt_home_address *)optptr;
2965				/* Check total length and alignment */
2966				if (optused < sizeof (*oh) ||
2967				    ((uintptr_t)oh->ip6oh_addr & 0x7) != 0)
2968					goto opt_error;
2969				/* Swap ip6_src and the home address */
2970				tmp = ip6h->ip6_src;
2971				/* XXX Note: only 8 byte alignment option */
2972				ip6h->ip6_src = *(in6_addr_t *)oh->ip6oh_addr;
2973				*(in6_addr_t *)oh->ip6oh_addr = tmp;
2974				break;
2975			}
2976
2977			case IP6OPT_TUNNEL_LIMIT:
2978				if (hdr_type != IPPROTO_DSTOPTS) {
2979					goto opt_error;
2980				}
2981				optused = 2 + optptr[1];
2982				if (optused > optlen) {
2983					goto bad_opt;
2984				}
2985				if (optused != 3) {
2986					goto opt_error;
2987				}
2988				break;
2989
2990			default:
2991				errtype = "unknown";
2992				/* FALLTHROUGH */
2993			opt_error:
2994				/* Determine which zone should send error */
2995				switch (IP6OPT_TYPE(opt_type)) {
2996				case IP6OPT_TYPE_SKIP:
2997					optused = 2 + optptr[1];
2998					if (optused > optlen)
2999						goto bad_opt;
3000					ip1dbg(("ip_process_options_v6: %s "
3001					    "opt 0x%x skipped\n",
3002					    errtype, opt_type));
3003					break;
3004				case IP6OPT_TYPE_DISCARD:
3005					ip1dbg(("ip_process_options_v6: %s "
3006					    "opt 0x%x; packet dropped\n",
3007					    errtype, opt_type));
3008					BUMP_MIB(ill->ill_ip_mib,
3009					    ipIfStatsInHdrErrors);
3010					ip_drop_input("ipIfStatsInHdrErrors",
3011					    mp, ill);
3012					freemsg(mp);
3013					return (-1);
3014				case IP6OPT_TYPE_ICMP:
3015					BUMP_MIB(ill->ill_ip_mib,
3016					    ipIfStatsInHdrErrors);
3017					ip_drop_input("ipIfStatsInHdrErrors",
3018					    mp, ill);
3019					icmp_param_problem_v6(mp,
3020					    ICMP6_PARAMPROB_OPTION,
3021					    (uint32_t)(optptr -
3022					    (uint8_t *)ip6h),
3023					    B_FALSE, ira);
3024					return (-1);
3025				case IP6OPT_TYPE_FORCEICMP:
3026					BUMP_MIB(ill->ill_ip_mib,
3027					    ipIfStatsInHdrErrors);
3028					ip_drop_input("ipIfStatsInHdrErrors",
3029					    mp, ill);
3030					icmp_param_problem_v6(mp,
3031					    ICMP6_PARAMPROB_OPTION,
3032					    (uint32_t)(optptr -
3033					    (uint8_t *)ip6h),
3034					    B_TRUE, ira);
3035					return (-1);
3036				default:
3037					ASSERT(0);
3038				}
3039			}
3040		}
3041		optlen -= optused;
3042		optptr += optused;
3043	}
3044	return (ret);
3045
3046bad_opt:
3047	/* Determine which zone should send error */
3048	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3049	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION,
3050	    (uint32_t)(optptr - (uint8_t *)ip6h),
3051	    B_FALSE, ira);
3052	return (-1);
3053}
3054
3055/*
3056 * Process a routing header that is not yet empty.
3057 * Because of RFC 5095, we now reject all route headers.
3058 */
3059void
3060ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
3061    ip_recv_attr_t *ira)
3062{
3063	ill_t		*ill = ira->ira_ill;
3064	ip_stack_t	*ipst = ill->ill_ipst;
3065
3066	ASSERT(rth->ip6r_segleft != 0);
3067
3068	if (!ipst->ips_ipv6_forward_src_routed) {
3069		/* XXX Check for source routed out same interface? */
3070		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
3071		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
3072		ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
3073		freemsg(mp);
3074		return;
3075	}
3076
3077	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3078	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3079	    (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h),
3080	    B_FALSE, ira);
3081}
3082
3083/*
3084 * Read side put procedure for IPv6 module.
3085 */
3086int
3087ip_rput_v6(queue_t *q, mblk_t *mp)
3088{
3089	ill_t		*ill;
3090
3091	ill = (ill_t *)q->q_ptr;
3092	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
3093		union DL_primitives *dl;
3094
3095		dl = (union DL_primitives *)mp->b_rptr;
3096		/*
3097		 * Things are opening or closing - only accept DLPI
3098		 * ack messages. If the stream is closing and ip_wsrv
3099		 * has completed, ip_close is out of the qwait, but has
3100		 * not yet completed qprocsoff. Don't proceed any further
3101		 * because the ill has been cleaned up and things hanging
3102		 * off the ill have been freed.
3103		 */
3104		if ((mp->b_datap->db_type != M_PCPROTO) ||
3105		    (dl->dl_primitive == DL_UNITDATA_IND)) {
3106			inet_freemsg(mp);
3107			return (0);
3108		}
3109	}
3110	if (DB_TYPE(mp) == M_DATA) {
3111		struct mac_header_info_s mhi;
3112
3113		ip_mdata_to_mhi(ill, mp, &mhi);
3114		ip_input_v6(ill, NULL, mp, &mhi);
3115	} else {
3116		ip_rput_notdata(ill, mp);
3117	}
3118	return (0);
3119}
3120
3121/*
3122 * Walk through the IPv6 packet in mp and see if there's an AH header
3123 * in it.  See if the AH header needs to get done before other headers in
3124 * the packet.  (Worker function for ipsec_early_ah_v6().)
3125 */
3126#define	IPSEC_HDR_DONT_PROCESS	0
3127#define	IPSEC_HDR_PROCESS	1
3128#define	IPSEC_MEMORY_ERROR	2 /* or malformed packet */
3129static int
3130ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
3131{
3132	uint_t	length;
3133	uint_t	ehdrlen;
3134	uint8_t *whereptr;
3135	uint8_t *endptr;
3136	uint8_t *nexthdrp;
3137	ip6_dest_t *desthdr;
3138	ip6_rthdr_t *rthdr;
3139	ip6_t	*ip6h;
3140
3141	/*
3142	 * For now just pullup everything.  In general, the less pullups,
3143	 * the better, but there's so much squirrelling through anyway,
3144	 * it's just easier this way.
3145	 */
3146	if (!pullupmsg(mp, -1)) {
3147		return (IPSEC_MEMORY_ERROR);
3148	}
3149
3150	ip6h = (ip6_t *)mp->b_rptr;
3151	length = IPV6_HDR_LEN;
3152	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
3153	endptr = mp->b_wptr;
3154
3155	/*
3156	 * We can't just use the argument nexthdr in the place
3157	 * of nexthdrp becaue we don't dereference nexthdrp
3158	 * till we confirm whether it is a valid address.
3159	 */
3160	nexthdrp = &ip6h->ip6_nxt;
3161	while (whereptr < endptr) {
3162		/* Is there enough left for len + nexthdr? */
3163		if (whereptr + MIN_EHDR_LEN > endptr)
3164			return (IPSEC_MEMORY_ERROR);
3165
3166		switch (*nexthdrp) {
3167		case IPPROTO_HOPOPTS:
3168		case IPPROTO_DSTOPTS:
3169			/* Assumes the headers are identical for hbh and dst */
3170			desthdr = (ip6_dest_t *)whereptr;
3171			ehdrlen = 8 * (desthdr->ip6d_len + 1);
3172			if ((uchar_t *)desthdr +  ehdrlen > endptr)
3173				return (IPSEC_MEMORY_ERROR);
3174			/*
3175			 * Return DONT_PROCESS because the destination
3176			 * options header may be for each hop in a
3177			 * routing-header, and we only want AH if we're
3178			 * finished with routing headers.
3179			 */
3180			if (*nexthdrp == IPPROTO_DSTOPTS)
3181				return (IPSEC_HDR_DONT_PROCESS);
3182			nexthdrp = &desthdr->ip6d_nxt;
3183			break;
3184		case IPPROTO_ROUTING:
3185			rthdr = (ip6_rthdr_t *)whereptr;
3186
3187			/*
3188			 * If there's more hops left on the routing header,
3189			 * return now with DON'T PROCESS.
3190			 */
3191			if (rthdr->ip6r_segleft > 0)
3192				return (IPSEC_HDR_DONT_PROCESS);
3193
3194			ehdrlen =  8 * (rthdr->ip6r_len + 1);
3195			if ((uchar_t *)rthdr +  ehdrlen > endptr)
3196				return (IPSEC_MEMORY_ERROR);
3197			nexthdrp = &rthdr->ip6r_nxt;
3198			break;
3199		case IPPROTO_FRAGMENT:
3200			/* Wait for reassembly */
3201			return (IPSEC_HDR_DONT_PROCESS);
3202		case IPPROTO_AH:
3203			*nexthdr = IPPROTO_AH;
3204			return (IPSEC_HDR_PROCESS);
3205		case IPPROTO_NONE:
3206			/* No next header means we're finished */
3207		default:
3208			return (IPSEC_HDR_DONT_PROCESS);
3209		}
3210		length += ehdrlen;
3211		whereptr += ehdrlen;
3212	}
3213	/*
3214	 * Malformed/truncated packet.
3215	 */
3216	return (IPSEC_MEMORY_ERROR);
3217}
3218
3219/*
3220 * Path for AH if options are present.
3221 * Returns NULL if the mblk was consumed.
3222 *
3223 * Sometimes AH needs to be done before other IPv6 headers for security
3224 * reasons.  This function (and its ipsec_needs_processing_v6() above)
3225 * indicates if that is so, and fans out to the appropriate IPsec protocol
3226 * for the datagram passed in.
3227 */
3228mblk_t *
3229ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira)
3230{
3231	uint8_t nexthdr;
3232	ah_t *ah;
3233	ill_t		*ill = ira->ira_ill;
3234	ip_stack_t	*ipst = ill->ill_ipst;
3235	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
3236
3237	switch (ipsec_needs_processing_v6(mp, &nexthdr)) {
3238	case IPSEC_MEMORY_ERROR:
3239		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3240		ip_drop_input("ipIfStatsInDiscards", mp, ill);
3241		freemsg(mp);
3242		return (NULL);
3243	case IPSEC_HDR_DONT_PROCESS:
3244		return (mp);
3245	}
3246
3247	/* Default means send it to AH! */
3248	ASSERT(nexthdr == IPPROTO_AH);
3249
3250	if (!ipsec_loaded(ipss)) {
3251		ip_proto_not_sup(mp, ira);
3252		return (NULL);
3253	}
3254
3255	mp = ipsec_inbound_ah_sa(mp, ira, &ah);
3256	if (mp == NULL)
3257		return (NULL);
3258	ASSERT(ah != NULL);
3259	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3260	ASSERT(ira->ira_ipsec_ah_sa != NULL);
3261	ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
3262	mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
3263
3264	if (mp == NULL) {
3265		/*
3266		 * Either it failed or is pending. In the former case
3267		 * ipIfStatsInDiscards was increased.
3268		 */
3269		return (NULL);
3270	}
3271
3272	/* we're done with IPsec processing, send it up */
3273	ip_input_post_ipsec(mp, ira);
3274	return (NULL);
3275}
3276
3277/*
3278 * Reassemble fragment.
3279 * When it returns a completed message the first mblk will only contain
3280 * the headers prior to the fragment header, with the nexthdr value updated
3281 * to be the header after the fragment header.
3282 */
3283mblk_t *
3284ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h,
3285    ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira)
3286{
3287	uint32_t	ident = ntohl(fraghdr->ip6f_ident);
3288	uint16_t	offset;
3289	boolean_t	more_frags;
3290	uint8_t		nexthdr = fraghdr->ip6f_nxt;
3291	in6_addr_t	*v6dst_ptr;
3292	in6_addr_t	*v6src_ptr;
3293	uint_t		end;
3294	uint_t		hdr_length;
3295	size_t		count;
3296	ipf_t		*ipf;
3297	ipf_t		**ipfp;
3298	ipfb_t		*ipfb;
3299	mblk_t		*mp1;
3300	uint8_t		ecn_info = 0;
3301	size_t		msg_len;
3302	mblk_t		*tail_mp;
3303	mblk_t		*t_mp;
3304	boolean_t	pruned = B_FALSE;
3305	uint32_t	sum_val;
3306	uint16_t	sum_flags;
3307	ill_t		*ill = ira->ira_ill;
3308	ip_stack_t	*ipst = ill->ill_ipst;
3309	uint_t		prev_nexthdr_offset;
3310	uint8_t		prev_nexthdr;
3311	uint8_t		*ptr;
3312	uint32_t	packet_size;
3313
3314	/*
3315	 * We utilize hardware computed checksum info only for UDP since
3316	 * IP fragmentation is a normal occurence for the protocol.  In
3317	 * addition, checksum offload support for IP fragments carrying
3318	 * UDP payload is commonly implemented across network adapters.
3319	 */
3320	ASSERT(ira->ira_rill != NULL);
3321	if (nexthdr == IPPROTO_UDP && dohwcksum &&
3322	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
3323	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
3324		mblk_t *mp1 = mp->b_cont;
3325		int32_t len;
3326
3327		/* Record checksum information from the packet */
3328		sum_val = (uint32_t)DB_CKSUM16(mp);
3329		sum_flags = DB_CKSUMFLAGS(mp);
3330
3331		/* fragmented payload offset from beginning of mblk */
3332		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
3333
3334		if ((sum_flags & HCK_PARTIALCKSUM) &&
3335		    (mp1 == NULL || mp1->b_cont == NULL) &&
3336		    offset >= DB_CKSUMSTART(mp) &&
3337		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
3338			uint32_t adj;
3339			/*
3340			 * Partial checksum has been calculated by hardware
3341			 * and attached to the packet; in addition, any
3342			 * prepended extraneous data is even byte aligned.
3343			 * If any such data exists, we adjust the checksum;
3344			 * this would also handle any postpended data.
3345			 */
3346			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
3347			    mp, mp1, len, adj);
3348
3349			/* One's complement subtract extraneous checksum */
3350			if (adj >= sum_val)
3351				sum_val = ~(adj - sum_val) & 0xFFFF;
3352			else
3353				sum_val -= adj;
3354		}
3355	} else {
3356		sum_val = 0;
3357		sum_flags = 0;
3358	}
3359
3360	/* Clear hardware checksumming flag */
3361	DB_CKSUMFLAGS(mp) = 0;
3362
3363	/*
3364	 * Determine the offset (from the begining of the IP header)
3365	 * of the nexthdr value which has IPPROTO_FRAGMENT. We use
3366	 * this when removing the fragment header from the packet.
3367	 * This packet consists of the IPv6 header, a potential
3368	 * hop-by-hop options header, a potential pre-routing-header
3369	 * destination options header, and a potential routing header.
3370	 */
3371	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
3372	prev_nexthdr = ip6h->ip6_nxt;
3373	ptr = (uint8_t *)&ip6h[1];
3374
3375	if (prev_nexthdr == IPPROTO_HOPOPTS) {
3376		ip6_hbh_t	*hbh_hdr;
3377		uint_t		hdr_len;
3378
3379		hbh_hdr = (ip6_hbh_t *)ptr;
3380		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
3381		prev_nexthdr = hbh_hdr->ip6h_nxt;
3382		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
3383		    - (uint8_t *)ip6h;
3384		ptr += hdr_len;
3385	}
3386	if (prev_nexthdr == IPPROTO_DSTOPTS) {
3387		ip6_dest_t	*dest_hdr;
3388		uint_t		hdr_len;
3389
3390		dest_hdr = (ip6_dest_t *)ptr;
3391		hdr_len = 8 * (dest_hdr->ip6d_len + 1);
3392		prev_nexthdr = dest_hdr->ip6d_nxt;
3393		prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
3394		    - (uint8_t *)ip6h;
3395		ptr += hdr_len;
3396	}
3397	if (prev_nexthdr == IPPROTO_ROUTING) {
3398		ip6_rthdr_t	*rthdr;
3399		uint_t		hdr_len;
3400
3401		rthdr = (ip6_rthdr_t *)ptr;
3402		prev_nexthdr = rthdr->ip6r_nxt;
3403		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
3404		    - (uint8_t *)ip6h;
3405		hdr_len = 8 * (rthdr->ip6r_len + 1);
3406		ptr += hdr_len;
3407	}
3408	if (prev_nexthdr != IPPROTO_FRAGMENT) {
3409		/* Can't handle other headers before the fragment header */
3410		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3411		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3412		freemsg(mp);
3413		return (NULL);
3414	}
3415
3416	/*
3417	 * Note: Fragment offset in header is in 8-octet units.
3418	 * Clearing least significant 3 bits not only extracts
3419	 * it but also gets it in units of octets.
3420	 */
3421	offset = ntohs(fraghdr->ip6f_offlg) & ~7;
3422	more_frags = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG);
3423
3424	/*
3425	 * Is the more frags flag on and the payload length not a multiple
3426	 * of eight?
3427	 */
3428	if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) {
3429		ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3430		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3431		    (uint32_t)((char *)&ip6h->ip6_plen -
3432		    (char *)ip6h), B_FALSE, ira);
3433		return (NULL);
3434	}
3435
3436	v6src_ptr = &ip6h->ip6_src;
3437	v6dst_ptr = &ip6h->ip6_dst;
3438	end = remlen;
3439
3440	hdr_length = (uint_t)((char *)&fraghdr[1] - (char *)ip6h);
3441	end += offset;
3442
3443	/*
3444	 * Would fragment cause reassembled packet to have a payload length
3445	 * greater than IP_MAXPACKET - the max payload size?
3446	 */
3447	if (end > IP_MAXPACKET) {
3448		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3449		ip_drop_input("Reassembled packet too large", mp, ill);
3450		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3451		    (uint32_t)((char *)&fraghdr->ip6f_offlg -
3452		    (char *)ip6h), B_FALSE, ira);
3453		return (NULL);
3454	}
3455
3456	/*
3457	 * This packet just has one fragment. Reassembly not
3458	 * needed.
3459	 */
3460	if (!more_frags && offset == 0) {
3461		goto reass_done;
3462	}
3463
3464	/*
3465	 * Drop the fragmented as early as possible, if
3466	 * we don't have resource(s) to re-assemble.
3467	 */
3468	if (ipst->ips_ip_reass_queue_bytes == 0) {
3469		freemsg(mp);
3470		return (NULL);
3471	}
3472
3473	/* Record the ECN field info. */
3474	ecn_info = (uint8_t)(ntohl(ip6h->ip6_vcf & htonl(~0xFFCFFFFF)) >> 20);
3475	/*
3476	 * If this is not the first fragment, dump the unfragmentable
3477	 * portion of the packet.
3478	 */
3479	if (offset)
3480		mp->b_rptr = (uchar_t *)&fraghdr[1];
3481
3482	/*
3483	 * Fragmentation reassembly.  Each ILL has a hash table for
3484	 * queueing packets undergoing reassembly for all IPIFs
3485	 * associated with the ILL.  The hash is based on the packet
3486	 * IP ident field.  The ILL frag hash table was allocated
3487	 * as a timer block at the time the ILL was created.  Whenever
3488	 * there is anything on the reassembly queue, the timer will
3489	 * be running.
3490	 */
3491	/* Handle vnic loopback of fragments */
3492	if (mp->b_datap->db_ref > 2)
3493		msg_len = 0;
3494	else
3495		msg_len = MBLKSIZE(mp);
3496
3497	tail_mp = mp;
3498	while (tail_mp->b_cont != NULL) {
3499		tail_mp = tail_mp->b_cont;
3500		if (tail_mp->b_datap->db_ref <= 2)
3501			msg_len += MBLKSIZE(tail_mp);
3502	}
3503	/*
3504	 * If the reassembly list for this ILL will get too big
3505	 * prune it.
3506	 */
3507
3508	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
3509	    ipst->ips_ip_reass_queue_bytes) {
3510		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
3511		    uint_t, ill->ill_frag_count,
3512		    uint_t, ipst->ips_ip_reass_queue_bytes);
3513		ill_frag_prune(ill,
3514		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
3515		    (ipst->ips_ip_reass_queue_bytes - msg_len));
3516		pruned = B_TRUE;
3517	}
3518
3519	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH_V6(*v6src_ptr, ident)];
3520	mutex_enter(&ipfb->ipfb_lock);
3521
3522	ipfp = &ipfb->ipfb_ipf;
3523	/* Try to find an existing fragment queue for this packet. */
3524	for (;;) {
3525		ipf = ipfp[0];
3526		if (ipf) {
3527			/*
3528			 * It has to match on ident, source address, and
3529			 * dest address.
3530			 */
3531			if (ipf->ipf_ident == ident &&
3532			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6src, v6src_ptr) &&
3533			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6dst, v6dst_ptr)) {
3534
3535				/*
3536				 * If we have received too many
3537				 * duplicate fragments for this packet
3538				 * free it.
3539				 */
3540				if (ipf->ipf_num_dups > ip_max_frag_dups) {
3541					ill_frag_free_pkts(ill, ipfb, ipf, 1);
3542					freemsg(mp);
3543					mutex_exit(&ipfb->ipfb_lock);
3544					return (NULL);
3545				}
3546
3547				break;
3548			}
3549			ipfp = &ipf->ipf_hash_next;
3550			continue;
3551		}
3552
3553
3554		/*
3555		 * If we pruned the list, do we want to store this new
3556		 * fragment?. We apply an optimization here based on the
3557		 * fact that most fragments will be received in order.
3558		 * So if the offset of this incoming fragment is zero,
3559		 * it is the first fragment of a new packet. We will
3560		 * keep it.  Otherwise drop the fragment, as we have
3561		 * probably pruned the packet already (since the
3562		 * packet cannot be found).
3563		 */
3564
3565		if (pruned && offset != 0) {
3566			mutex_exit(&ipfb->ipfb_lock);
3567			freemsg(mp);
3568			return (NULL);
3569		}
3570
3571		/* New guy.  Allocate a frag message. */
3572		mp1 = allocb(sizeof (*ipf), BPRI_MED);
3573		if (!mp1) {
3574			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3575			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3576			freemsg(mp);
3577	partial_reass_done:
3578			mutex_exit(&ipfb->ipfb_lock);
3579			return (NULL);
3580		}
3581
3582		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
3583			/*
3584			 * Too many fragmented packets in this hash bucket.
3585			 * Free the oldest.
3586			 */
3587			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
3588		}
3589
3590		mp1->b_cont = mp;
3591
3592		/* Initialize the fragment header. */
3593		ipf = (ipf_t *)mp1->b_rptr;
3594		ipf->ipf_mp = mp1;
3595		ipf->ipf_ptphn = ipfp;
3596		ipfp[0] = ipf;
3597		ipf->ipf_hash_next = NULL;
3598		ipf->ipf_ident = ident;
3599		ipf->ipf_v6src = *v6src_ptr;
3600		ipf->ipf_v6dst = *v6dst_ptr;
3601		/* Record reassembly start time. */
3602		ipf->ipf_timestamp = gethrestime_sec();
3603		/* Record ipf generation and account for frag header */
3604		ipf->ipf_gen = ill->ill_ipf_gen++;
3605		ipf->ipf_count = MBLKSIZE(mp1);
3606		ipf->ipf_protocol = nexthdr;
3607		ipf->ipf_nf_hdr_len = 0;
3608		ipf->ipf_prev_nexthdr_offset = 0;
3609		ipf->ipf_last_frag_seen = B_FALSE;
3610		ipf->ipf_ecn = ecn_info;
3611		ipf->ipf_num_dups = 0;
3612		ipfb->ipfb_frag_pkts++;
3613		ipf->ipf_checksum = 0;
3614		ipf->ipf_checksum_flags = 0;
3615
3616		/* Store checksum value in fragment header */
3617		if (sum_flags != 0) {
3618			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3619			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3620			ipf->ipf_checksum = sum_val;
3621			ipf->ipf_checksum_flags = sum_flags;
3622		}
3623
3624		/*
3625		 * We handle reassembly two ways.  In the easy case,
3626		 * where all the fragments show up in order, we do
3627		 * minimal bookkeeping, and just clip new pieces on
3628		 * the end.  If we ever see a hole, then we go off
3629		 * to ip_reassemble which has to mark the pieces and
3630		 * keep track of the number of holes, etc.  Obviously,
3631		 * the point of having both mechanisms is so we can
3632		 * handle the easy case as efficiently as possible.
3633		 */
3634		if (offset == 0) {
3635			/* Easy case, in-order reassembly so far. */
3636			/* Update the byte count */
3637			ipf->ipf_count += msg_len;
3638			ipf->ipf_tail_mp = tail_mp;
3639			/*
3640			 * Keep track of next expected offset in
3641			 * ipf_end.
3642			 */
3643			ipf->ipf_end = end;
3644			ipf->ipf_nf_hdr_len = hdr_length;
3645			ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset;
3646		} else {
3647			/* Hard case, hole at the beginning. */
3648			ipf->ipf_tail_mp = NULL;
3649			/*
3650			 * ipf_end == 0 means that we have given up
3651			 * on easy reassembly.
3652			 */
3653			ipf->ipf_end = 0;
3654
3655			/* Forget checksum offload from now on */
3656			ipf->ipf_checksum_flags = 0;
3657
3658			/*
3659			 * ipf_hole_cnt is set by ip_reassemble.
3660			 * ipf_count is updated by ip_reassemble.
3661			 * No need to check for return value here
3662			 * as we don't expect reassembly to complete or
3663			 * fail for the first fragment itself.
3664			 */
3665			(void) ip_reassemble(mp, ipf, offset, more_frags, ill,
3666			    msg_len);
3667		}
3668		/* Update per ipfb and ill byte counts */
3669		ipfb->ipfb_count += ipf->ipf_count;
3670		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3671		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
3672		/* If the frag timer wasn't already going, start it. */
3673		mutex_enter(&ill->ill_lock);
3674		ill_frag_timer_start(ill);
3675		mutex_exit(&ill->ill_lock);
3676		goto partial_reass_done;
3677	}
3678
3679	/*
3680	 * If the packet's flag has changed (it could be coming up
3681	 * from an interface different than the previous, therefore
3682	 * possibly different checksum capability), then forget about
3683	 * any stored checksum states.  Otherwise add the value to
3684	 * the existing one stored in the fragment header.
3685	 */
3686	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
3687		sum_val += ipf->ipf_checksum;
3688		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3689		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3690		ipf->ipf_checksum = sum_val;
3691	} else if (ipf->ipf_checksum_flags != 0) {
3692		/* Forget checksum offload from now on */
3693		ipf->ipf_checksum_flags = 0;
3694	}
3695
3696	/*
3697	 * We have a new piece of a datagram which is already being
3698	 * reassembled.  Update the ECN info if all IP fragments
3699	 * are ECN capable.  If there is one which is not, clear
3700	 * all the info.  If there is at least one which has CE
3701	 * code point, IP needs to report that up to transport.
3702	 */
3703	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
3704		if (ecn_info == IPH_ECN_CE)
3705			ipf->ipf_ecn = IPH_ECN_CE;
3706	} else {
3707		ipf->ipf_ecn = IPH_ECN_NECT;
3708	}
3709
3710	if (offset && ipf->ipf_end == offset) {
3711		/* The new fragment fits at the end */
3712		ipf->ipf_tail_mp->b_cont = mp;
3713		/* Update the byte count */
3714		ipf->ipf_count += msg_len;
3715		/* Update per ipfb and ill byte counts */
3716		ipfb->ipfb_count += msg_len;
3717		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3718		atomic_add_32(&ill->ill_frag_count, msg_len);
3719		if (more_frags) {
3720			/* More to come. */
3721			ipf->ipf_end = end;
3722			ipf->ipf_tail_mp = tail_mp;
3723			goto partial_reass_done;
3724		}
3725	} else {
3726		/*
3727		 * Go do the hard cases.
3728		 * Call ip_reassemble().
3729		 */
3730		int ret;
3731
3732		if (offset == 0) {
3733			if (ipf->ipf_prev_nexthdr_offset == 0) {
3734				ipf->ipf_nf_hdr_len = hdr_length;
3735				ipf->ipf_prev_nexthdr_offset =
3736				    prev_nexthdr_offset;
3737			}
3738		}
3739		/* Save current byte count */
3740		count = ipf->ipf_count;
3741		ret = ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len);
3742
3743		/* Count of bytes added and subtracted (freeb()ed) */
3744		count = ipf->ipf_count - count;
3745		if (count) {
3746			/* Update per ipfb and ill byte counts */
3747			ipfb->ipfb_count += count;
3748			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3749			atomic_add_32(&ill->ill_frag_count, count);
3750		}
3751		if (ret == IP_REASS_PARTIAL) {
3752			goto partial_reass_done;
3753		} else if (ret == IP_REASS_FAILED) {
3754			/* Reassembly failed. Free up all resources */
3755			ill_frag_free_pkts(ill, ipfb, ipf, 1);
3756			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
3757				IP_REASS_SET_START(t_mp, 0);
3758				IP_REASS_SET_END(t_mp, 0);
3759			}
3760			freemsg(mp);
3761			goto partial_reass_done;
3762		}
3763
3764		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
3765	}
3766	/*
3767	 * We have completed reassembly.  Unhook the frag header from
3768	 * the reassembly list.
3769	 *
3770	 * Grab the unfragmentable header length next header value out
3771	 * of the first fragment
3772	 */
3773	ASSERT(ipf->ipf_nf_hdr_len != 0);
3774	hdr_length = ipf->ipf_nf_hdr_len;
3775
3776	/*
3777	 * Before we free the frag header, record the ECN info
3778	 * to report back to the transport.
3779	 */
3780	ecn_info = ipf->ipf_ecn;
3781
3782	/*
3783	 * Store the nextheader field in the header preceding the fragment
3784	 * header
3785	 */
3786	nexthdr = ipf->ipf_protocol;
3787	prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
3788	ipfp = ipf->ipf_ptphn;
3789
3790	/* We need to supply these to caller */
3791	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
3792		sum_val = ipf->ipf_checksum;
3793	else
3794		sum_val = 0;
3795
3796	mp1 = ipf->ipf_mp;
3797	count = ipf->ipf_count;
3798	ipf = ipf->ipf_hash_next;
3799	if (ipf)
3800		ipf->ipf_ptphn = ipfp;
3801	ipfp[0] = ipf;
3802	atomic_add_32(&ill->ill_frag_count, -count);
3803	ASSERT(ipfb->ipfb_count >= count);
3804	ipfb->ipfb_count -= count;
3805	ipfb->ipfb_frag_pkts--;
3806	mutex_exit(&ipfb->ipfb_lock);
3807	/* Ditch the frag header. */
3808	mp = mp1->b_cont;
3809	freeb(mp1);
3810
3811	/*
3812	 * Make sure the packet is good by doing some sanity
3813	 * check. If bad we can silentely drop the packet.
3814	 */
3815reass_done:
3816	if (hdr_length < sizeof (ip6_frag_t)) {
3817		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3818		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3819		ip1dbg(("ip_input_fragment_v6: bad packet\n"));
3820		freemsg(mp);
3821		return (NULL);
3822	}
3823
3824	/*
3825	 * Remove the fragment header from the initial header by
3826	 * splitting the mblk into the non-fragmentable header and
3827	 * everthing after the fragment extension header.  This has the
3828	 * side effect of putting all the headers that need destination
3829	 * processing into the b_cont block-- on return this fact is
3830	 * used in order to avoid having to look at the extensions
3831	 * already processed.
3832	 *
3833	 * Note that this code assumes that the unfragmentable portion
3834	 * of the header is in the first mblk and increments
3835	 * the read pointer past it.  If this assumption is broken
3836	 * this code fails badly.
3837	 */
3838	if (mp->b_rptr + hdr_length != mp->b_wptr) {
3839		mblk_t *nmp;
3840
3841		if (!(nmp = dupb(mp))) {
3842			ip1dbg(("ip_input_fragment_v6: dupb failed\n"));
3843			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3844			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3845			freemsg(mp);
3846			return (NULL);
3847		}
3848		nmp->b_cont = mp->b_cont;
3849		mp->b_cont = nmp;
3850		nmp->b_rptr += hdr_length;
3851	}
3852	mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t);
3853
3854	ip6h = (ip6_t *)mp->b_rptr;
3855	((char *)ip6h)[prev_nexthdr_offset] = nexthdr;
3856
3857	/* Restore original IP length in header. */
3858	packet_size = msgdsize(mp);
3859	ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN));
3860	/* Record the ECN info. */
3861	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
3862	ip6h->ip6_vcf |= htonl(ecn_info << 20);
3863
3864	/* Update the receive attributes */
3865	ira->ira_pktlen = packet_size;
3866	ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t);
3867	ira->ira_protocol = nexthdr;
3868
3869	/* Reassembly is successful; set checksum information in packet */
3870	DB_CKSUM16(mp) = (uint16_t)sum_val;
3871	DB_CKSUMFLAGS(mp) = sum_flags;
3872	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
3873
3874	return (mp);
3875}
3876
3877/*
3878 * Given an mblk and a ptr, find the destination address in an IPv6 routing
3879 * header.
3880 */
3881static in6_addr_t
3882pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
3883{
3884	ip6_rthdr0_t *rt0;
3885	int segleft, numaddr;
3886	in6_addr_t *ap, rv = oldrv;
3887
3888	rt0 = (ip6_rthdr0_t *)whereptr;
3889	if (rt0->ip6r0_type != 0 && rt0->ip6r0_type != 2) {
3890		DTRACE_PROBE2(pluck_out_dst_unknown_type, mblk_t *, mp,
3891		    uint8_t *, whereptr);
3892		return (rv);
3893	}
3894	segleft = rt0->ip6r0_segleft;
3895	numaddr = rt0->ip6r0_len / 2;
3896
3897	if ((rt0->ip6r0_len & 0x1) ||
3898	    (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) ||
3899	    (segleft > rt0->ip6r0_len / 2)) {
3900		/*
3901		 * Corrupt packet.  Either the routing header length is odd
3902		 * (can't happen) or mismatched compared to the packet, or the
3903		 * number of addresses is.  Return what we can.  This will
3904		 * only be a problem on forwarded packets that get squeezed
3905		 * through an outbound tunnel enforcing IPsec Tunnel Mode.
3906		 */
3907		DTRACE_PROBE2(pluck_out_dst_badpkt, mblk_t *, mp, uint8_t *,
3908		    whereptr);
3909		return (rv);
3910	}
3911
3912	if (segleft != 0) {
3913		ap = (in6_addr_t *)((char *)rt0 + sizeof (*rt0));
3914		rv = ap[numaddr - 1];
3915	}
3916
3917	return (rv);
3918}
3919
3920/*
3921 * Walk through the options to see if there is a routing header.
3922 * If present get the destination which is the last address of
3923 * the option.
3924 * mp needs to be provided in cases when the extension headers might span
3925 * b_cont; mp is never modified by this function.
3926 */
3927in6_addr_t
3928ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
3929{
3930	const mblk_t *current_mp = mp;
3931	uint8_t nexthdr;
3932	uint8_t *whereptr;
3933	int ehdrlen;
3934	in6_addr_t rv;
3935
3936	whereptr = (uint8_t *)ip6h;
3937	ehdrlen = sizeof (ip6_t);
3938
3939	/* We assume at least the IPv6 base header is within one mblk. */
3940	ASSERT(mp == NULL ||
3941	    (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));
3942
3943	rv = ip6h->ip6_dst;
3944	nexthdr = ip6h->ip6_nxt;
3945	if (is_fragment != NULL)
3946		*is_fragment = B_FALSE;
3947
3948	/*
3949	 * We also assume (thanks to ipsec_tun_outbound()'s pullup) that
3950	 * no extension headers will be split across mblks.
3951	 */
3952
3953	while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS ||
3954	    nexthdr == IPPROTO_ROUTING) {
3955		if (nexthdr == IPPROTO_ROUTING)
3956			rv = pluck_out_dst(current_mp, whereptr, rv);
3957
3958		/*
3959		 * All IPv6 extension headers have the next-header in byte
3960		 * 0, and the (length - 8) in 8-byte-words.
3961		 */
3962		while (current_mp != NULL &&
3963		    whereptr + ehdrlen >= current_mp->b_wptr) {
3964			ehdrlen -= (current_mp->b_wptr - whereptr);
3965			current_mp = current_mp->b_cont;
3966			if (current_mp == NULL) {
3967				/* Bad packet.  Return what we can. */
3968				DTRACE_PROBE3(ip_get_dst_v6_badpkt, mblk_t *,
3969				    mp, mblk_t *, current_mp, ip6_t *, ip6h);
3970				goto done;
3971			}
3972			whereptr = current_mp->b_rptr;
3973		}
3974		whereptr += ehdrlen;
3975
3976		nexthdr = *whereptr;
3977		ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
3978		ehdrlen = (*(whereptr + 1) + 1) * 8;
3979	}
3980
3981done:
3982	if (nexthdr == IPPROTO_FRAGMENT && is_fragment != NULL)
3983		*is_fragment = B_TRUE;
3984	return (rv);
3985}
3986
3987/*
3988 * ip_source_routed_v6:
3989 * This function is called by redirect code (called from ip_input_v6) to
3990 * know whether this packet is source routed through this node i.e
3991 * whether this node (router) is part of the journey. This
3992 * function is called under two cases :
3993 *
3994 * case 1 : Routing header was processed by this node and
3995 *	    ip_process_rthdr replaced ip6_dst with the next hop
3996 *	    and we are forwarding the packet to the next hop.
3997 *
3998 * case 2 : Routing header was not processed by this node and we
3999 *	    are just forwarding the packet.
4000 *
4001 * For case (1) we don't want to send redirects. For case(2) we
4002 * want to send redirects.
4003 */
4004static boolean_t
4005ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
4006{
4007	uint8_t		nexthdr;
4008	in6_addr_t	*addrptr;
4009	ip6_rthdr0_t	*rthdr;
4010	uint8_t		numaddr;
4011	ip6_hbh_t	*hbhhdr;
4012	uint_t		ehdrlen;
4013	uint8_t		*byteptr;
4014
4015	ip2dbg(("ip_source_routed_v6\n"));
4016	nexthdr = ip6h->ip6_nxt;
4017	ehdrlen = IPV6_HDR_LEN;
4018
4019	/* if a routing hdr is preceeded by HOPOPT or DSTOPT */
4020	while (nexthdr == IPPROTO_HOPOPTS ||
4021	    nexthdr == IPPROTO_DSTOPTS) {
4022		byteptr = (uint8_t *)ip6h + ehdrlen;
4023		/*
4024		 * Check if we have already processed
4025		 * packets or we are just a forwarding
4026		 * router which only pulled up msgs up
4027		 * to IPV6HDR and  one HBH ext header
4028		 */
4029		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
4030			ip2dbg(("ip_source_routed_v6: Extension"
4031			    " headers not processed\n"));
4032			return (B_FALSE);
4033		}
4034		hbhhdr = (ip6_hbh_t *)byteptr;
4035		nexthdr = hbhhdr->ip6h_nxt;
4036		ehdrlen = ehdrlen + 8 * (hbhhdr->ip6h_len + 1);
4037	}
4038	switch (nexthdr) {
4039	case IPPROTO_ROUTING:
4040		byteptr = (uint8_t *)ip6h + ehdrlen;
4041		/*
4042		 * If for some reason, we haven't pulled up
4043		 * the routing hdr data mblk, then we must
4044		 * not have processed it at all. So for sure
4045		 * we are not part of the source routed journey.
4046		 */
4047		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
4048			ip2dbg(("ip_source_routed_v6: Routing"
4049			    " header not processed\n"));
4050			return (B_FALSE);
4051		}
4052		rthdr = (ip6_rthdr0_t *)byteptr;
4053		/*
4054		 * Either we are an intermediate router or the
4055		 * last hop before destination and we have
4056		 * already processed the routing header.
4057		 * If segment_left is greater than or equal to zero,
4058		 * then we must be the (numaddr - segleft) entry
4059		 * of the routing header. Although ip6r0_segleft
4060		 * is a unit8_t variable, we still check for zero
4061		 * or greater value, if in case the data type
4062		 * is changed someday in future.
4063		 */
4064		if (rthdr->ip6r0_segleft > 0 ||
4065		    rthdr->ip6r0_segleft == 0) {
4066			numaddr = rthdr->ip6r0_len / 2;
4067			addrptr = (in6_addr_t *)((char *)rthdr +
4068			    sizeof (*rthdr));
4069			addrptr += (numaddr - (rthdr->ip6r0_segleft + 1));
4070			if (addrptr != NULL) {
4071				if (ip_type_v6(addrptr, ipst) == IRE_LOCAL)
4072					return (B_TRUE);
4073				ip1dbg(("ip_source_routed_v6: Not local\n"));
4074			}
4075		}
4076	/* FALLTHROUGH */
4077	default:
4078		ip2dbg(("ip_source_routed_v6: Not source routed here\n"));
4079		return (B_FALSE);
4080	}
4081}
4082
4083/*
4084 * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
4085 * We have not optimized this in terms of number of mblks
4086 * allocated. For instance, for each fragment sent we always allocate a
4087 * mblk to hold the IPv6 header and fragment header.
4088 *
4089 * Assumes that all the extension headers are contained in the first mblk
4090 * and that the fragment header has has already been added by calling
4091 * ip_fraghdr_add_v6.
4092 */
4093int
4094ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
4095    uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
4096    pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
4097{
4098	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4099	ip6_t		*fip6h;
4100	mblk_t		*hmp;
4101	mblk_t		*hmp0;
4102	mblk_t		*dmp;
4103	ip6_frag_t	*fraghdr;
4104	size_t		unfragmentable_len;
4105	size_t		mlen;
4106	size_t		max_chunk;
4107	uint16_t	off_flags;
4108	uint16_t	offset = 0;
4109	ill_t		*ill = nce->nce_ill;
4110	uint8_t		nexthdr;
4111	uint8_t		*ptr;
4112	ip_stack_t	*ipst = ill->ill_ipst;
4113	uint_t		priority = mp->b_band;
4114	int		error = 0;
4115
4116	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
4117	if (max_frag == 0) {
4118		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4119		ip_drop_output("FragFails: zero max_frag", mp, ill);
4120		freemsg(mp);
4121		return (EINVAL);
4122	}
4123
4124	/*
4125	 * Caller should have added fraghdr_t to pkt_len, and also
4126	 * updated ip6_plen.
4127	 */
4128	ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len);
4129	ASSERT(msgdsize(mp) == pkt_len);
4130
4131	/*
4132	 * Determine the length of the unfragmentable portion of this
4133	 * datagram.  This consists of the IPv6 header, a potential
4134	 * hop-by-hop options header, a potential pre-routing-header
4135	 * destination options header, and a potential routing header.
4136	 */
4137	nexthdr = ip6h->ip6_nxt;
4138	ptr = (uint8_t *)&ip6h[1];
4139
4140	if (nexthdr == IPPROTO_HOPOPTS) {
4141		ip6_hbh_t	*hbh_hdr;
4142		uint_t		hdr_len;
4143
4144		hbh_hdr = (ip6_hbh_t *)ptr;
4145		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4146		nexthdr = hbh_hdr->ip6h_nxt;
4147		ptr += hdr_len;
4148	}
4149	if (nexthdr == IPPROTO_DSTOPTS) {
4150		ip6_dest_t	*dest_hdr;
4151		uint_t		hdr_len;
4152
4153		dest_hdr = (ip6_dest_t *)ptr;
4154		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4155			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4156			nexthdr = dest_hdr->ip6d_nxt;
4157			ptr += hdr_len;
4158		}
4159	}
4160	if (nexthdr == IPPROTO_ROUTING) {
4161		ip6_rthdr_t	*rthdr;
4162		uint_t		hdr_len;
4163
4164		rthdr = (ip6_rthdr_t *)ptr;
4165		nexthdr = rthdr->ip6r_nxt;
4166		hdr_len = 8 * (rthdr->ip6r_len + 1);
4167		ptr += hdr_len;
4168	}
4169	if (nexthdr != IPPROTO_FRAGMENT) {
4170		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4171		ip_drop_output("FragFails: bad nexthdr", mp, ill);
4172		freemsg(mp);
4173		return (EINVAL);
4174	}
4175	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4176	unfragmentable_len += sizeof (ip6_frag_t);
4177
4178	max_chunk = (max_frag - unfragmentable_len) & ~7;
4179
4180	/*
4181	 * Allocate an mblk with enough room for the link-layer
4182	 * header and the unfragmentable part of the datagram, which includes
4183	 * the fragment header.  This (or a copy) will be used as the
4184	 * first mblk for each fragment we send.
4185	 */
4186	hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp);
4187	if (hmp == NULL) {
4188		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4189		ip_drop_output("FragFails: no hmp", mp, ill);
4190		freemsg(mp);
4191		return (ENOBUFS);
4192	}
4193	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4194	hmp->b_wptr = hmp->b_rptr + unfragmentable_len;
4195
4196	fip6h = (ip6_t *)hmp->b_rptr;
4197	bcopy(ip6h, fip6h, unfragmentable_len);
4198
4199	/*
4200	 * pkt_len is set to the total length of the fragmentable data in this
4201	 * datagram.  For each fragment sent, we will decrement pkt_len
4202	 * by the amount of fragmentable data sent in that fragment
4203	 * until len reaches zero.
4204	 */
4205	pkt_len -= unfragmentable_len;
4206
4207	/*
4208	 * Move read ptr past unfragmentable portion, we don't want this part
4209	 * of the data in our fragments.
4210	 */
4211	mp->b_rptr += unfragmentable_len;
4212	if (mp->b_rptr == mp->b_wptr) {
4213		mblk_t *mp1 = mp->b_cont;
4214		freeb(mp);
4215		mp = mp1;
4216	}
4217
4218	while (pkt_len != 0) {
4219		mlen = MIN(pkt_len, max_chunk);
4220		pkt_len -= mlen;
4221		if (pkt_len != 0) {
4222			/* Not last */
4223			hmp0 = copyb(hmp);
4224			if (hmp0 == NULL) {
4225				BUMP_MIB(ill->ill_ip_mib,
4226				    ipIfStatsOutFragFails);
4227				ip_drop_output("FragFails: copyb failed",
4228				    mp, ill);
4229				freeb(hmp);
4230				freemsg(mp);
4231				ip1dbg(("ip_fragment_v6: copyb failed\n"));
4232				return (ENOBUFS);
4233			}
4234			off_flags = IP6F_MORE_FRAG;
4235		} else {
4236			/* Last fragment */
4237			hmp0 = hmp;
4238			hmp = NULL;
4239			off_flags = 0;
4240		}
4241		fip6h = (ip6_t *)(hmp0->b_rptr);
4242		fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len -
4243		    sizeof (ip6_frag_t));
4244
4245		fip6h->ip6_plen = htons((uint16_t)(mlen +
4246		    unfragmentable_len - IPV6_HDR_LEN));
4247		/*
4248		 * Note: Optimization alert.
4249		 * In IPv6 (and IPv4) protocol header, Fragment Offset
4250		 * ("offset") is 13 bits wide and in 8-octet units.
4251		 * In IPv6 protocol header (unlike IPv4) in a 16 bit field,
4252		 * it occupies the most significant 13 bits.
4253		 * (least significant 13 bits in IPv4).
4254		 * We do not do any shifts here. Not shifting is same effect
4255		 * as taking offset value in octet units, dividing by 8 and
4256		 * then shifting 3 bits left to line it up in place in proper
4257		 * place protocol header.
4258		 */
4259		fraghdr->ip6f_offlg = htons(offset) | off_flags;
4260
4261		if (!(dmp = ip_carve_mp(&mp, mlen))) {
4262			/* mp has already been freed by ip_carve_mp() */
4263			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4264			ip_drop_output("FragFails: could not carve mp",
4265			    hmp0, ill);
4266			if (hmp != NULL)
4267				freeb(hmp);
4268			freeb(hmp0);
4269			ip1dbg(("ip_carve_mp: failed\n"));
4270			return (ENOBUFS);
4271		}
4272		hmp0->b_cont = dmp;
4273		/* Get the priority marking, if any */
4274		hmp0->b_band = priority;
4275
4276		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
4277
4278		error = postfragfn(hmp0, nce, ixaflags,
4279		    mlen + unfragmentable_len, xmit_hint, szone, nolzid,
4280		    ixa_cookie);
4281		if (error != 0 && error != EWOULDBLOCK && hmp != NULL) {
4282			/* No point in sending the other fragments */
4283			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4284			ip_drop_output("FragFails: postfragfn failed",
4285			    hmp, ill);
4286			freeb(hmp);
4287			freemsg(mp);
4288			return (error);
4289		}
4290		/* No need to redo state machine in loop */
4291		ixaflags &= ~IXAF_REACH_CONF;
4292
4293		offset += mlen;
4294	}
4295	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
4296	return (error);
4297}
4298
4299/*
4300 * Add a fragment header to an IPv6 packet.
4301 * Assumes that all the extension headers are contained in the first mblk.
4302 *
4303 * The fragment header is inserted after an hop-by-hop options header
4304 * and after [an optional destinations header followed by] a routing header.
4305 */
4306mblk_t *
4307ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa)
4308{
4309	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4310	ip6_t		*fip6h;
4311	mblk_t		*hmp;
4312	ip6_frag_t	*fraghdr;
4313	size_t		unfragmentable_len;
4314	uint8_t		nexthdr;
4315	uint_t		prev_nexthdr_offset;
4316	uint8_t		*ptr;
4317	uint_t		priority = mp->b_band;
4318	ip_stack_t	*ipst = ixa->ixa_ipst;
4319
4320	/*
4321	 * Determine the length of the unfragmentable portion of this
4322	 * datagram.  This consists of the IPv6 header, a potential
4323	 * hop-by-hop options header, a potential pre-routing-header
4324	 * destination options header, and a potential routing header.
4325	 */
4326	nexthdr = ip6h->ip6_nxt;
4327	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
4328	ptr = (uint8_t *)&ip6h[1];
4329
4330	if (nexthdr == IPPROTO_HOPOPTS) {
4331		ip6_hbh_t	*hbh_hdr;
4332		uint_t		hdr_len;
4333
4334		hbh_hdr = (ip6_hbh_t *)ptr;
4335		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4336		nexthdr = hbh_hdr->ip6h_nxt;
4337		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
4338		    - (uint8_t *)ip6h;
4339		ptr += hdr_len;
4340	}
4341	if (nexthdr == IPPROTO_DSTOPTS) {
4342		ip6_dest_t	*dest_hdr;
4343		uint_t		hdr_len;
4344
4345		dest_hdr = (ip6_dest_t *)ptr;
4346		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4347			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4348			nexthdr = dest_hdr->ip6d_nxt;
4349			prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
4350			    - (uint8_t *)ip6h;
4351			ptr += hdr_len;
4352		}
4353	}
4354	if (nexthdr == IPPROTO_ROUTING) {
4355		ip6_rthdr_t	*rthdr;
4356		uint_t		hdr_len;
4357
4358		rthdr = (ip6_rthdr_t *)ptr;
4359		nexthdr = rthdr->ip6r_nxt;
4360		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
4361		    - (uint8_t *)ip6h;
4362		hdr_len = 8 * (rthdr->ip6r_len + 1);
4363		ptr += hdr_len;
4364	}
4365	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4366
4367	/*
4368	 * Allocate an mblk with enough room for the link-layer
4369	 * header, the unfragmentable part of the datagram, and the
4370	 * fragment header.
4371	 */
4372	hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
4373	    ipst->ips_ip_wroff_extra, mp);
4374	if (hmp == NULL) {
4375		ill_t *ill = ixa->ixa_nce->nce_ill;
4376
4377		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
4378		ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
4379		freemsg(mp);
4380		return (NULL);
4381	}
4382	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4383	hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
4384
4385	fip6h = (ip6_t *)hmp->b_rptr;
4386	fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
4387
4388	bcopy(ip6h, fip6h, unfragmentable_len);
4389	fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
4390	hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
4391
4392	fraghdr->ip6f_nxt = nexthdr;
4393	fraghdr->ip6f_reserved = 0;
4394	fraghdr->ip6f_offlg = 0;
4395	fraghdr->ip6f_ident = htonl(ident);
4396
4397	/* Get the priority marking, if any */
4398	hmp->b_band = priority;
4399
4400	/*
4401	 * Move read ptr past unfragmentable portion, we don't want this part
4402	 * of the data in our fragments.
4403	 */
4404	mp->b_rptr += unfragmentable_len;
4405	hmp->b_cont = mp;
4406	return (hmp);
4407}
4408
4409/*
4410 * Determine if the ill and multicast aspects of that packets
4411 * "matches" the conn.
4412 */
4413boolean_t
4414conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
4415{
4416	ill_t		*ill = ira->ira_rill;
4417	zoneid_t	zoneid = ira->ira_zoneid;
4418	uint_t		in_ifindex;
4419	in6_addr_t	*v6dst_ptr = &ip6h->ip6_dst;
4420	in6_addr_t	*v6src_ptr = &ip6h->ip6_src;
4421
4422	/*
4423	 * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
4424	 * scopeid. This is used to limit
4425	 * unicast and multicast reception to conn_incoming_ifindex.
4426	 * conn_wantpacket_v6 is called both for unicast and
4427	 * multicast packets.
4428	 */
4429	in_ifindex = connp->conn_incoming_ifindex;
4430
4431	/* mpathd can bind to the under IPMP interface, which we allow */
4432	if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
4433		if (!IS_UNDER_IPMP(ill))
4434			return (B_FALSE);
4435
4436		if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
4437			return (B_FALSE);
4438	}
4439
4440	if (!IPCL_ZONE_MATCH(connp, zoneid))
4441		return (B_FALSE);
4442
4443	if (!(ira->ira_flags & IRAF_MULTICAST))
4444		return (B_TRUE);
4445
4446	if (connp->conn_multi_router)
4447		return (B_TRUE);
4448
4449	if (ira->ira_protocol == IPPROTO_RSVP)
4450		return (B_TRUE);
4451
4452	return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr,
4453	    ira->ira_ill));
4454}
4455
4456/*
4457 * pr_addr_dbg function provides the needed buffer space to call
4458 * inet_ntop() function's 3rd argument. This function should be
4459 * used by any kernel routine which wants to save INET6_ADDRSTRLEN
4460 * stack buffer space in it's own stack frame. This function uses
4461 * a buffer from it's own stack and prints the information.
4462 * Example: pr_addr_dbg("func: no route for %s\n ", AF_INET, addr)
4463 *
4464 * Note:    This function can call inet_ntop() once.
4465 */
4466void
4467pr_addr_dbg(char *fmt1, int af, const void *addr)
4468{
4469	char	buf[INET6_ADDRSTRLEN];
4470
4471	if (fmt1 == NULL) {
4472		ip0dbg(("pr_addr_dbg: Wrong arguments\n"));
4473		return;
4474	}
4475
4476	/*
4477	 * This does not compare debug level and just prints
4478	 * out. Thus it is the responsibility of the caller
4479	 * to check the appropriate debug-level before calling
4480	 * this function.
4481	 */
4482	if (ip_debug > 0) {
4483		printf(fmt1, inet_ntop(af, addr, buf, sizeof (buf)));
4484	}
4485
4486
4487}
4488
4489
4490/*
4491 * Return the length in bytes of the IPv6 headers (base header
4492 * extension headers) that will be needed based on the
4493 * ip_pkt_t structure passed by the caller.
4494 *
4495 * The returned length does not include the length of the upper level
4496 * protocol (ULP) header.
4497 */
4498int
4499ip_total_hdrs_len_v6(const ip_pkt_t *ipp)
4500{
4501	int len;
4502
4503	len = IPV6_HDR_LEN;
4504
4505	/*
4506	 * If there's a security label here, then we ignore any hop-by-hop
4507	 * options the user may try to set.
4508	 */
4509	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4510		uint_t hopoptslen;
4511		/*
4512		 * Note that ipp_label_len_v6 is just the option - not
4513		 * the hopopts extension header. It also needs to be padded
4514		 * to a multiple of 8 bytes.
4515		 */
4516		ASSERT(ipp->ipp_label_len_v6 != 0);
4517		hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4518		hopoptslen = (hopoptslen + 7)/8 * 8;
4519		len += hopoptslen;
4520	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4521		ASSERT(ipp->ipp_hopoptslen != 0);
4522		len += ipp->ipp_hopoptslen;
4523	}
4524
4525	/*
4526	 * En-route destination options
4527	 * Only do them if there's a routing header as well
4528	 */
4529	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4530	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4531		ASSERT(ipp->ipp_rthdrdstoptslen != 0);
4532		len += ipp->ipp_rthdrdstoptslen;
4533	}
4534	if (ipp->ipp_fields & IPPF_RTHDR) {
4535		ASSERT(ipp->ipp_rthdrlen != 0);
4536		len += ipp->ipp_rthdrlen;
4537	}
4538	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4539		ASSERT(ipp->ipp_dstoptslen != 0);
4540		len += ipp->ipp_dstoptslen;
4541	}
4542	return (len);
4543}
4544
4545/*
4546 * All-purpose routine to build a header chain of an IPv6 header
4547 * followed by any required extension headers and a proto header.
4548 *
4549 * The caller has to set the source and destination address as well as
4550 * ip6_plen. The caller has to massage any routing header and compensate
4551 * for the ULP pseudo-header checksum due to the source route.
4552 *
4553 * The extension headers will all be fully filled in.
4554 */
4555void
4556ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
4557    uint8_t protocol, uint32_t flowinfo)
4558{
4559	uint8_t *nxthdr_ptr;
4560	uint8_t *cp;
4561	ip6_t	*ip6h = (ip6_t *)buf;
4562
4563	/* Initialize IPv6 header */
4564	ip6h->ip6_vcf =
4565	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4566	    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4567
4568	if (ipp->ipp_fields & IPPF_TCLASS) {
4569		/* Overrides the class part of flowinfo */
4570		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4571		    ipp->ipp_tclass);
4572	}
4573
4574	if (ipp->ipp_fields & IPPF_HOPLIMIT)
4575		ip6h->ip6_hops = ipp->ipp_hoplimit;
4576	else
4577		ip6h->ip6_hops = ipp->ipp_unicast_hops;
4578
4579	if ((ipp->ipp_fields & IPPF_ADDR) &&
4580	    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4581		ip6h->ip6_src = ipp->ipp_addr;
4582
4583	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
4584	cp = (uint8_t *)&ip6h[1];
4585	/*
4586	 * Here's where we have to start stringing together
4587	 * any extension headers in the right order:
4588	 * Hop-by-hop, destination, routing, and final destination opts.
4589	 */
4590	/*
4591	 * If there's a security label here, then we ignore any hop-by-hop
4592	 * options the user may try to set.
4593	 */
4594	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4595		/*
4596		 * Hop-by-hop options with the label.
4597		 * Note that ipp_label_v6 is just the option - not
4598		 * the hopopts extension header. It also needs to be padded
4599		 * to a multiple of 8 bytes.
4600		 */
4601		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4602		uint_t hopoptslen;
4603		uint_t padlen;
4604
4605		padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4606		hopoptslen = (padlen + 7)/8 * 8;
4607		padlen = hopoptslen - padlen;
4608
4609		*nxthdr_ptr = IPPROTO_HOPOPTS;
4610		nxthdr_ptr = &hbh->ip6h_nxt;
4611		hbh->ip6h_len = hopoptslen/8 - 1;
4612		cp += sizeof (ip6_hbh_t);
4613		bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
4614		cp += ipp->ipp_label_len_v6;
4615
4616		ASSERT(padlen <= 7);
4617		switch (padlen) {
4618		case 0:
4619			break;
4620		case 1:
4621			cp[0] = IP6OPT_PAD1;
4622			break;
4623		default:
4624			cp[0] = IP6OPT_PADN;
4625			cp[1] = padlen - 2;
4626			bzero(&cp[2], padlen - 2);
4627			break;
4628		}
4629		cp += padlen;
4630	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4631		/* Hop-by-hop options */
4632		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4633
4634		*nxthdr_ptr = IPPROTO_HOPOPTS;
4635		nxthdr_ptr = &hbh->ip6h_nxt;
4636
4637		bcopy(ipp->ipp_hopopts, cp, ipp->ipp_hopoptslen);
4638		cp += ipp->ipp_hopoptslen;
4639	}
4640	/*
4641	 * En-route destination options
4642	 * Only do them if there's a routing header as well
4643	 */
4644	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4645	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4646		ip6_dest_t *dst = (ip6_dest_t *)cp;
4647
4648		*nxthdr_ptr = IPPROTO_DSTOPTS;
4649		nxthdr_ptr = &dst->ip6d_nxt;
4650
4651		bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
4652		cp += ipp->ipp_rthdrdstoptslen;
4653	}
4654	/*
4655	 * Routing header next
4656	 */
4657	if (ipp->ipp_fields & IPPF_RTHDR) {
4658		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
4659
4660		*nxthdr_ptr = IPPROTO_ROUTING;
4661		nxthdr_ptr = &rt->ip6r_nxt;
4662
4663		bcopy(ipp->ipp_rthdr, cp, ipp->ipp_rthdrlen);
4664		cp += ipp->ipp_rthdrlen;
4665	}
4666	/*
4667	 * Do ultimate destination options
4668	 */
4669	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4670		ip6_dest_t *dest = (ip6_dest_t *)cp;
4671
4672		*nxthdr_ptr = IPPROTO_DSTOPTS;
4673		nxthdr_ptr = &dest->ip6d_nxt;
4674
4675		bcopy(ipp->ipp_dstopts, cp, ipp->ipp_dstoptslen);
4676		cp += ipp->ipp_dstoptslen;
4677	}
4678	/*
4679	 * Now set the last header pointer to the proto passed in
4680	 */
4681	*nxthdr_ptr = protocol;
4682	ASSERT((int)(cp - buf) == buf_len);
4683}
4684
4685/*
4686 * Return a pointer to the routing header extension header
4687 * in the IPv6 header(s) chain passed in.
4688 * If none found, return NULL
4689 * Assumes that all extension headers are in same mblk as the v6 header
4690 */
4691ip6_rthdr_t *
4692ip_find_rthdr_v6(ip6_t *ip6h, uint8_t *endptr)
4693{
4694	ip6_dest_t	*desthdr;
4695	ip6_frag_t	*fraghdr;
4696	uint_t		hdrlen;
4697	uint8_t		nexthdr;
4698	uint8_t		*ptr = (uint8_t *)&ip6h[1];
4699
4700	if (ip6h->ip6_nxt == IPPROTO_ROUTING)
4701		return ((ip6_rthdr_t *)ptr);
4702
4703	/*
4704	 * The routing header will precede all extension headers
4705	 * other than the hop-by-hop and destination options
4706	 * extension headers, so if we see anything other than those,
4707	 * we're done and didn't find it.
4708	 * We could see a destination options header alone but no
4709	 * routing header, in which case we'll return NULL as soon as
4710	 * we see anything after that.
4711	 * Hop-by-hop and destination option headers are identical,
4712	 * so we can use either one we want as a template.
4713	 */
4714	nexthdr = ip6h->ip6_nxt;
4715	while (ptr < endptr) {
4716		/* Is there enough left for len + nexthdr? */
4717		if (ptr + MIN_EHDR_LEN > endptr)
4718			return (NULL);
4719
4720		switch (nexthdr) {
4721		case IPPROTO_HOPOPTS:
4722		case IPPROTO_DSTOPTS:
4723			/* Assumes the headers are identical for hbh and dst */
4724			desthdr = (ip6_dest_t *)ptr;
4725			hdrlen = 8 * (desthdr->ip6d_len + 1);
4726			nexthdr = desthdr->ip6d_nxt;
4727			break;
4728
4729		case IPPROTO_ROUTING:
4730			return ((ip6_rthdr_t *)ptr);
4731
4732		case IPPROTO_FRAGMENT:
4733			fraghdr = (ip6_frag_t *)ptr;
4734			hdrlen = sizeof (ip6_frag_t);
4735			nexthdr = fraghdr->ip6f_nxt;
4736			break;
4737
4738		default:
4739			return (NULL);
4740		}
4741		ptr += hdrlen;
4742	}
4743	return (NULL);
4744}
4745
4746/*
4747 * Called for source-routed packets originating on this node.
4748 * Manipulates the original routing header by moving every entry up
4749 * one slot, placing the first entry in the v6 header's v6_dst field,
4750 * and placing the ultimate destination in the routing header's last
4751 * slot.
4752 *
4753 * Returns the checksum diference between the ultimate destination
4754 * (last hop in the routing header when the packet is sent) and
4755 * the first hop (ip6_dst when the packet is sent)
4756 */
4757/* ARGSUSED2 */
4758uint32_t
4759ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns)
4760{
4761	uint_t		numaddr;
4762	uint_t		i;
4763	in6_addr_t	*addrptr;
4764	in6_addr_t	tmp;
4765	ip6_rthdr0_t	*rthdr = (ip6_rthdr0_t *)rth;
4766	uint32_t	cksm;
4767	uint32_t	addrsum = 0;
4768	uint16_t	*ptr;
4769
4770	/*
4771	 * Perform any processing needed for source routing.
4772	 * We know that all extension headers will be in the same mblk
4773	 * as the IPv6 header.
4774	 */
4775
4776	/*
4777	 * If no segments left in header, or the header length field is zero,
4778	 * don't move hop addresses around;
4779	 * Checksum difference is zero.
4780	 */
4781	if ((rthdr->ip6r0_segleft == 0) || (rthdr->ip6r0_len == 0))
4782		return (0);
4783
4784	ptr = (uint16_t *)&ip6h->ip6_dst;
4785	cksm = 0;
4786	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4787		cksm += ptr[i];
4788	}
4789	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4790
4791	/*
4792	 * Here's where the fun begins - we have to
4793	 * move all addresses up one spot, take the
4794	 * first hop and make it our first ip6_dst,
4795	 * and place the ultimate destination in the
4796	 * newly-opened last slot.
4797	 */
4798	addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
4799	numaddr = rthdr->ip6r0_len / 2;
4800	tmp = *addrptr;
4801	for (i = 0; i < (numaddr - 1); addrptr++, i++) {
4802		*addrptr = addrptr[1];
4803	}
4804	*addrptr = ip6h->ip6_dst;
4805	ip6h->ip6_dst = tmp;
4806
4807	/*
4808	 * From the checksummed ultimate destination subtract the checksummed
4809	 * current ip6_dst (the first hop address). Return that number.
4810	 * (In the v4 case, the second part of this is done in each routine
4811	 *  that calls ip_massage_options(). We do it all in this one place
4812	 *  for v6).
4813	 */
4814	ptr = (uint16_t *)&ip6h->ip6_dst;
4815	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4816		addrsum += ptr[i];
4817	}
4818	cksm -= ((addrsum >> 16) + (addrsum & 0xFFFF));
4819	if ((int)cksm < 0)
4820		cksm--;
4821	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4822
4823	return (cksm);
4824}
4825
4826void
4827*ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp)
4828{
4829	kstat_t *ksp;
4830
4831	ip6_stat_t template = {
4832		{ "ip6_udp_fannorm",	KSTAT_DATA_UINT64 },
4833		{ "ip6_udp_fanmb",	KSTAT_DATA_UINT64 },
4834		{ "ip6_recv_pullup",		KSTAT_DATA_UINT64 },
4835		{ "ip6_db_ref",			KSTAT_DATA_UINT64 },
4836		{ "ip6_notaligned",		KSTAT_DATA_UINT64 },
4837		{ "ip6_multimblk",		KSTAT_DATA_UINT64 },
4838		{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
4839		{ "ip6_out_sw_cksum",			KSTAT_DATA_UINT64 },
4840		{ "ip6_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
4841		{ "ip6_in_sw_cksum",			KSTAT_DATA_UINT64 },
4842		{ "ip6_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4843		{ "ip6_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4844		{ "ip6_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4845		{ "ip6_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4846		{ "ip6_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4847		{ "ip6_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4848	};
4849	ksp = kstat_create_netstack("ip", 0, "ip6stat", "net",
4850	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
4851	    KSTAT_FLAG_VIRTUAL, stackid);
4852
4853	if (ksp == NULL)
4854		return (NULL);
4855
4856	bcopy(&template, ip6_statisticsp, sizeof (template));
4857	ksp->ks_data = (void *)ip6_statisticsp;
4858	ksp->ks_private = (void *)(uintptr_t)stackid;
4859
4860	kstat_install(ksp);
4861	return (ksp);
4862}
4863
4864void
4865ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4866{
4867	if (ksp != NULL) {
4868		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4869		kstat_delete_netstack(ksp, stackid);
4870	}
4871}
4872
4873/*
4874 * The following two functions set and get the value for the
4875 * IPV6_SRC_PREFERENCES socket option.
4876 */
4877int
4878ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs)
4879{
4880	/*
4881	 * We only support preferences that are covered by
4882	 * IPV6_PREFER_SRC_MASK.
4883	 */
4884	if (prefs & ~IPV6_PREFER_SRC_MASK)
4885		return (EINVAL);
4886
4887	/*
4888	 * Look for conflicting preferences or default preferences.  If
4889	 * both bits of a related pair are clear, the application wants the
4890	 * system's default value for that pair.  Both bits in a pair can't
4891	 * be set.
4892	 */
4893	if ((prefs & IPV6_PREFER_SRC_MIPMASK) == 0) {
4894		prefs |= IPV6_PREFER_SRC_MIPDEFAULT;
4895	} else if ((prefs & IPV6_PREFER_SRC_MIPMASK) ==
4896	    IPV6_PREFER_SRC_MIPMASK) {
4897		return (EINVAL);
4898	}
4899	if ((prefs & IPV6_PREFER_SRC_TMPMASK) == 0) {
4900		prefs |= IPV6_PREFER_SRC_TMPDEFAULT;
4901	} else if ((prefs & IPV6_PREFER_SRC_TMPMASK) ==
4902	    IPV6_PREFER_SRC_TMPMASK) {
4903		return (EINVAL);
4904	}
4905	if ((prefs & IPV6_PREFER_SRC_CGAMASK) == 0) {
4906		prefs |= IPV6_PREFER_SRC_CGADEFAULT;
4907	} else if ((prefs & IPV6_PREFER_SRC_CGAMASK) ==
4908	    IPV6_PREFER_SRC_CGAMASK) {
4909		return (EINVAL);
4910	}
4911
4912	ixa->ixa_src_preferences = prefs;
4913	return (0);
4914}
4915
4916size_t
4917ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
4918{
4919	*val = ixa->ixa_src_preferences;
4920	return (sizeof (ixa->ixa_src_preferences));
4921}
4922
4923/*
4924 * Get the size of the IP options (including the IP headers size)
4925 * without including the AH header's size. If till_ah is B_FALSE,
4926 * and if AH header is present, dest options beyond AH header will
4927 * also be included in the returned size.
4928 */
4929int
4930ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
4931{
4932	ip6_t *ip6h;
4933	uint8_t nexthdr;
4934	uint8_t *whereptr;
4935	ip6_hbh_t *hbhhdr;
4936	ip6_dest_t *dsthdr;
4937	ip6_rthdr_t *rthdr;
4938	int ehdrlen;
4939	int size;
4940	ah_t *ah;
4941
4942	ip6h = (ip6_t *)mp->b_rptr;
4943	size = IPV6_HDR_LEN;
4944	nexthdr = ip6h->ip6_nxt;
4945	whereptr = (uint8_t *)&ip6h[1];
4946	for (;;) {
4947		/* Assume IP has already stripped it */
4948		ASSERT(nexthdr != IPPROTO_FRAGMENT);
4949		switch (nexthdr) {
4950		case IPPROTO_HOPOPTS:
4951			hbhhdr = (ip6_hbh_t *)whereptr;
4952			nexthdr = hbhhdr->ip6h_nxt;
4953			ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
4954			break;
4955		case IPPROTO_DSTOPTS:
4956			dsthdr = (ip6_dest_t *)whereptr;
4957			nexthdr = dsthdr->ip6d_nxt;
4958			ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4959			break;
4960		case IPPROTO_ROUTING:
4961			rthdr = (ip6_rthdr_t *)whereptr;
4962			nexthdr = rthdr->ip6r_nxt;
4963			ehdrlen = 8 * (rthdr->ip6r_len + 1);
4964			break;
4965		default :
4966			if (till_ah) {
4967				ASSERT(nexthdr == IPPROTO_AH);
4968				return (size);
4969			}
4970			/*
4971			 * If we don't have a AH header to traverse,
4972			 * return now. This happens normally for
4973			 * outbound datagrams where we have not inserted
4974			 * the AH header.
4975			 */
4976			if (nexthdr != IPPROTO_AH) {
4977				return (size);
4978			}
4979
4980			/*
4981			 * We don't include the AH header's size
4982			 * to be symmetrical with other cases where
4983			 * we either don't have a AH header (outbound)
4984			 * or peek into the AH header yet (inbound and
4985			 * not pulled up yet).
4986			 */
4987			ah = (ah_t *)whereptr;
4988			nexthdr = ah->ah_nexthdr;
4989			ehdrlen = (ah->ah_length << 2) + 8;
4990
4991			if (nexthdr == IPPROTO_DSTOPTS) {
4992				if (whereptr + ehdrlen >= mp->b_wptr) {
4993					/*
4994					 * The destination options header
4995					 * is not part of the first mblk.
4996					 */
4997					whereptr = mp->b_cont->b_rptr;
4998				} else {
4999					whereptr += ehdrlen;
5000				}
5001
5002				dsthdr = (ip6_dest_t *)whereptr;
5003				ehdrlen = 8 * (dsthdr->ip6d_len + 1);
5004				size += ehdrlen;
5005			}
5006			return (size);
5007		}
5008		whereptr += ehdrlen;
5009		size += ehdrlen;
5010	}
5011}
5012
5013/*
5014 * Utility routine that checks if `v6srcp' is a valid address on underlying
5015 * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
5016 * associated with `v6srcp' on success.  NOTE: if this is not called from
5017 * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
5018 * group during or after this lookup.
5019 */
5020boolean_t
5021ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
5022{
5023	ipif_t *ipif;
5024
5025
5026	ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
5027	if (ipif != NULL) {
5028		if (ipifp != NULL)
5029			*ipifp = ipif;
5030		else
5031			ipif_refrele(ipif);
5032		return (B_TRUE);
5033	}
5034
5035	if (ip_debug > 2) {
5036		pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
5037		    "src %s\n", AF_INET6, v6srcp);
5038	}
5039	return (B_FALSE);
5040}
5041