1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
25 * Copyright (c) 2018, Joyent, Inc.
26 */
27/* Copyright (c) 1990 Mentat Inc. */
28
29#include <sys/types.h>
30#include <sys/stream.h>
31#include <sys/stropts.h>
32#include <sys/strlog.h>
33#include <sys/strsun.h>
34#define	_SUN_TPI_VERSION 2
35#include <sys/tihdr.h>
36#include <sys/timod.h>
37#include <sys/ddi.h>
38#include <sys/sunddi.h>
39#include <sys/strsubr.h>
40#include <sys/suntpi.h>
41#include <sys/xti_inet.h>
42#include <sys/cmn_err.h>
43#include <sys/kmem.h>
44#include <sys/cred.h>
45#include <sys/policy.h>
46#include <sys/priv.h>
47#include <sys/ucred.h>
48#include <sys/zone.h>
49
50#include <sys/sockio.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/vtrace.h>
54#include <sys/sdt.h>
55#include <sys/debug.h>
56#include <sys/isa_defs.h>
57#include <sys/random.h>
58#include <netinet/in.h>
59#include <netinet/ip6.h>
60#include <netinet/icmp6.h>
61#include <netinet/udp.h>
62
63#include <inet/common.h>
64#include <inet/ip.h>
65#include <inet/ip_impl.h>
66#include <inet/ipsec_impl.h>
67#include <inet/ip6.h>
68#include <inet/ip_ire.h>
69#include <inet/ip_if.h>
70#include <inet/ip_multi.h>
71#include <inet/ip_ndp.h>
72#include <inet/proto_set.h>
73#include <inet/mib2.h>
74#include <inet/nd.h>
75#include <inet/optcom.h>
76#include <inet/snmpcom.h>
77#include <inet/kstatcom.h>
78#include <inet/ipclassifier.h>
79
80#include <sys/tsol/label.h>
81#include <sys/tsol/tnet.h>
82
83#include <inet/rawip_impl.h>
84
85#include <sys/disp.h>
86
87/*
88 * Synchronization notes:
89 *
90 * RAWIP is MT and uses the usual kernel synchronization primitives. We use
91 * conn_lock to protect the icmp_t.
92 *
93 * Plumbing notes:
94 * ICMP is always a device driver. For compatibility with mibopen() code
95 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
96 * dummy module.
97 */
/*
 * Forward declarations.
 *
 * Most routines are file-local (static).  The option get/set routines and
 * the rawip_getsockname/rawip_getpeername routines are declared without
 * static and therefore have external linkage.
 */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags, cred_t *);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Option set/get: not static. */
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
		    int level, int name, uint_t inlen,
		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
		    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
		    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
		    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static int	icmp_wput(queue_t *q, mblk_t *mp);
static int	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack (netstackid_t) init/fini hooks */
static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat hooks and stack shutdown */
static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Name queries taking a sock_lower_handle_t; not static. */
int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
167
/*
 * STREAMS module_info shared by all of the qinit structures below.
 * The initializer is positional; per module_info(9S) the fields are, in
 * order: module id number, module name, minimum packet size, maximum
 * packet size (INFPSZ), high-water mark and low-water mark.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 *
 * The qinit initializers are positional; per qinit(9S) the leading fields
 * are put procedure, service procedure, open, close, admin and module_info.
 * The read-side structures therefore supply only open and close.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/* Write side, shared by both devices: icmp_wput as put, ip_wsrv as service. */
static struct qinit icmpwinit = {
	icmp_wput, ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
202
/*
 * Default structure copied into T_INFO_ACK messages.
 * The initializer is positional; the comment on each line names the
 * T_info_ack field that the value is intended for.  ADDR_size and OPT_size
 * are left zero here and are expected to be filled in by the code that
 * copies this template.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
217
218static int
219icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
220    const char *ifname, const void *pval, uint_t flags)
221{
222	return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
223	    stack, cr, pinfo, ifname, pval, flags));
224}
225
226static int
227icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
228    void *val, uint_t psize, uint_t flags)
229{
230	return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
231	    pinfo, ifname, val, psize, flags));
232}
233
234/*
235 * All of these are alterable, within the min/max values given, at run time.
236 *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
239 */
/*
 * Each entry gives the property name, the owning protocol, the set and get
 * callbacks, and two brace-enclosed value groups.
 * NOTE(review): the groups look like {min, max, current} (or {current} for
 * booleans) followed by {default} - confirm against mod_prop_info_t.
 *
 * Entry order matters: the is_* macros that follow the table index
 * is_propinfo_tbl by position.
 * NOTE(review): presumably is_propinfo_tbl is a per-stack copy of this
 * table - confirm in the stack init code.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0: is_wroff_extra */
	{ "_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	/* tunable - 1: is_ipv4_ttl */
	{ "_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	/* tunable - 2: is_ipv6_hoplimit */
	{ "_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	/* tunable - 3: is_bsd_compat */
	{ "_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 4: is_xmit_hiwat */
	{ "send_buf", MOD_PROTO_RAWIP,
	    icmp_set_buf_prop, icmp_get_buf_prop,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 5: is_xmit_lowat */
	{ "_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	/* tunable - 6: is_recv_hiwat */
	{ "recv_buf", MOD_PROTO_RAWIP,
	    icmp_set_buf_prop, icmp_get_buf_prop,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 7: is_max_buf */
	{ "max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },

	/* tunable - 8: is_pmtu_discovery */
	{ "_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* tunable - 9: is_sendto_ignerr */
	{ "_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* "?" has no set callback; its get callback is mod_get_allprop */
	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	/* NULL-named terminator entry */
	{ NULL, 0, NULL, NULL, {0}, {0} }
};

/*
 * Shorthand for the current value of each tunable.  The index must match
 * the position of the corresponding entry in the table above; unsigned
 * tunables use prop_cur_uval and boolean ones use prop_cur_bval.
 */
#define	is_wroff_extra			is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl			is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit		is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat			is_propinfo_tbl[3].prop_cur_bval
#define	is_xmit_hiwat			is_propinfo_tbl[4].prop_cur_uval
#define	is_xmit_lowat			is_propinfo_tbl[5].prop_cur_uval
#define	is_recv_hiwat			is_propinfo_tbl[6].prop_cur_uval
#define	is_max_buf			is_propinfo_tbl[7].prop_cur_uval
#define	is_pmtu_discovery		is_propinfo_tbl[8].prop_cur_bval
#define	is_sendto_ignerr		is_propinfo_tbl[9].prop_cur_bval
298
/* Pointer to the union of TPI primitive structures. */
typedef union T_primitives *t_primp_t;
300
301/*
302 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
303 * passed to icmp_wput.
304 * It calls IP to verify the local IP address, and calls IP to insert
305 * the conn_t in the fanout table.
306 * If everything is ok it then sends the T_BIND_ACK back up.
307 */
308static void
309icmp_tpi_bind(queue_t *q, mblk_t *mp)
310{
311	int	error;
312	struct sockaddr *sa;
313	struct T_bind_req *tbr;
314	socklen_t	len;
315	sin_t	*sin;
316	sin6_t	*sin6;
317	icmp_t		*icmp;
318	conn_t	*connp = Q_TO_CONN(q);
319	mblk_t *mp1;
320	cred_t *cr;
321
322	/*
323	 * All Solaris components should pass a db_credp
324	 * for this TPI message, hence we ASSERT.
325	 * But in case there is some other M_PROTO that looks
326	 * like a TPI message sent by some other kernel
327	 * component, we check and return an error.
328	 */
329	cr = msg_getcred(mp, NULL);
330	ASSERT(cr != NULL);
331	if (cr == NULL) {
332		icmp_err_ack(q, mp, TSYSERR, EINVAL);
333		return;
334	}
335
336	icmp = connp->conn_icmp;
337	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
338		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
339		    "icmp_bind: bad req, len %u",
340		    (uint_t)(mp->b_wptr - mp->b_rptr));
341		icmp_err_ack(q, mp, TPROTO, 0);
342		return;
343	}
344
345	if (icmp->icmp_state != TS_UNBND) {
346		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
347		    "icmp_bind: bad state, %u", icmp->icmp_state);
348		icmp_err_ack(q, mp, TOUTSTATE, 0);
349		return;
350	}
351
352	/*
353	 * Reallocate the message to make sure we have enough room for an
354	 * address.
355	 */
356	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
357	if (mp1 == NULL) {
358		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
359		return;
360	}
361	mp = mp1;
362
363	/* Reset the message type in preparation for shipping it back. */
364	DB_TYPE(mp) = M_PCPROTO;
365	tbr = (struct T_bind_req *)mp->b_rptr;
366	len = tbr->ADDR_length;
367	switch (len) {
368	case 0:	/* request for a generic port */
369		tbr->ADDR_offset = sizeof (struct T_bind_req);
370		if (connp->conn_family == AF_INET) {
371			tbr->ADDR_length = sizeof (sin_t);
372			sin = (sin_t *)&tbr[1];
373			*sin = sin_null;
374			sin->sin_family = AF_INET;
375			mp->b_wptr = (uchar_t *)&sin[1];
376			sa = (struct sockaddr *)sin;
377			len = sizeof (sin_t);
378		} else {
379			ASSERT(connp->conn_family == AF_INET6);
380			tbr->ADDR_length = sizeof (sin6_t);
381			sin6 = (sin6_t *)&tbr[1];
382			*sin6 = sin6_null;
383			sin6->sin6_family = AF_INET6;
384			mp->b_wptr = (uchar_t *)&sin6[1];
385			sa = (struct sockaddr *)sin6;
386			len = sizeof (sin6_t);
387		}
388		break;
389
390	case sizeof (sin_t):	/* Complete IPv4 address */
391		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
392		    sizeof (sin_t));
393		break;
394
395	case sizeof (sin6_t):	/* Complete IPv6 address */
396		sa = (struct sockaddr *)mi_offset_param(mp,
397		    tbr->ADDR_offset, sizeof (sin6_t));
398		break;
399
400	default:
401		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
402		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
403		icmp_err_ack(q, mp, TBADADDR, 0);
404		return;
405	}
406
407	error = rawip_do_bind(connp, sa, len);
408	if (error != 0) {
409		if (error > 0) {
410			icmp_err_ack(q, mp, TSYSERR, error);
411		} else {
412			icmp_err_ack(q, mp, -error, 0);
413		}
414	} else {
415		tbr->PRIM_type = T_BIND_ACK;
416		qreply(q, mp);
417	}
418}
419
/*
 * Common bind routine; icmp_tpi_bind() is the TPI caller.
 *
 * The address must be a sin_t or sin6_t (selected by len) whose family
 * matches the conn's family.  A non-wildcard local address is verified with
 * ip_laddr_verify_v4()/ip_laddr_verify_v6().  Under conn_lock the bound,
 * local and source addresses, the scope id and the local port are recorded
 * in the conn_t and the endpoint moves from TS_UNBND to TS_IDLE.  Then the
 * header template is rebuilt and the conn is inserted in the fanout with
 * ip_laddr_fanout_insert().
 *
 * Returns 0 on success, a positive errno (EINVAL, EAFNOSUPPORT,
 * EADDRNOTAVAIL, or whatever the template/fanout routines return), or
 * -TOUTSTATE if the endpoint is not in TS_UNBND.  Failures after the conn_t
 * has been modified go through late_error, which resets the addresses, the
 * port, the scope id flag and the state.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
	sin_t		*sin;
	sin6_t		*sin6;
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	in_port_t	lport;		/* Network byte order */
	ipaddr_t	v4src;		/* Set if AF_INET */
	in6_addr_t	v6src;
	uint_t		scopeid = 0;
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* Reject a missing address or one that fails the OK_32PTR check. */
	if (sa == NULL || !OK_32PTR((char *)sa)) {
		return (EINVAL);
	}

	switch (len) {
	case sizeof (sin_t):    /* Complete IPv4 address */
		sin = (sin_t *)sa;
		if (sin->sin_family != AF_INET ||
		    connp->conn_family != AF_INET) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		v4src = sin->sin_addr.s_addr;
		/* The conn_t stores addresses as in6_addr_t; v4 is mapped. */
		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
		/* INADDR_ANY keeps the default IPVL_UNICAST_UP type. */
		if (v4src != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
			    B_TRUE);
		}
		lport = sin->sin_port;
		break;
	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		if (sin6->sin6_family != AF_INET6 ||
		    connp->conn_family != AF_INET6) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/* TSYSERR, EADDRNOTAVAIL */
			return (EADDRNOTAVAIL);
		}
		v6src = sin6->sin6_addr;
		/* The unspecified address keeps the default type. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			/* sin6_scope_id is only used for link-scope addrs */
			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
				scopeid = sin6->sin6_scope_id;
			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
			    B_TRUE, scopeid);
		}
		lport = sin6->sin6_port;
		break;

	default:
		/* TBADADDR */
		return (EADDRNOTAVAIL);
	}

	/* Is the local address a valid unicast, multicast, or broadcast? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	/*
	 * The state must be TS_UNBND.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_state != TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		return (-TOUTSTATE);
	}

	/*
	 * Copy the source address into our icmp structure.  This address
	 * may still be zero; if so, ip will fill in the correct address
	 * each time an outbound packet is passed to it.
	 * If we are binding to a broadcast or multicast address then
	 * we just set the conn_bound_addr since we don't want to use
	 * that as the source address when sending.
	 */
	connp->conn_bound_addr_v6 = v6src;
	connp->conn_laddr_v6 = v6src;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
		connp->conn_ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	/* IPVL_BAD was rejected above, so it needs no case here. */
	switch (laddr_type) {
	case IPVL_UNICAST_UP:
	case IPVL_UNICAST_DOWN:
		connp->conn_saddr_v6 = v6src;
		connp->conn_mcbc_bind = B_FALSE;
		break;
	case IPVL_MCAST:
	case IPVL_BCAST:
		/* ip_set_destination will pick a source address later */
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_mcbc_bind = B_TRUE;
		break;
	}

	/* Any errors after this point should use late_error */

	/*
	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
	 * with IPPROTO_TCP.
	 */
	connp->conn_lport = lport;
	connp->conn_fport = 0;

	if (connp->conn_family == AF_INET) {
		ASSERT(connp->conn_ipversion == IPV4_VERSION);
	} else {
		ASSERT(connp->conn_ipversion == IPV6_VERSION);
	}

	icmp->icmp_state = TS_IDLE;

	/*
	 * We create an initial header template here to make a subsequent
	 * sendto have a starting point. Since conn_last_dst is zero the
	 * first sendto will always follow the 'dst changed' code path.
	 * Note that we defer massaging options and the related checksum
	 * adjustment until we have a destination address.
	 */
	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	if (error != 0) {
		/* late_error takes conn_lock again, so drop it first. */
		mutex_exit(&connp->conn_lock);
		goto late_error;
	}
	/* Just in case */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	/* The fanout insert is done without holding conn_lock. */
	error = ip_laddr_fanout_insert(connp);
	if (error != 0)
		goto late_error;

	/* Bind succeeded */
	return (0);

late_error:
	/* Entered without conn_lock held; undo the changes made above. */
	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}
	icmp->icmp_state = TS_UNBND;
	connp->conn_v6lastdst = ipv6_all_zeros;
	connp->conn_lport = 0;

	/* Restore the header that was built above - different source address */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	return (error);
}
589
590/*
591 * Tell IP to just bind to the protocol.
592 */
593static void
594icmp_bind_proto(icmp_t *icmp)
595{
596	conn_t	*connp = icmp->icmp_connp;
597
598	mutex_enter(&connp->conn_lock);
599	connp->conn_saddr_v6 = ipv6_all_zeros;
600	connp->conn_laddr_v6 = ipv6_all_zeros;
601	connp->conn_faddr_v6 = ipv6_all_zeros;
602	connp->conn_v6lastdst = ipv6_all_zeros;
603	mutex_exit(&connp->conn_lock);
604
605	(void) ip_laddr_fanout_insert(connp);
606}
607
608/*
609 * This routine handles each T_CONN_REQ message passed to icmp.  It
610 * associates a default destination address with the stream.
611 *
612 * After various error checks are completed, icmp_connect() lays
613 * the target address and port into the composite header template.
614 * Then we ask IP for information, including a source address if we didn't
615 * already have one. Finally we send up the T_OK_ACK reply message.
616 */
617static void
618icmp_tpi_connect(queue_t *q, mblk_t *mp)
619{
620	conn_t	*connp = Q_TO_CONN(q);
621	struct T_conn_req	*tcr;
622	struct sockaddr *sa;
623	socklen_t len;
624	int error;
625	cred_t *cr;
626	pid_t pid;
627	/*
628	 * All Solaris components should pass a db_credp
629	 * for this TPI message, hence we ASSERT.
630	 * But in case there is some other M_PROTO that looks
631	 * like a TPI message sent by some other kernel
632	 * component, we check and return an error.
633	 */
634	cr = msg_getcred(mp, &pid);
635	ASSERT(cr != NULL);
636	if (cr == NULL) {
637		icmp_err_ack(q, mp, TSYSERR, EINVAL);
638		return;
639	}
640
641	tcr = (struct T_conn_req *)mp->b_rptr;
642	/* Sanity checks */
643	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
644		icmp_err_ack(q, mp, TPROTO, 0);
645		return;
646	}
647
648	if (tcr->OPT_length != 0) {
649		icmp_err_ack(q, mp, TBADOPT, 0);
650		return;
651	}
652
653	len = tcr->DEST_length;
654
655	switch (len) {
656	default:
657		icmp_err_ack(q, mp, TBADADDR, 0);
658		return;
659	case sizeof (sin_t):
660		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
661		    sizeof (sin_t));
662		break;
663	case sizeof (sin6_t):
664		sa = (struct sockaddr *)mi_offset_param(mp,
665		    tcr->DEST_offset, sizeof (sin6_t));
666		break;
667	}
668
669	error = proto_verify_ip_addr(connp->conn_family, sa, len);
670	if (error != 0) {
671		icmp_err_ack(q, mp, TSYSERR, error);
672		return;
673	}
674
675	error = rawip_do_connect(connp, sa, len, cr, pid);
676	if (error != 0) {
677		if (error < 0) {
678			icmp_err_ack(q, mp, -error, 0);
679		} else {
680			icmp_err_ack(q, mp, 0, error);
681		}
682	} else {
683		mblk_t *mp1;
684
685		/*
686		 * We have to send a connection confirmation to
687		 * keep TLI happy.
688		 */
689		if (connp->conn_family == AF_INET) {
690			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
691			    sizeof (sin_t), NULL, 0);
692		} else {
693			ASSERT(connp->conn_family == AF_INET6);
694			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
695			    sizeof (sin6_t), NULL, 0);
696		}
697		if (mp1 == NULL) {
698			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
699			return;
700		}
701
702		/*
703		 * Send ok_ack for T_CONN_REQ
704		 */
705		mp = mi_tpi_ok_ack_alloc(mp);
706		if (mp == NULL) {
707			/* Unable to reuse the T_CONN_REQ for the ack. */
708			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
709			return;
710		}
711		putnext(connp->conn_rq, mp);
712		putnext(connp->conn_rq, mp1);
713	}
714}
715
716static int
717rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
718    cred_t *cr, pid_t pid)
719{
720	icmp_t		*icmp;
721	sin_t		*sin;
722	sin6_t		*sin6;
723	int		error;
724	uint16_t	dstport;
725	ipaddr_t	v4dst;
726	in6_addr_t	v6dst;
727	uint32_t	flowinfo;
728	ip_xmit_attr_t	*ixa;
729	ip_xmit_attr_t	*oldixa;
730	uint_t		scopeid = 0;
731	uint_t		srcid = 0;
732	in6_addr_t	v6src = connp->conn_saddr_v6;
733
734	icmp = connp->conn_icmp;
735
736	if (sa == NULL || !OK_32PTR((char *)sa)) {
737		return (EINVAL);
738	}
739
740	ASSERT(sa != NULL && len != 0);
741	sin = NULL;
742	sin6 = NULL;
743	dstport = 0;
744	flowinfo = 0;
745	v4dst = INADDR_ANY;
746
747	/*
748	 * Determine packet type based on type of address passed in
749	 * the request should contain an IPv4 or IPv6 address.
750	 * Make sure that address family matches the type of
751	 * family of the address passed down.
752	 */
753	switch (len) {
754	case sizeof (sin_t):
755		sin = (sin_t *)sa;
756
757		v4dst = sin->sin_addr.s_addr;
758		dstport = sin->sin_port;
759		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
760		ASSERT(connp->conn_ipversion == IPV4_VERSION);
761		break;
762
763	case sizeof (sin6_t):
764		sin6 = (sin6_t *)sa;
765
766		/* No support for mapped addresses on raw sockets */
767		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
768			return (EADDRNOTAVAIL);
769		}
770		v6dst = sin6->sin6_addr;
771		dstport = sin6->sin6_port;
772		ASSERT(connp->conn_ipversion == IPV6_VERSION);
773		flowinfo = sin6->sin6_flowinfo;
774		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
775			scopeid = sin6->sin6_scope_id;
776		srcid = sin6->__sin6_src_id;
777		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
778			/* Due to check above, we know sin6_addr is v6-only. */
779			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
780			    B_FALSE, connp->conn_netstack)) {
781				/* Mismatch - v6src would be v4mapped. */
782				return (EADDRNOTAVAIL);
783			}
784		}
785		break;
786	}
787
788	/*
789	 * If there is a different thread using conn_ixa then we get a new
790	 * copy and cut the old one loose from conn_ixa. Otherwise we use
791	 * conn_ixa and prevent any other thread from using/changing it.
792	 * Once connect() is done other threads can use conn_ixa since the
793	 * refcnt will be back at one.
794	 * We defer updating conn_ixa until later to handle any concurrent
795	 * conn_ixa_cleanup thread.
796	 */
797	ixa = conn_get_ixa(connp, B_FALSE);
798	if (ixa == NULL)
799		return (ENOMEM);
800
801	mutex_enter(&connp->conn_lock);
802	/*
803	 * This icmp_t must have bound already before doing a connect.
804	 * Reject if a connect is in progress (we drop conn_lock during
805	 * rawip_do_connect).
806	 */
807	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
808		mutex_exit(&connp->conn_lock);
809		ixa_refrele(ixa);
810		return (-TOUTSTATE);
811	}
812
813	if (icmp->icmp_state == TS_DATA_XFER) {
814		/* Already connected - clear out state */
815		if (connp->conn_mcbc_bind)
816			connp->conn_saddr_v6 = ipv6_all_zeros;
817		else
818			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
819		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
820		connp->conn_faddr_v6 = ipv6_all_zeros;
821		icmp->icmp_state = TS_IDLE;
822	}
823
824	/*
825	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
826	 * with IPPROTO_TCP.
827	 */
828	connp->conn_fport = dstport;
829	if (connp->conn_ipversion == IPV4_VERSION) {
830		/*
831		 * Interpret a zero destination to mean loopback.
832		 * Update the T_CONN_REQ (sin/sin6) since it is used to
833		 * generate the T_CONN_CON.
834		 */
835		if (v4dst == INADDR_ANY) {
836			v4dst = htonl(INADDR_LOOPBACK);
837			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
838			ASSERT(connp->conn_family == AF_INET);
839			sin->sin_addr.s_addr = v4dst;
840		}
841		connp->conn_faddr_v6 = v6dst;
842		connp->conn_flowinfo = 0;
843	} else {
844		ASSERT(connp->conn_ipversion == IPV6_VERSION);
845		/*
846		 * Interpret a zero destination to mean loopback.
847		 * Update the T_CONN_REQ (sin/sin6) since it is used to
848		 * generate the T_CONN_CON.
849		 */
850		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
851			v6dst = ipv6_loopback;
852			sin6->sin6_addr = v6dst;
853		}
854		connp->conn_faddr_v6 = v6dst;
855		connp->conn_flowinfo = flowinfo;
856	}
857
858	/*
859	 * We update our cred/cpid based on the caller of connect
860	 */
861	if (connp->conn_cred != cr) {
862		crhold(cr);
863		crfree(connp->conn_cred);
864		connp->conn_cred = cr;
865	}
866	connp->conn_cpid = pid;
867	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
868	ixa->ixa_cred = cr;
869	ixa->ixa_cpid = pid;
870	if (is_system_labeled()) {
871		/* We need to restart with a label based on the cred */
872		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
873	}
874
875	if (scopeid != 0) {
876		ixa->ixa_flags |= IXAF_SCOPEID_SET;
877		ixa->ixa_scopeid = scopeid;
878		connp->conn_incoming_ifindex = scopeid;
879	} else {
880		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
881		connp->conn_incoming_ifindex = connp->conn_bound_if;
882	}
883
884	/*
885	 * conn_connect will drop conn_lock and reacquire it.
886	 * To prevent a send* from messing with this icmp_t while the lock
887	 * is dropped we set icmp_state and clear conn_v6lastdst.
888	 * That will make all send* fail with EISCONN.
889	 */
890	connp->conn_v6lastdst = ipv6_all_zeros;
891	icmp->icmp_state = TS_WCON_CREQ;
892
893	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
894	mutex_exit(&connp->conn_lock);
895	if (error != 0)
896		goto connect_failed;
897
898	/*
899	 * The addresses have been verified. Time to insert in
900	 * the correct fanout list.
901	 */
902	error = ipcl_conn_insert(connp);
903	if (error != 0)
904		goto connect_failed;
905
906	mutex_enter(&connp->conn_lock);
907	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
908	    &connp->conn_faddr_v6, connp->conn_flowinfo);
909	if (error != 0) {
910		mutex_exit(&connp->conn_lock);
911		goto connect_failed;
912	}
913
914	icmp->icmp_state = TS_DATA_XFER;
915	/* Record this as the "last" send even though we haven't sent any */
916	connp->conn_v6lastdst = connp->conn_faddr_v6;
917	connp->conn_lastipversion = connp->conn_ipversion;
918	connp->conn_lastdstport = connp->conn_fport;
919	connp->conn_lastflowinfo = connp->conn_flowinfo;
920	connp->conn_lastscopeid = scopeid;
921	connp->conn_lastsrcid = srcid;
922	/* Also remember a source to use together with lastdst */
923	connp->conn_v6lastsrc = v6src;
924
925	oldixa = conn_replace_ixa(connp, ixa);
926	mutex_exit(&connp->conn_lock);
927	ixa_refrele(oldixa);
928
929	ixa_refrele(ixa);
930	return (0);
931
932connect_failed:
933	if (ixa != NULL)
934		ixa_refrele(ixa);
935	mutex_enter(&connp->conn_lock);
936	icmp->icmp_state = TS_IDLE;
937	/* In case the source address was set above */
938	if (connp->conn_mcbc_bind)
939		connp->conn_saddr_v6 = ipv6_all_zeros;
940	else
941		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
942	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
943	connp->conn_faddr_v6 = ipv6_all_zeros;
944	connp->conn_v6lastdst = ipv6_all_zeros;
945	connp->conn_flowinfo = 0;
946
947	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
948	    &connp->conn_faddr_v6, connp->conn_flowinfo);
949	mutex_exit(&connp->conn_lock);
950	return (error);
951}
952
953static void
954rawip_do_close(conn_t *connp)
955{
956	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
957
958	ip_quiesce_conn(connp);
959
960	if (!IPCL_IS_NONSTR(connp)) {
961		qprocsoff(connp->conn_rq);
962	}
963
964	icmp_close_free(connp);
965
966	/*
967	 * Now we are truly single threaded on this stream, and can
968	 * delete the things hanging off the connp, and finally the connp.
969	 * We removed this connp from the fanout list, it cannot be
970	 * accessed thru the fanouts, and we already waited for the
971	 * conn_ref to drop to 0. We are already in close, so
972	 * there cannot be any other thread from the top. qprocsoff
973	 * has completed, and service has completed or won't run in
974	 * future.
975	 */
976	ASSERT(connp->conn_ref == 1);
977
978	if (!IPCL_IS_NONSTR(connp)) {
979		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
980	} else {
981		ip_free_helper_stream(connp);
982	}
983
984	connp->conn_ref--;
985	ipcl_conn_destroy(connp);
986}
987
988/* ARGSUSED */
989static int
990icmp_close(queue_t *q, int flags, cred_t *credp __unused)
991{
992	conn_t  *connp;
993
994	if (flags & SO_FALLBACK) {
995		/*
996		 * stream is being closed while in fallback
997		 * simply free the resources that were allocated
998		 */
999		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
1000		qprocsoff(q);
1001		goto done;
1002	}
1003
1004	connp = Q_TO_CONN(q);
1005	(void) rawip_do_close(connp);
1006done:
1007	q->q_ptr = WR(q)->q_ptr = NULL;
1008	return (0);
1009}
1010
1011static void
1012icmp_close_free(conn_t *connp)
1013{
1014	icmp_t *icmp = connp->conn_icmp;
1015
1016	if (icmp->icmp_filter != NULL) {
1017		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
1018		icmp->icmp_filter = NULL;
1019	}
1020
1021	/*
1022	 * Clear any fields which the kmem_cache constructor clears.
1023	 * Only icmp_connp needs to be preserved.
1024	 * TBD: We should make this more efficient to avoid clearing
1025	 * everything.
1026	 */
1027	ASSERT(icmp->icmp_connp == connp);
1028	bzero(icmp, sizeof (icmp_t));
1029	icmp->icmp_connp = connp;
1030}
1031
1032/*
1033 * This routine handles each T_DISCON_REQ message passed to icmp
1034 * as an indicating that ICMP is no longer connected. This results
1035 * in telling IP to restore the binding to just the local address.
1036 */
1037static int
1038icmp_do_disconnect(conn_t *connp)
1039{
1040	icmp_t	*icmp = connp->conn_icmp;
1041	int	error;
1042
1043	mutex_enter(&connp->conn_lock);
1044	if (icmp->icmp_state != TS_DATA_XFER) {
1045		mutex_exit(&connp->conn_lock);
1046		return (-TOUTSTATE);
1047	}
1048	if (connp->conn_mcbc_bind)
1049		connp->conn_saddr_v6 = ipv6_all_zeros;
1050	else
1051		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1052	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1053	connp->conn_faddr_v6 = ipv6_all_zeros;
1054	icmp->icmp_state = TS_IDLE;
1055
1056	connp->conn_v6lastdst = ipv6_all_zeros;
1057	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1058	    &connp->conn_faddr_v6, connp->conn_flowinfo);
1059	mutex_exit(&connp->conn_lock);
1060	if (error != 0)
1061		return (error);
1062
1063	/*
1064	 * Tell IP to remove the full binding and revert
1065	 * to the local address binding.
1066	 */
1067	return (ip_laddr_fanout_insert(connp));
1068}
1069
1070static void
1071icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1072{
1073	conn_t	*connp = Q_TO_CONN(q);
1074	int	error;
1075
1076	/*
1077	 * Allocate the largest primitive we need to send back
1078	 * T_error_ack is > than T_ok_ack
1079	 */
1080	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1081	if (mp == NULL) {
1082		/* Unable to reuse the T_DISCON_REQ for the ack. */
1083		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1084		return;
1085	}
1086
1087	error = icmp_do_disconnect(connp);
1088
1089	if (error != 0) {
1090		if (error > 0) {
1091			icmp_err_ack(q, mp, 0, error);
1092		} else {
1093			icmp_err_ack(q, mp, -error, 0);
1094		}
1095	} else {
1096		mp = mi_tpi_ok_ack_alloc(mp);
1097		ASSERT(mp != NULL);
1098		qreply(q, mp);
1099	}
1100}
1101
1102static int
1103icmp_disconnect(conn_t *connp)
1104{
1105	int	error;
1106
1107	connp->conn_dgram_errind = B_FALSE;
1108
1109	error = icmp_do_disconnect(connp);
1110
1111	if (error < 0)
1112		error = proto_tlitosyserr(-error);
1113	return (error);
1114}
1115
1116/* This routine creates a T_ERROR_ACK message and passes it upstream. */
1117static void
1118icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1119{
1120	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1121		qreply(q, mp);
1122}
1123
1124/* Shorthand to generate and send TPI error acks to our client */
1125static void
1126icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1127    t_scalar_t t_error, int sys_error)
1128{
1129	struct T_error_ack	*teackp;
1130
1131	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1132	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1133		teackp = (struct T_error_ack *)mp->b_rptr;
1134		teackp->ERROR_prim = primitive;
1135		teackp->TLI_error = t_error;
1136		teackp->UNIX_error = sys_error;
1137		qreply(q, mp);
1138	}
1139}
1140
1141/*
1142 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1143 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1144 * Assumes that IP has pulled up everything up to and including the ICMP header.
1145 */
1146/* ARGSUSED2 */
1147static void
1148icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1149{
1150	conn_t		*connp = (conn_t *)arg1;
1151	icmp_t		*icmp = connp->conn_icmp;
1152	icmph_t		*icmph;
1153	ipha_t		*ipha;
1154	int		iph_hdr_length;
1155	sin_t		sin;
1156	mblk_t		*mp1;
1157	int		error = 0;
1158
1159	ipha = (ipha_t *)mp->b_rptr;
1160
1161	ASSERT(OK_32PTR(mp->b_rptr));
1162
1163	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1164		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1165		icmp_icmp_error_ipv6(connp, mp, ira);
1166		return;
1167	}
1168	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1169
1170	/* Skip past the outer IP and ICMP headers */
1171	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1172	iph_hdr_length = ira->ira_ip_hdr_length;
1173	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1174	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
1175
1176	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1177
1178	switch (icmph->icmph_type) {
1179	case ICMP_DEST_UNREACHABLE:
1180		switch (icmph->icmph_code) {
1181		case ICMP_FRAGMENTATION_NEEDED: {
1182			ipha_t		*ipha;
1183			ip_xmit_attr_t	*ixa;
1184			/*
1185			 * IP has already adjusted the path MTU.
1186			 * But we need to adjust DF for IPv4.
1187			 */
1188			if (connp->conn_ipversion != IPV4_VERSION)
1189				break;
1190
1191			ixa = conn_get_ixa(connp, B_FALSE);
1192			if (ixa == NULL || ixa->ixa_ire == NULL) {
1193				/*
1194				 * Some other thread holds conn_ixa. We will
1195				 * redo this on the next ICMP too big.
1196				 */
1197				if (ixa != NULL)
1198					ixa_refrele(ixa);
1199				break;
1200			}
1201			(void) ip_get_pmtu(ixa);
1202
1203			mutex_enter(&connp->conn_lock);
1204			ipha = (ipha_t *)connp->conn_ht_iphc;
1205			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1206				ipha->ipha_fragment_offset_and_flags |=
1207				    IPH_DF_HTONS;
1208			} else {
1209				ipha->ipha_fragment_offset_and_flags &=
1210				    ~IPH_DF_HTONS;
1211			}
1212			mutex_exit(&connp->conn_lock);
1213			ixa_refrele(ixa);
1214			break;
1215		}
1216		case ICMP_PORT_UNREACHABLE:
1217		case ICMP_PROTOCOL_UNREACHABLE:
1218			error = ECONNREFUSED;
1219			break;
1220		default:
1221			/* Transient errors */
1222			break;
1223		}
1224		break;
1225	default:
1226		/* Transient errors */
1227		break;
1228	}
1229	if (error == 0) {
1230		freemsg(mp);
1231		return;
1232	}
1233
1234	/*
1235	 * Deliver T_UDERROR_IND when the application has asked for it.
1236	 * The socket layer enables this automatically when connected.
1237	 */
1238	if (!connp->conn_dgram_errind) {
1239		freemsg(mp);
1240		return;
1241	}
1242
1243	sin = sin_null;
1244	sin.sin_family = AF_INET;
1245	sin.sin_addr.s_addr = ipha->ipha_dst;
1246
1247	if (IPCL_IS_NONSTR(connp)) {
1248		mutex_enter(&connp->conn_lock);
1249		if (icmp->icmp_state == TS_DATA_XFER) {
1250			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1251				mutex_exit(&connp->conn_lock);
1252				(*connp->conn_upcalls->su_set_error)
1253				    (connp->conn_upper_handle, error);
1254				goto done;
1255			}
1256		} else {
1257			icmp->icmp_delayed_error = error;
1258			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1259		}
1260		mutex_exit(&connp->conn_lock);
1261	} else {
1262		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1263		    error);
1264		if (mp1 != NULL)
1265			putnext(connp->conn_rq, mp1);
1266	}
1267done:
1268	freemsg(mp);
1269}
1270
1271/*
1272 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1273 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1274 * Assumes that IP has pulled up all the extension headers as well as the
1275 * ICMPv6 header.
1276 */
1277static void
1278icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1279{
1280	icmp6_t		*icmp6;
1281	ip6_t		*ip6h, *outer_ip6h;
1282	uint16_t	iph_hdr_length;
1283	uint8_t		*nexthdrp;
1284	sin6_t		sin6;
1285	mblk_t		*mp1;
1286	int		error = 0;
1287	icmp_t		*icmp = connp->conn_icmp;
1288
1289	outer_ip6h = (ip6_t *)mp->b_rptr;
1290#ifdef DEBUG
1291	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1292		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1293	else
1294		iph_hdr_length = IPV6_HDR_LEN;
1295	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1296#endif
1297	/* Skip past the outer IP and ICMP headers */
1298	iph_hdr_length = ira->ira_ip_hdr_length;
1299	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1300
1301	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
1302	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1303		freemsg(mp);
1304		return;
1305	}
1306
1307	switch (icmp6->icmp6_type) {
1308	case ICMP6_DST_UNREACH:
1309		switch (icmp6->icmp6_code) {
1310		case ICMP6_DST_UNREACH_NOPORT:
1311			error = ECONNREFUSED;
1312			break;
1313		case ICMP6_DST_UNREACH_ADMIN:
1314		case ICMP6_DST_UNREACH_NOROUTE:
1315		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1316		case ICMP6_DST_UNREACH_ADDR:
1317			/* Transient errors */
1318			break;
1319		default:
1320			break;
1321		}
1322		break;
1323	case ICMP6_PACKET_TOO_BIG: {
1324		struct T_unitdata_ind	*tudi;
1325		struct T_opthdr		*toh;
1326		size_t			udi_size;
1327		mblk_t			*newmp;
1328		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1329		    sizeof (struct ip6_mtuinfo);
1330		sin6_t			*sin6;
1331		struct ip6_mtuinfo	*mtuinfo;
1332
1333		/*
1334		 * If the application has requested to receive path mtu
1335		 * information, send up an empty message containing an
1336		 * IPV6_PATHMTU ancillary data item.
1337		 */
1338		if (!connp->conn_ipv6_recvpathmtu)
1339			break;
1340
1341		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1342		    opt_length;
1343		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1344			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1345			break;
1346		}
1347
1348		/*
1349		 * newmp->b_cont is left to NULL on purpose.  This is an
1350		 * empty message containing only ancillary data.
1351		 */
1352		newmp->b_datap->db_type = M_PROTO;
1353		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1354		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1355		tudi->PRIM_type = T_UNITDATA_IND;
1356		tudi->SRC_length = sizeof (sin6_t);
1357		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1358		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1359		tudi->OPT_length = opt_length;
1360
1361		sin6 = (sin6_t *)&tudi[1];
1362		bzero(sin6, sizeof (sin6_t));
1363		sin6->sin6_family = AF_INET6;
1364		sin6->sin6_addr = connp->conn_faddr_v6;
1365
1366		toh = (struct T_opthdr *)&sin6[1];
1367		toh->level = IPPROTO_IPV6;
1368		toh->name = IPV6_PATHMTU;
1369		toh->len = opt_length;
1370		toh->status = 0;
1371
1372		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1373		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1374		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1375		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1376		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1377		/*
1378		 * We've consumed everything we need from the original
1379		 * message.  Free it, then send our empty message.
1380		 */
1381		freemsg(mp);
1382		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1383		return;
1384	}
1385	case ICMP6_TIME_EXCEEDED:
1386		/* Transient errors */
1387		break;
1388	case ICMP6_PARAM_PROB:
1389		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1390		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1391		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1392		    (uchar_t *)nexthdrp) {
1393			error = ECONNREFUSED;
1394			break;
1395		}
1396		break;
1397	}
1398	if (error == 0) {
1399		freemsg(mp);
1400		return;
1401	}
1402
1403	/*
1404	 * Deliver T_UDERROR_IND when the application has asked for it.
1405	 * The socket layer enables this automatically when connected.
1406	 */
1407	if (!connp->conn_dgram_errind) {
1408		freemsg(mp);
1409		return;
1410	}
1411
1412	sin6 = sin6_null;
1413	sin6.sin6_family = AF_INET6;
1414	sin6.sin6_addr = ip6h->ip6_dst;
1415	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1416	if (IPCL_IS_NONSTR(connp)) {
1417		mutex_enter(&connp->conn_lock);
1418		if (icmp->icmp_state == TS_DATA_XFER) {
1419			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1420			    &connp->conn_faddr_v6)) {
1421				mutex_exit(&connp->conn_lock);
1422				(*connp->conn_upcalls->su_set_error)
1423				    (connp->conn_upper_handle, error);
1424				goto done;
1425			}
1426		} else {
1427			icmp->icmp_delayed_error = error;
1428			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1429		}
1430		mutex_exit(&connp->conn_lock);
1431	} else {
1432		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1433		    NULL, 0, error);
1434		if (mp1 != NULL)
1435			putnext(connp->conn_rq, mp1);
1436	}
1437done:
1438	freemsg(mp);
1439}
1440
1441/*
1442 * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1443 * The local address is filled in if endpoint is bound. The remote address
1444 * is filled in if remote address has been precified ("connected endpoint")
1445 * (The concept of connected CLTS sockets is alien to published TPI
1446 *  but we support it anyway).
1447 */
1448static void
1449icmp_addr_req(queue_t *q, mblk_t *mp)
1450{
1451	struct sockaddr *sa;
1452	mblk_t	*ackmp;
1453	struct T_addr_ack *taa;
1454	icmp_t	*icmp = Q_TO_ICMP(q);
1455	conn_t	*connp = icmp->icmp_connp;
1456	uint_t	addrlen;
1457
1458	/* Make it large enough for worst case */
1459	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1460	    2 * sizeof (sin6_t), 1);
1461	if (ackmp == NULL) {
1462		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1463		return;
1464	}
1465	taa = (struct T_addr_ack *)ackmp->b_rptr;
1466
1467	bzero(taa, sizeof (struct T_addr_ack));
1468	ackmp->b_wptr = (uchar_t *)&taa[1];
1469
1470	taa->PRIM_type = T_ADDR_ACK;
1471	ackmp->b_datap->db_type = M_PCPROTO;
1472
1473	if (connp->conn_family == AF_INET)
1474		addrlen = sizeof (sin_t);
1475	else
1476		addrlen = sizeof (sin6_t);
1477
1478	mutex_enter(&connp->conn_lock);
1479	/*
1480	 * Note: Following code assumes 32 bit alignment of basic
1481	 * data structures like sin_t and struct T_addr_ack.
1482	 */
1483	if (icmp->icmp_state != TS_UNBND) {
1484		/*
1485		 * Fill in local address first
1486		 */
1487		taa->LOCADDR_offset = sizeof (*taa);
1488		taa->LOCADDR_length = addrlen;
1489		sa = (struct sockaddr *)&taa[1];
1490		(void) conn_getsockname(connp, sa, &addrlen);
1491		ackmp->b_wptr += addrlen;
1492	}
1493	if (icmp->icmp_state == TS_DATA_XFER) {
1494		/*
1495		 * connected, fill remote address too
1496		 */
1497		taa->REMADDR_length = addrlen;
1498		/* assumed 32-bit alignment */
1499		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1500		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1501		(void) conn_getpeername(connp, sa, &addrlen);
1502		ackmp->b_wptr += addrlen;
1503	}
1504	mutex_exit(&connp->conn_lock);
1505	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1506	qreply(q, ackmp);
1507}
1508
1509static void
1510icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1511{
1512	conn_t		*connp = icmp->icmp_connp;
1513
1514	*tap = icmp_g_t_info_ack;
1515
1516	if (connp->conn_family == AF_INET6)
1517		tap->ADDR_size = sizeof (sin6_t);
1518	else
1519		tap->ADDR_size = sizeof (sin_t);
1520	tap->CURRENT_state = icmp->icmp_state;
1521	tap->OPT_size = icmp_max_optsize;
1522}
1523
1524static void
1525icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1526    t_uscalar_t cap_bits1)
1527{
1528	tcap->CAP_bits1 = 0;
1529
1530	if (cap_bits1 & TC1_INFO) {
1531		icmp_copy_info(&tcap->INFO_ack, icmp);
1532		tcap->CAP_bits1 |= TC1_INFO;
1533	}
1534}
1535
1536/*
1537 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1538 * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1539 * icmp_g_t_info_ack.  The current state of the stream is copied from
1540 * icmp_state.
1541 */
1542static void
1543icmp_capability_req(queue_t *q, mblk_t *mp)
1544{
1545	icmp_t			*icmp = Q_TO_ICMP(q);
1546	t_uscalar_t		cap_bits1;
1547	struct T_capability_ack	*tcap;
1548
1549	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1550
1551	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1552	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1553	if (!mp)
1554		return;
1555
1556	tcap = (struct T_capability_ack *)mp->b_rptr;
1557
1558	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1559
1560	qreply(q, mp);
1561}
1562
1563/*
1564 * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1565 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1566 * The current state of the stream is copied from icmp_state.
1567 */
1568static void
1569icmp_info_req(queue_t *q, mblk_t *mp)
1570{
1571	icmp_t	*icmp = Q_TO_ICMP(q);
1572
1573	/* Create a T_INFO_ACK message. */
1574	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1575	    T_INFO_ACK);
1576	if (!mp)
1577		return;
1578	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1579	qreply(q, mp);
1580}
1581
1582static int
1583icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1584    int family)
1585{
1586	conn_t *connp;
1587	dev_t	conn_dev;
1588	int	error;
1589
1590	/* If the stream is already open, return immediately. */
1591	if (q->q_ptr != NULL)
1592		return (0);
1593
1594	if (sflag == MODOPEN)
1595		return (EINVAL);
1596
1597	/*
1598	 * Since ICMP is not used so heavily, allocating from the small
1599	 * arena should be sufficient.
1600	 */
1601	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1602		return (EBUSY);
1603	}
1604
1605	if (flag & SO_FALLBACK) {
1606		/*
1607		 * Non streams socket needs a stream to fallback to
1608		 */
1609		RD(q)->q_ptr = (void *)conn_dev;
1610		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1611		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1612		qprocson(q);
1613		return (0);
1614	}
1615
1616	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1617	if (connp == NULL) {
1618		ASSERT(error != 0);
1619		inet_minor_free(ip_minor_arena_sa, conn_dev);
1620		return (error);
1621	}
1622
1623	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1624	connp->conn_dev = conn_dev;
1625	connp->conn_minor_arena = ip_minor_arena_sa;
1626
1627	/*
1628	 * Initialize the icmp_t structure for this stream.
1629	 */
1630	q->q_ptr = connp;
1631	WR(q)->q_ptr = connp;
1632	connp->conn_rq = q;
1633	connp->conn_wq = WR(q);
1634
1635	WR(q)->q_hiwat = connp->conn_sndbuf;
1636	WR(q)->q_lowat = connp->conn_sndlowat;
1637
1638	qprocson(q);
1639
1640	/* Set the Stream head write offset. */
1641	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1642	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1643
1644	mutex_enter(&connp->conn_lock);
1645	connp->conn_state_flags &= ~CONN_INCIPIENT;
1646	mutex_exit(&connp->conn_lock);
1647
1648	icmp_bind_proto(connp->conn_icmp);
1649
1650	return (0);
1651}
1652
1653/* For /dev/icmp aka AF_INET open */
1654static int
1655icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1656{
1657	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1658}
1659
1660/* For /dev/icmp6 aka AF_INET6 open */
1661static int
1662icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1663{
1664	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1665}
1666
1667/*
1668 * This is the open routine for icmp.  It allocates a icmp_t structure for
1669 * the stream and, on the first open of the module, creates an ND table.
1670 */
1671static conn_t *
1672rawip_do_open(int family, cred_t *credp, int *err, int flags)
1673{
1674	icmp_t	*icmp;
1675	conn_t *connp;
1676	zoneid_t zoneid;
1677	netstack_t *ns;
1678	icmp_stack_t *is;
1679	int len;
1680	boolean_t isv6 = B_FALSE;
1681
1682	*err = secpolicy_net_icmpaccess(credp);
1683	if (*err != 0)
1684		return (NULL);
1685
1686	if (family == AF_INET6)
1687		isv6 = B_TRUE;
1688
1689	ns = netstack_find_by_cred(credp);
1690	ASSERT(ns != NULL);
1691	is = ns->netstack_icmp;
1692	ASSERT(is != NULL);
1693
1694	/*
1695	 * For exclusive stacks we set the zoneid to zero
1696	 * to make ICMP operate as if in the global zone.
1697	 */
1698	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1699		zoneid = GLOBAL_ZONEID;
1700	else
1701		zoneid = crgetzoneid(credp);
1702
1703	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1704
1705	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1706	icmp = connp->conn_icmp;
1707
1708	/*
1709	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1710	 * done by netstack_find_by_cred()
1711	 */
1712	netstack_rele(ns);
1713
1714	/*
1715	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1716	 * need to lock anything.
1717	 */
1718	ASSERT(connp->conn_proto == IPPROTO_ICMP);
1719	ASSERT(connp->conn_icmp == icmp);
1720	ASSERT(icmp->icmp_connp == connp);
1721
1722	/* Set the initial state of the stream and the privilege status. */
1723	icmp->icmp_state = TS_UNBND;
1724	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1725	if (isv6) {
1726		connp->conn_family = AF_INET6;
1727		connp->conn_ipversion = IPV6_VERSION;
1728		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1729		connp->conn_proto = IPPROTO_ICMPV6;
1730		/* May be changed by a SO_PROTOTYPE socket option. */
1731		connp->conn_proto = IPPROTO_ICMPV6;
1732		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1733		connp->conn_ixa->ixa_raw_cksum_offset = 2;
1734		connp->conn_default_ttl = is->is_ipv6_hoplimit;
1735		len = sizeof (ip6_t);
1736	} else {
1737		connp->conn_family = AF_INET;
1738		connp->conn_ipversion = IPV4_VERSION;
1739		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1740		/* May be changed by a SO_PROTOTYPE socket option. */
1741		connp->conn_proto = IPPROTO_ICMP;
1742		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1743		connp->conn_default_ttl = is->is_ipv4_ttl;
1744		len = sizeof (ipha_t);
1745	}
1746	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1747
1748	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1749
1750	/*
1751	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1752	 * the checksum is provided in the pre-built packet. We clear
1753	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1754	 * complete IP header and not to compute the transport checksum.
1755	 */
1756	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1757	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1758	connp->conn_ixa->ixa_zoneid = zoneid;
1759
1760	connp->conn_zoneid = zoneid;
1761
1762	/*
1763	 * If the caller has the process-wide flag set, then default to MAC
1764	 * exempt mode.  This allows read-down to unlabeled hosts.
1765	 */
1766	if (getpflags(NET_MAC_AWARE, credp) != 0)
1767		connp->conn_mac_mode = CONN_MAC_AWARE;
1768
1769	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1770
1771	icmp->icmp_is = is;
1772
1773	connp->conn_rcvbuf = is->is_recv_hiwat;
1774	connp->conn_sndbuf = is->is_xmit_hiwat;
1775	connp->conn_sndlowat = is->is_xmit_lowat;
1776	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1777
1778	connp->conn_wroff = len + is->is_wroff_extra;
1779	connp->conn_so_type = SOCK_RAW;
1780
1781	connp->conn_recv = icmp_input;
1782	connp->conn_recvicmp = icmp_icmp_input;
1783	crhold(credp);
1784	connp->conn_cred = credp;
1785	connp->conn_cpid = curproc->p_pid;
1786	connp->conn_open_time = ddi_get_lbolt64();
1787	/* Cache things in ixa without an extra refhold */
1788	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1789	connp->conn_ixa->ixa_cred = connp->conn_cred;
1790	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1791	if (is_system_labeled())
1792		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1793
1794	connp->conn_flow_cntrld = B_FALSE;
1795
1796	if (is->is_pmtu_discovery)
1797		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1798
1799	return (connp);
1800}
1801
1802/*
1803 * Which ICMP options OK to set through T_UNITDATA_REQ...
1804 */
1805/* ARGSUSED */
1806static boolean_t
1807icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1808{
1809	return (B_TRUE);
1810}
1811
1812/*
1813 * This routine gets default values of certain options whose default
1814 * values are maintained by protcol specific code
1815 */
1816int
1817icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1818{
1819	icmp_t *icmp = Q_TO_ICMP(q);
1820	icmp_stack_t *is = icmp->icmp_is;
1821	int *i1 = (int *)ptr;
1822
1823	switch (level) {
1824	case IPPROTO_IP:
1825		switch (name) {
1826		case IP_MULTICAST_TTL:
1827			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1828			return (sizeof (uchar_t));
1829		case IP_MULTICAST_LOOP:
1830			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1831			return (sizeof (uchar_t));
1832		}
1833		break;
1834	case IPPROTO_IPV6:
1835		switch (name) {
1836		case IPV6_MULTICAST_HOPS:
1837			*i1 = IP_DEFAULT_MULTICAST_TTL;
1838			return (sizeof (int));
1839		case IPV6_MULTICAST_LOOP:
1840			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1841			return (sizeof (int));
1842		case IPV6_UNICAST_HOPS:
1843			*i1 = is->is_ipv6_hoplimit;
1844			return (sizeof (int));
1845		}
1846		break;
1847	case IPPROTO_ICMPV6:
1848		switch (name) {
1849		case ICMP6_FILTER:
1850			/* Make it look like "pass all" */
1851			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1852			return (sizeof (icmp6_filter_t));
1853		}
1854		break;
1855	}
1856	return (-1);
1857}
1858
1859/*
1860 * This routine retrieves the current status of socket options.
1861 * It returns the size of the option retrieved, or -1.
1862 */
1863int
1864icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1865{
1866	icmp_t		*icmp = connp->conn_icmp;
1867	int		*i1 = (int *)ptr;
1868	conn_opt_arg_t	coas;
1869	int		retval;
1870
1871	coas.coa_connp = connp;
1872	coas.coa_ixa = connp->conn_ixa;
1873	coas.coa_ipp = &connp->conn_xmit_ipp;
1874	coas.coa_ancillary = B_FALSE;
1875	coas.coa_changed = 0;
1876
1877	/*
1878	 * We assume that the optcom framework has checked for the set
1879	 * of levels and names that are supported, hence we don't worry
1880	 * about rejecting based on that.
1881	 * First check for ICMP specific handling, then pass to common routine.
1882	 */
1883	switch (level) {
1884	case IPPROTO_IP:
1885		/*
1886		 * Only allow IPv4 option processing on IPv4 sockets.
1887		 */
1888		if (connp->conn_family != AF_INET)
1889			return (-1);
1890
1891		switch (name) {
1892		case IP_OPTIONS:
1893		case T_IP_OPTIONS:
1894			/* Options are passed up with each packet */
1895			return (0);
1896		case IP_HDRINCL:
1897			mutex_enter(&connp->conn_lock);
1898			*i1 = (int)icmp->icmp_hdrincl;
1899			mutex_exit(&connp->conn_lock);
1900			return (sizeof (int));
1901		}
1902		break;
1903
1904	case IPPROTO_IPV6:
1905		/*
1906		 * Only allow IPv6 option processing on native IPv6 sockets.
1907		 */
1908		if (connp->conn_family != AF_INET6)
1909			return (-1);
1910
1911		switch (name) {
1912		case IPV6_CHECKSUM:
1913			/*
1914			 * Return offset or -1 if no checksum offset.
1915			 * Does not apply to IPPROTO_ICMPV6
1916			 */
1917			if (connp->conn_proto == IPPROTO_ICMPV6)
1918				return (-1);
1919
1920			mutex_enter(&connp->conn_lock);
1921			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1922				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1923			else
1924				*i1 = -1;
1925			mutex_exit(&connp->conn_lock);
1926			return (sizeof (int));
1927		}
1928		break;
1929
1930	case IPPROTO_ICMPV6:
1931		/*
1932		 * Only allow IPv6 option processing on native IPv6 sockets.
1933		 */
1934		if (connp->conn_family != AF_INET6)
1935			return (-1);
1936
1937		if (connp->conn_proto != IPPROTO_ICMPV6)
1938			return (-1);
1939
1940		switch (name) {
1941		case ICMP6_FILTER:
1942			mutex_enter(&connp->conn_lock);
1943			if (icmp->icmp_filter == NULL) {
1944				/* Make it look like "pass all" */
1945				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1946			} else {
1947				(void) bcopy(icmp->icmp_filter, ptr,
1948				    sizeof (icmp6_filter_t));
1949			}
1950			mutex_exit(&connp->conn_lock);
1951			return (sizeof (icmp6_filter_t));
1952		}
1953	}
1954	mutex_enter(&connp->conn_lock);
1955	retval = conn_opt_get(&coas, level, name, ptr);
1956	mutex_exit(&connp->conn_lock);
1957	return (retval);
1958}
1959
1960/*
1961 * This routine retrieves the current status of socket options.
1962 * It returns the size of the option retrieved, or -1.
1963 */
1964int
1965icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1966{
1967	conn_t		*connp = Q_TO_CONN(q);
1968	int		err;
1969
1970	err = icmp_opt_get(connp, level, name, ptr);
1971	return (err);
1972}
1973
1974/*
1975 * This routine sets socket options.
1976 */
1977int
1978icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1979    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1980{
1981	conn_t		*connp = coa->coa_connp;
1982	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1983	icmp_t		*icmp = connp->conn_icmp;
1984	icmp_stack_t	*is = icmp->icmp_is;
1985	int		*i1 = (int *)invalp;
1986	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1987	int		error;
1988
1989	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1990
1991	/*
1992	 * For fixed length options, no sanity check
1993	 * of passed in length is done. It is assumed *_optcom_req()
1994	 * routines do the right thing.
1995	 */
1996
1997	switch (level) {
1998	case SOL_SOCKET:
1999		switch (name) {
2000		case SO_PROTOTYPE:
2001			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2002			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2003			    secpolicy_net_rawaccess(cr) != 0) {
2004				return (EACCES);
2005			}
2006			if (checkonly)
2007				break;
2008
2009			mutex_enter(&connp->conn_lock);
2010			connp->conn_proto = *i1 & 0xFF;
2011			ixa->ixa_protocol = connp->conn_proto;
2012			if ((connp->conn_proto == IPPROTO_RAW ||
2013			    connp->conn_proto == IPPROTO_IGMP) &&
2014			    connp->conn_family == AF_INET) {
2015				icmp->icmp_hdrincl = 1;
2016				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2017			} else if (connp->conn_proto == IPPROTO_UDP ||
2018			    connp->conn_proto == IPPROTO_TCP ||
2019			    connp->conn_proto == IPPROTO_SCTP) {
2020				/* Used by test applications like psh */
2021				icmp->icmp_hdrincl = 0;
2022				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2023			} else {
2024				icmp->icmp_hdrincl = 0;
2025				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2026			}
2027
2028			if (connp->conn_family == AF_INET6 &&
2029			    connp->conn_proto == IPPROTO_ICMPV6) {
2030				/* Set offset for icmp6_cksum */
2031				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2032				ixa->ixa_raw_cksum_offset = 2;
2033			}
2034			if (icmp->icmp_filter != NULL &&
2035			    connp->conn_proto != IPPROTO_ICMPV6) {
2036				kmem_free(icmp->icmp_filter,
2037				    sizeof (icmp6_filter_t));
2038				icmp->icmp_filter = NULL;
2039			}
2040			mutex_exit(&connp->conn_lock);
2041
2042			coa->coa_changed |= COA_HEADER_CHANGED;
2043			/*
2044			 * For SCTP, we don't use icmp_bind_proto() for
2045			 * raw socket binding.
2046			 */
2047			if (connp->conn_proto == IPPROTO_SCTP)
2048				return (0);
2049
2050			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2051			return (0);
2052
2053		case SO_SNDBUF:
2054			if (*i1 > is->is_max_buf) {
2055				return (ENOBUFS);
2056			}
2057			break;
2058		case SO_RCVBUF:
2059			if (*i1 > is->is_max_buf) {
2060				return (ENOBUFS);
2061			}
2062			break;
2063		}
2064		break;
2065
2066	case IPPROTO_IP:
2067		/*
2068		 * Only allow IPv4 option processing on IPv4 sockets.
2069		 */
2070		if (connp->conn_family != AF_INET)
2071			return (EINVAL);
2072
2073		switch (name) {
2074		case IP_HDRINCL:
2075			if (!checkonly) {
2076				mutex_enter(&connp->conn_lock);
2077				icmp->icmp_hdrincl = onoff;
2078				if (onoff)
2079					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2080				else
2081					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2082				mutex_exit(&connp->conn_lock);
2083			}
2084			break;
2085		}
2086		break;
2087
2088	case IPPROTO_IPV6:
2089		if (connp->conn_family != AF_INET6)
2090			return (EINVAL);
2091
2092		switch (name) {
2093		case IPV6_CHECKSUM:
2094			/*
2095			 * Integer offset into the user data of where the
2096			 * checksum is located.
2097			 * Offset of -1 disables option.
2098			 * Does not apply to IPPROTO_ICMPV6.
2099			 */
2100			if (connp->conn_proto == IPPROTO_ICMPV6 ||
2101			    coa->coa_ancillary) {
2102				return (EINVAL);
2103			}
2104			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2105				/* Negative or not 16 bit aligned offset */
2106				return (EINVAL);
2107			}
2108			if (checkonly)
2109				break;
2110
2111			mutex_enter(&connp->conn_lock);
2112			if (*i1 == -1) {
2113				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2114				ixa->ixa_raw_cksum_offset = 0;
2115				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2116			} else {
2117				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2118				ixa->ixa_raw_cksum_offset = *i1;
2119				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2120			}
2121			mutex_exit(&connp->conn_lock);
2122			break;
2123		}
2124		break;
2125
2126	case IPPROTO_ICMPV6:
2127		/*
2128		 * Only allow IPv6 option processing on IPv6 sockets.
2129		 */
2130		if (connp->conn_family != AF_INET6)
2131			return (EINVAL);
2132		if (connp->conn_proto != IPPROTO_ICMPV6)
2133			return (EINVAL);
2134
2135		switch (name) {
2136		case ICMP6_FILTER:
2137			if (checkonly)
2138				break;
2139
2140			if ((inlen != 0) &&
2141			    (inlen != sizeof (icmp6_filter_t)))
2142				return (EINVAL);
2143
2144			mutex_enter(&connp->conn_lock);
2145			if (inlen == 0) {
2146				if (icmp->icmp_filter != NULL) {
2147					kmem_free(icmp->icmp_filter,
2148					    sizeof (icmp6_filter_t));
2149					icmp->icmp_filter = NULL;
2150				}
2151			} else {
2152				if (icmp->icmp_filter == NULL) {
2153					icmp->icmp_filter = kmem_alloc(
2154					    sizeof (icmp6_filter_t),
2155					    KM_NOSLEEP);
2156					if (icmp->icmp_filter == NULL) {
2157						mutex_exit(&connp->conn_lock);
2158						return (ENOBUFS);
2159					}
2160				}
2161				(void) bcopy(invalp, icmp->icmp_filter, inlen);
2162			}
2163			mutex_exit(&connp->conn_lock);
2164			break;
2165		}
2166		break;
2167	}
2168	error = conn_opt_set(coa, level, name, inlen, invalp,
2169	    checkonly, cr);
2170	return (error);
2171}
2172
2173/*
2174 * This routine sets socket options.
2175 */
2176int
2177icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2178    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2179    void *thisdg_attrs, cred_t *cr)
2180{
2181	icmp_t		*icmp = connp->conn_icmp;
2182	int		err;
2183	conn_opt_arg_t	coas, *coa;
2184	boolean_t	checkonly;
2185	icmp_stack_t	*is = icmp->icmp_is;
2186
2187	switch (optset_context) {
2188	case SETFN_OPTCOM_CHECKONLY:
2189		checkonly = B_TRUE;
2190		/*
2191		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2192		 * inlen != 0 implies value supplied and
2193		 *	we have to "pretend" to set it.
2194		 * inlen == 0 implies that there is no
2195		 *	value part in T_CHECK request and just validation
2196		 * done elsewhere should be enough, we just return here.
2197		 */
2198		if (inlen == 0) {
2199			*outlenp = 0;
2200			return (0);
2201		}
2202		break;
2203	case SETFN_OPTCOM_NEGOTIATE:
2204		checkonly = B_FALSE;
2205		break;
2206	case SETFN_UD_NEGOTIATE:
2207	case SETFN_CONN_NEGOTIATE:
2208		checkonly = B_FALSE;
2209		/*
2210		 * Negotiating local and "association-related" options
2211		 * through T_UNITDATA_REQ.
2212		 *
2213		 * Following routine can filter out ones we do not
2214		 * want to be "set" this way.
2215		 */
2216		if (!icmp_opt_allow_udr_set(level, name)) {
2217			*outlenp = 0;
2218			return (EINVAL);
2219		}
2220		break;
2221	default:
2222		/*
2223		 * We should never get here
2224		 */
2225		*outlenp = 0;
2226		return (EINVAL);
2227	}
2228
2229	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2230	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2231
2232	if (thisdg_attrs != NULL) {
2233		/* Options from T_UNITDATA_REQ */
2234		coa = (conn_opt_arg_t *)thisdg_attrs;
2235		ASSERT(coa->coa_connp == connp);
2236		ASSERT(coa->coa_ixa != NULL);
2237		ASSERT(coa->coa_ipp != NULL);
2238		ASSERT(coa->coa_ancillary);
2239	} else {
2240		coa = &coas;
2241		coas.coa_connp = connp;
2242		/* Get a reference on conn_ixa to prevent concurrent mods */
2243		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2244		if (coas.coa_ixa == NULL) {
2245			*outlenp = 0;
2246			return (ENOMEM);
2247		}
2248		coas.coa_ipp = &connp->conn_xmit_ipp;
2249		coas.coa_ancillary = B_FALSE;
2250		coas.coa_changed = 0;
2251	}
2252
2253	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2254	    cr, checkonly);
2255	if (err != 0) {
2256errout:
2257		if (!coa->coa_ancillary)
2258			ixa_refrele(coa->coa_ixa);
2259		*outlenp = 0;
2260		return (err);
2261	}
2262
2263	/*
2264	 * Common case of OK return with outval same as inval.
2265	 */
2266	if (invalp != outvalp) {
2267		/* don't trust bcopy for identical src/dst */
2268		(void) bcopy(invalp, outvalp, inlen);
2269	}
2270	*outlenp = inlen;
2271
2272	/*
2273	 * If this was not ancillary data, then we rebuild the headers,
2274	 * update the IRE/NCE, and IPsec as needed.
2275	 * Since the label depends on the destination we go through
2276	 * ip_set_destination first.
2277	 */
2278	if (coa->coa_ancillary) {
2279		return (0);
2280	}
2281
2282	if (coa->coa_changed & COA_ROUTE_CHANGED) {
2283		in6_addr_t saddr, faddr, nexthop;
2284		in_port_t fport;
2285
2286		/*
2287		 * We clear lastdst to make sure we pick up the change
2288		 * next time sending.
2289		 * If we are connected we re-cache the information.
2290		 * We ignore errors to preserve BSD behavior.
2291		 * Note that we don't redo IPsec policy lookup here
2292		 * since the final destination (or source) didn't change.
2293		 */
2294		mutex_enter(&connp->conn_lock);
2295		connp->conn_v6lastdst = ipv6_all_zeros;
2296
2297		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2298		    &connp->conn_faddr_v6, &nexthop);
2299		saddr = connp->conn_saddr_v6;
2300		faddr = connp->conn_faddr_v6;
2301		fport = connp->conn_fport;
2302		mutex_exit(&connp->conn_lock);
2303
2304		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2305		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2306			(void) ip_attr_connect(connp, coa->coa_ixa,
2307			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
2308			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2309		}
2310	}
2311
2312	ixa_refrele(coa->coa_ixa);
2313
2314	if (coa->coa_changed & COA_HEADER_CHANGED) {
2315		/*
2316		 * Rebuild the header template if we are connected.
2317		 * Otherwise clear conn_v6lastdst so we rebuild the header
2318		 * in the data path.
2319		 */
2320		mutex_enter(&connp->conn_lock);
2321		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2322		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2323			err = icmp_build_hdr_template(connp,
2324			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2325			    connp->conn_flowinfo);
2326			if (err != 0) {
2327				mutex_exit(&connp->conn_lock);
2328				return (err);
2329			}
2330		} else {
2331			connp->conn_v6lastdst = ipv6_all_zeros;
2332		}
2333		mutex_exit(&connp->conn_lock);
2334	}
2335	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2336		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2337		    connp->conn_rcvbuf);
2338	}
2339	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2340		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2341	}
2342	if (coa->coa_changed & COA_WROFF_CHANGED) {
2343		/* Increase wroff if needed */
2344		uint_t wroff;
2345
2346		mutex_enter(&connp->conn_lock);
2347		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2348		if (wroff > connp->conn_wroff) {
2349			connp->conn_wroff = wroff;
2350			mutex_exit(&connp->conn_lock);
2351			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2352		} else {
2353			mutex_exit(&connp->conn_lock);
2354		}
2355	}
2356	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2357		icmp_bind_proto(icmp);
2358	}
2359	return (err);
2360}
2361
2362/* This routine sets socket options. */
2363int
2364icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2365    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2366    void *thisdg_attrs, cred_t *cr)
2367{
2368	conn_t	*connp = Q_TO_CONN(q);
2369	int error;
2370
2371	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2372	    outlenp, outvalp, thisdg_attrs, cr);
2373	return (error);
2374}
2375
2376/*
2377 * Setup IP headers.
2378 *
2379 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2380 * but icmp_output_hdrincl restores ipha_protocol once we return.
2381 */
2382mblk_t *
2383icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2384    const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2385    mblk_t *data_mp, int *errorp)
2386{
2387	mblk_t		*mp;
2388	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
2389	uint_t		data_len;
2390	uint32_t	cksum;
2391
2392	data_len = msgdsize(data_mp);
2393	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2394	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2395	if (mp == NULL) {
2396		ASSERT(*errorp != 0);
2397		return (NULL);
2398	}
2399
2400	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2401
2402	/*
2403	 * If there was a routing option/header then conn_prepend_hdr
2404	 * has massaged it and placed the pseudo-header checksum difference
2405	 * in the cksum argument.
2406	 *
2407	 * Prepare for ICMPv6 checksum done in IP.
2408	 *
2409	 * We make it easy for IP to include our pseudo header
2410	 * by putting our length (and any routing header adjustment)
2411	 * in the ICMPv6 checksum field.
2412	 * The IP source, destination, and length have already been set by
2413	 * conn_prepend_hdr.
2414	 */
2415	cksum += data_len;
2416	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2417	ASSERT(cksum < 0x10000);
2418
2419	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2420		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2421
2422		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2423	} else {
2424		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2425		uint_t	cksum_offset = 0;
2426
2427		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2428
2429		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2430			if (connp->conn_proto == IPPROTO_ICMPV6) {
2431				cksum_offset = ixa->ixa_ip_hdr_length +
2432				    offsetof(icmp6_t, icmp6_cksum);
2433			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2434				cksum_offset = ixa->ixa_ip_hdr_length +
2435				    ixa->ixa_raw_cksum_offset;
2436			}
2437		}
2438		if (cksum_offset != 0) {
2439			uint16_t *ptr;
2440
2441			/* Make sure the checksum fits in the first mblk */
2442			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2443				mblk_t *mp1;
2444
2445				mp1 = msgpullup(mp,
2446				    cksum_offset + sizeof (short));
2447				freemsg(mp);
2448				if (mp1 == NULL) {
2449					*errorp = ENOMEM;
2450					return (NULL);
2451				}
2452				mp = mp1;
2453				ip6h = (ip6_t *)mp->b_rptr;
2454			}
2455			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2456			*ptr = htons(cksum);
2457		}
2458	}
2459
2460	/* Note that we don't try to update wroff due to ancillary data */
2461	return (mp);
2462}
2463
2464static int
2465icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2466    const in6_addr_t *v6dst, uint32_t flowinfo)
2467{
2468	int		error;
2469
2470	ASSERT(MUTEX_HELD(&connp->conn_lock));
2471	/*
2472	 * We clear lastdst to make sure we don't use the lastdst path
2473	 * next time sending since we might not have set v6dst yet.
2474	 */
2475	connp->conn_v6lastdst = ipv6_all_zeros;
2476
2477	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2478	if (error != 0)
2479		return (error);
2480
2481	/*
2482	 * Any routing header/option has been massaged. The checksum difference
2483	 * is stored in conn_sum.
2484	 */
2485	return (0);
2486}
2487
2488static mblk_t *
2489icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2490{
2491	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2492	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2493		/*
2494		 * fallback has started but messages have not been moved yet
2495		 */
2496		if (icmp->icmp_fallback_queue_head == NULL) {
2497			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2498			icmp->icmp_fallback_queue_head = mp;
2499			icmp->icmp_fallback_queue_tail = mp;
2500		} else {
2501			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2502			icmp->icmp_fallback_queue_tail->b_next = mp;
2503			icmp->icmp_fallback_queue_tail = mp;
2504		}
2505		return (NULL);
2506	} else {
2507		/*
2508		 * Fallback completed, let the caller putnext() the mblk.
2509		 */
2510		return (mp);
2511	}
2512}
2513
2514/*
2515 * Deliver data to ULP. In case we have a socket, and it's falling back to
2516 * TPI, then we'll queue the mp for later processing.
2517 */
2518static void
2519icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2520{
2521	if (IPCL_IS_NONSTR(connp)) {
2522		icmp_t *icmp = connp->conn_icmp;
2523		int error;
2524
2525		ASSERT(len == msgdsize(mp));
2526		if ((*connp->conn_upcalls->su_recv)
2527		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2528			mutex_enter(&icmp->icmp_recv_lock);
2529			if (error == ENOSPC) {
2530				/*
2531				 * let's confirm while holding the lock
2532				 */
2533				if ((*connp->conn_upcalls->su_recv)
2534				    (connp->conn_upper_handle, NULL, 0, 0,
2535				    &error, NULL) < 0) {
2536					ASSERT(error == ENOSPC);
2537					if (error == ENOSPC) {
2538						connp->conn_flow_cntrld =
2539						    B_TRUE;
2540					}
2541				}
2542				mutex_exit(&icmp->icmp_recv_lock);
2543			} else {
2544				ASSERT(error == EOPNOTSUPP);
2545				mp = icmp_queue_fallback(icmp, mp);
2546				mutex_exit(&icmp->icmp_recv_lock);
2547				if (mp != NULL)
2548					putnext(connp->conn_rq, mp);
2549			}
2550		}
2551		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2552	} else {
2553		putnext(connp->conn_rq, mp);
2554	}
2555}
2556
2557/*
2558 * This is the inbound data path.
2559 * IP has already pulled up the IP headers and verified alignment
2560 * etc.
2561 */
2562/* ARGSUSED2 */
2563static void
2564icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2565{
2566	conn_t			*connp = (conn_t *)arg1;
2567	struct T_unitdata_ind	*tudi;
2568	uchar_t			*rptr;		/* Pointer to IP header */
2569	int			ip_hdr_length;
2570	int			udi_size;	/* Size of T_unitdata_ind */
2571	int			pkt_len;
2572	icmp_t			*icmp;
2573	ip_pkt_t		ipps;
2574	ip6_t			*ip6h;
2575	mblk_t			*mp1;
2576	crb_t			recv_ancillary;
2577	icmp_stack_t		*is;
2578	sin_t			*sin;
2579	sin6_t			*sin6;
2580	ipha_t			*ipha;
2581
2582	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2583
2584	icmp = connp->conn_icmp;
2585	is = icmp->icmp_is;
2586	rptr = mp->b_rptr;
2587
2588	ASSERT(DB_TYPE(mp) == M_DATA);
2589	ASSERT(OK_32PTR(rptr));
2590	ASSERT(ira->ira_pktlen == msgdsize(mp));
2591	pkt_len = ira->ira_pktlen;
2592
2593	/*
2594	 * Get a snapshot of these and allow other threads to change
2595	 * them after that. We need the same recv_ancillary when determining
2596	 * the size as when adding the ancillary data items.
2597	 */
2598	mutex_enter(&connp->conn_lock);
2599	recv_ancillary = connp->conn_recv_ancillary;
2600	mutex_exit(&connp->conn_lock);
2601
2602	ip_hdr_length = ira->ira_ip_hdr_length;
2603	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */
2604
2605	/* Initialize regardless of IP version */
2606	ipps.ipp_fields = 0;
2607
2608	if (ira->ira_flags & IRAF_IS_IPV4) {
2609		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2610		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2611		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2612
2613		ipha = (ipha_t *)mp->b_rptr;
2614		if (recv_ancillary.crb_all != 0)
2615			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2616
2617		/*
2618		 * BSD for some reason adjusts ipha_length to exclude the
2619		 * IP header length. We do the same.
2620		 */
2621		if (is->is_bsd_compat) {
2622			ushort_t len;
2623
2624			len = ntohs(ipha->ipha_length);
2625			if (mp->b_datap->db_ref > 1) {
2626				/*
2627				 * Allocate a new IP header so that we can
2628				 * modify ipha_length.
2629				 */
2630				mblk_t	*mp1;
2631
2632				mp1 = allocb(ip_hdr_length, BPRI_MED);
2633				if (mp1 == NULL) {
2634					freemsg(mp);
2635					BUMP_MIB(&is->is_rawip_mib,
2636					    rawipInErrors);
2637					return;
2638				}
2639				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2640				mp->b_rptr = rptr + ip_hdr_length;
2641				rptr = mp1->b_rptr;
2642				ipha = (ipha_t *)rptr;
2643				mp1->b_cont = mp;
2644				mp1->b_wptr = rptr + ip_hdr_length;
2645				mp = mp1;
2646			}
2647			len -= ip_hdr_length;
2648			ipha->ipha_length = htons(len);
2649		}
2650
2651		/*
2652		 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
2653		 * sockets. This is ensured by icmp_bind and the IP fanout code.
2654		 */
2655		ASSERT(connp->conn_family == AF_INET);
2656
2657		/*
2658		 * This is the inbound data path.  Packets are passed upstream
2659		 * as T_UNITDATA_IND messages with full IPv4 headers still
2660		 * attached.
2661		 */
2662
2663		/*
2664		 * Normally only send up the source address.
2665		 * If any ancillary data items are wanted we add those.
2666		 */
2667		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2668		if (recv_ancillary.crb_all != 0) {
2669			udi_size += conn_recvancillary_size(connp,
2670			    recv_ancillary, ira, mp, &ipps);
2671		}
2672
2673		/* Allocate a message block for the T_UNITDATA_IND structure. */
2674		mp1 = allocb(udi_size, BPRI_MED);
2675		if (mp1 == NULL) {
2676			freemsg(mp);
2677			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2678			return;
2679		}
2680		mp1->b_cont = mp;
2681		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2682		mp1->b_datap->db_type = M_PROTO;
2683		mp1->b_wptr = (uchar_t *)tudi + udi_size;
2684		tudi->PRIM_type = T_UNITDATA_IND;
2685		tudi->SRC_length = sizeof (sin_t);
2686		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2687		sin = (sin_t *)&tudi[1];
2688		*sin = sin_null;
2689		sin->sin_family = AF_INET;
2690		sin->sin_addr.s_addr = ipha->ipha_src;
2691		*(uint32_t *)&sin->sin_zero[0] = 0;
2692		*(uint32_t *)&sin->sin_zero[4] = 0;
2693		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2694		    sizeof (sin_t);
2695		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2696		tudi->OPT_length = udi_size;
2697
2698		/*
2699		 * Add options if IP_RECVIF etc is set
2700		 */
2701		if (udi_size != 0) {
2702			conn_recvancillary_add(connp, recv_ancillary, ira,
2703			    &ipps, (uchar_t *)&sin[1], udi_size);
2704		}
2705		goto deliver;
2706	}
2707
2708	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2709	/*
2710	 * IPv6 packets can only be received by applications
2711	 * that are prepared to receive IPv6 addresses.
2712	 * The IP fanout must ensure this.
2713	 */
2714	ASSERT(connp->conn_family == AF_INET6);
2715
2716	/*
2717	 * Handle IPv6 packets. We don't pass up the IP headers with the
2718	 * payload for IPv6.
2719	 */
2720
2721	ip6h = (ip6_t *)rptr;
2722	if (recv_ancillary.crb_all != 0) {
2723		/*
2724		 * Call on ip_find_hdr_v6 which gets individual lenghts of
2725		 * extension headers (and pointers to them).
2726		 */
2727		uint8_t		nexthdr;
2728
2729		/* We don't care about the length or nextheader. */
2730		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2731
2732		/*
2733		 * We do not pass up hop-by-hop options or any other
2734		 * extension header as part of the packet. Applications
2735		 * that want to see them have to specify IPV6_RECV* socket
2736		 * options. And conn_recvancillary_size/add explicitly
2737		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2738		 *
2739		 * If we had multilevel ICMP sockets, then we'd want to
2740		 * modify conn_recvancillary_size/add to
2741		 * allow the user to see the label.
2742		 */
2743	}
2744
2745	/*
2746	 * Check a filter for ICMPv6 types if needed.
2747	 * Verify raw checksums if needed.
2748	 */
2749	mutex_enter(&connp->conn_lock);
2750	if (icmp->icmp_filter != NULL) {
2751		int type;
2752
2753		/* Assumes that IP has done the pullupmsg */
2754		type = mp->b_rptr[ip_hdr_length];
2755
2756		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2757		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2758			mutex_exit(&connp->conn_lock);
2759			freemsg(mp);
2760			return;
2761		}
2762	}
2763	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2764		/* Checksum */
2765		uint16_t	*up;
2766		uint32_t	sum;
2767		int		remlen;
2768
2769		up = (uint16_t *)&ip6h->ip6_src;
2770
2771		remlen = msgdsize(mp) - ip_hdr_length;
2772		sum = htons(connp->conn_proto + remlen)
2773		    + up[0] + up[1] + up[2] + up[3]
2774		    + up[4] + up[5] + up[6] + up[7]
2775		    + up[8] + up[9] + up[10] + up[11]
2776		    + up[12] + up[13] + up[14] + up[15];
2777		sum = (sum & 0xffff) + (sum >> 16);
2778		sum = IP_CSUM(mp, ip_hdr_length, sum);
2779		if (sum != 0) {
2780			/* IPv6 RAW checksum failed */
2781			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2782			mutex_exit(&connp->conn_lock);
2783			freemsg(mp);
2784			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2785			return;
2786		}
2787	}
2788	mutex_exit(&connp->conn_lock);
2789
2790	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2791
2792	if (recv_ancillary.crb_all != 0) {
2793		udi_size += conn_recvancillary_size(connp,
2794		    recv_ancillary, ira, mp, &ipps);
2795	}
2796
2797	mp1 = allocb(udi_size, BPRI_MED);
2798	if (mp1 == NULL) {
2799		freemsg(mp);
2800		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2801		return;
2802	}
2803	mp1->b_cont = mp;
2804	mp1->b_datap->db_type = M_PROTO;
2805	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2806	mp1->b_wptr = (uchar_t *)tudi + udi_size;
2807	tudi->PRIM_type = T_UNITDATA_IND;
2808	tudi->SRC_length = sizeof (sin6_t);
2809	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2810	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2811	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2812	tudi->OPT_length = udi_size;
2813	sin6 = (sin6_t *)&tudi[1];
2814	*sin6 = sin6_null;
2815	sin6->sin6_port = 0;
2816	sin6->sin6_family = AF_INET6;
2817
2818	sin6->sin6_addr = ip6h->ip6_src;
2819	/* No sin6_flowinfo per API */
2820	sin6->sin6_flowinfo = 0;
2821	/* For link-scope pass up scope id */
2822	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2823		sin6->sin6_scope_id = ira->ira_ruifindex;
2824	else
2825		sin6->sin6_scope_id = 0;
2826	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2827	    IPCL_ZONEID(connp), is->is_netstack);
2828
2829	if (udi_size != 0) {
2830		conn_recvancillary_add(connp, recv_ancillary, ira,
2831		    &ipps, (uchar_t *)&sin6[1], udi_size);
2832	}
2833
2834	/* Skip all the IPv6 headers per API */
2835	mp->b_rptr += ip_hdr_length;
2836	pkt_len -= ip_hdr_length;
2837
2838deliver:
2839	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2840	icmp_ulp_recv(connp, mp1, pkt_len);
2841}
2842
2843/*
2844 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2845 * information that can be changing beneath us.
2846 */
2847mblk_t *
2848icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2849{
2850	mblk_t			*mpdata;
2851	struct opthdr		*optp;
2852	conn_t			*connp = Q_TO_CONN(q);
2853	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
2854	mblk_t			*mp2ctl;
2855
2856	/*
2857	 * make a copy of the original message
2858	 */
2859	mp2ctl = copymsg(mpctl);
2860
2861	if (mpctl == NULL ||
2862	    (mpdata = mpctl->b_cont) == NULL) {
2863		freemsg(mpctl);
2864		freemsg(mp2ctl);
2865		return (0);
2866	}
2867
2868	/* fixed length structure for IPv4 and IPv6 counters */
2869	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2870	optp->level = EXPER_RAWIP;
2871	optp->name = 0;
2872	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2873	    sizeof (is->is_rawip_mib));
2874	optp->len = msgdsize(mpdata);
2875	qreply(q, mpctl);
2876
2877	return (mp2ctl);
2878}
2879
2880/*
2881 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2882 * TODO:  If this ever actually tries to set anything, it needs to be
2883 * to do the appropriate locking.
2884 */
2885/* ARGSUSED */
2886int
2887icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2888    uchar_t *ptr, int len)
2889{
2890	switch (level) {
2891	case EXPER_RAWIP:
2892		return (0);
2893	default:
2894		return (1);
2895	}
2896}
2897
2898/*
2899 * This routine creates a T_UDERROR_IND message and passes it upstream.
2900 * The address and options are copied from the T_UNITDATA_REQ message
2901 * passed in mp.  This message is freed.
2902 */
2903static void
2904icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2905{
2906	struct T_unitdata_req *tudr;
2907	mblk_t	*mp1;
2908	uchar_t *destaddr;
2909	t_scalar_t destlen;
2910	uchar_t	*optaddr;
2911	t_scalar_t optlen;
2912
2913	if ((mp->b_wptr < mp->b_rptr) ||
2914	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2915		goto done;
2916	}
2917	tudr = (struct T_unitdata_req *)mp->b_rptr;
2918	destaddr = mp->b_rptr + tudr->DEST_offset;
2919	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2920	    destaddr + tudr->DEST_length < mp->b_rptr ||
2921	    destaddr + tudr->DEST_length > mp->b_wptr) {
2922		goto done;
2923	}
2924	optaddr = mp->b_rptr + tudr->OPT_offset;
2925	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2926	    optaddr + tudr->OPT_length < mp->b_rptr ||
2927	    optaddr + tudr->OPT_length > mp->b_wptr) {
2928		goto done;
2929	}
2930	destlen = tudr->DEST_length;
2931	optlen = tudr->OPT_length;
2932
2933	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2934	    (char *)optaddr, optlen, err);
2935	if (mp1 != NULL)
2936		qreply(q, mp1);
2937
2938done:
2939	freemsg(mp);
2940}
2941
2942static int
2943rawip_do_unbind(conn_t *connp)
2944{
2945	icmp_t	*icmp = connp->conn_icmp;
2946
2947	mutex_enter(&connp->conn_lock);
2948	/* If a bind has not been done, we can't unbind. */
2949	if (icmp->icmp_state == TS_UNBND) {
2950		mutex_exit(&connp->conn_lock);
2951		return (-TOUTSTATE);
2952	}
2953	connp->conn_saddr_v6 = ipv6_all_zeros;
2954	connp->conn_bound_addr_v6 = ipv6_all_zeros;
2955	connp->conn_laddr_v6 = ipv6_all_zeros;
2956	connp->conn_mcbc_bind = B_FALSE;
2957	connp->conn_lport = 0;
2958	connp->conn_fport = 0;
2959	/* In case we were also connected */
2960	connp->conn_faddr_v6 = ipv6_all_zeros;
2961	connp->conn_v6lastdst = ipv6_all_zeros;
2962
2963	icmp->icmp_state = TS_UNBND;
2964
2965	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2966	    &connp->conn_faddr_v6, connp->conn_flowinfo);
2967	mutex_exit(&connp->conn_lock);
2968
2969	ip_unbind(connp);
2970	return (0);
2971}
2972
2973/*
2974 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2975 * After some error checking, the message is passed downstream to ip.
2976 */
2977static void
2978icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2979{
2980	conn_t	*connp = Q_TO_CONN(q);
2981	int	error;
2982
2983	ASSERT(mp->b_cont == NULL);
2984	error = rawip_do_unbind(connp);
2985	if (error) {
2986		if (error < 0) {
2987			icmp_err_ack(q, mp, -error, 0);
2988		} else {
2989			icmp_err_ack(q, mp, 0, error);
2990		}
2991		return;
2992	}
2993
2994	/*
2995	 * Convert mp into a T_OK_ACK
2996	 */
2997
2998	mp = mi_tpi_ok_ack_alloc(mp);
2999
3000	/*
3001	 * should not happen in practice... T_OK_ACK is smaller than the
3002	 * original message.
3003	 */
3004	ASSERT(mp != NULL);
3005	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
3006	qreply(q, mp);
3007}
3008
3009/*
3010 * Process IPv4 packets that already include an IP header.
3011 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
3012 * IPPROTO_IGMP).
3013 * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3014 *
3015 * The packet is assumed to have a base (20 byte) IP header followed
3016 * by the upper-layer protocol. We include any IP_OPTIONS including a
3017 * CIPSO label but otherwise preserve the base IP header.
3018 */
3019static int
3020icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3021{
3022	icmp_t		*icmp = connp->conn_icmp;
3023	icmp_stack_t	*is = icmp->icmp_is;
3024	ipha_t		iphas;
3025	ipha_t		*ipha;
3026	int		ip_hdr_length;
3027	int		tp_hdr_len;
3028	ip_xmit_attr_t	*ixa;
3029	ip_pkt_t	*ipp;
3030	in6_addr_t	v6src;
3031	in6_addr_t	v6dst;
3032	in6_addr_t	v6nexthop;
3033	int		error;
3034	boolean_t	do_ipsec;
3035
3036	/*
3037	 * We need an exclusive copy of conn_ixa since the included IP
3038	 * header could have any destination.
3039	 * That copy has no pointers hence we
3040	 * need to set them up once we've parsed the ancillary data.
3041	 */
3042	ixa = conn_get_ixa_exclusive(connp);
3043	if (ixa == NULL) {
3044		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3045		freemsg(mp);
3046		return (ENOMEM);
3047	}
3048	ASSERT(cr != NULL);
3049	/*
3050	 * Caller has a reference on cr; from db_credp or because we
3051	 * are running in process context.
3052	 */
3053	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3054	ixa->ixa_cred = cr;
3055	ixa->ixa_cpid = pid;
3056	if (is_system_labeled()) {
3057		/* We need to restart with a label based on the cred */
3058		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3059	}
3060
3061	/* In case previous destination was multicast or multirt */
3062	ip_attr_newdst(ixa);
3063
3064	/* Get a copy of conn_xmit_ipp since the TX label might change it */
3065	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3066	if (ipp == NULL) {
3067		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3068		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3069		ixa->ixa_cpid = connp->conn_cpid;
3070		ixa_refrele(ixa);
3071		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3072		freemsg(mp);
3073		return (ENOMEM);
3074	}
3075	mutex_enter(&connp->conn_lock);
3076	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3077	mutex_exit(&connp->conn_lock);
3078	if (error != 0) {
3079		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3080		freemsg(mp);
3081		goto done;
3082	}
3083
3084	/* Sanity check length of packet */
3085	ipha = (ipha_t *)mp->b_rptr;
3086
3087	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3088	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3089		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3090			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3091			freemsg(mp);
3092			goto done;
3093		}
3094		ipha = (ipha_t *)mp->b_rptr;
3095	}
3096	ipha->ipha_version_and_hdr_length =
3097	    (IP_VERSION<<4) | (ip_hdr_length>>2);
3098
3099	/*
3100	 * We set IXAF_DONTFRAG if the application set DF which makes
3101	 * IP not fragment.
3102	 */
3103	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3104	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3105		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3106	else
3107		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3108
3109	/* Even for multicast and broadcast we honor the apps ttl */
3110	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3111
3112	/*
3113	 * No source verification for non-local addresses
3114	 */
3115	if (ipha->ipha_src != INADDR_ANY &&
3116	    ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3117	    is->is_netstack->netstack_ip, B_FALSE)
3118	    != IPVL_UNICAST_UP) {
3119		ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3120	}
3121
3122	if (ipha->ipha_dst == INADDR_ANY)
3123		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3124
3125	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3126	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3127
3128	/* Defer IPsec if it might need to look at ICMP type/code */
3129	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3130	ixa->ixa_flags |= IXAF_IS_IPV4;
3131
3132	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3133	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3134	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3135	    (do_ipsec ? IPDF_IPSEC : 0));
3136	switch (error) {
3137	case 0:
3138		break;
3139	case EADDRNOTAVAIL:
3140		/*
3141		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3142		 * Don't have the application see that errno
3143		 */
3144		error = ENETUNREACH;
3145		goto failed;
3146	case ENETDOWN:
3147		/*
3148		 * Have !ipif_addr_ready address; drop packet silently
3149		 * until we can get applications to not send until we
3150		 * are ready.
3151		 */
3152		error = 0;
3153		goto failed;
3154	case EHOSTUNREACH:
3155	case ENETUNREACH:
3156		if (ixa->ixa_ire != NULL) {
3157			/*
3158			 * Let conn_ip_output/ire_send_noroute return
3159			 * the error and send any local ICMP error.
3160			 */
3161			error = 0;
3162			break;
3163		}
3164		/* FALLTHRU */
3165	default:
3166	failed:
3167		freemsg(mp);
3168		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3169		goto done;
3170	}
3171	if (ipha->ipha_src == INADDR_ANY)
3172		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3173
3174	/*
3175	 * We might be going to a different destination than last time,
3176	 * thus check that TX allows the communication and compute any
3177	 * needed label.
3178	 *
3179	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3180	 * don't have to worry about concurrent threads.
3181	 */
3182	if (is_system_labeled()) {
3183		/*
3184		 * Check whether Trusted Solaris policy allows communication
3185		 * with this host, and pretend that the destination is
3186		 * unreachable if not.
3187		 * Compute any needed label and place it in ipp_label_v4/v6.
3188		 *
3189		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3190		 * ipp_label_v4/v6 to form the packet.
3191		 *
3192		 * Tsol note: We have ipp structure local to this thread so
3193		 * no locking is needed.
3194		 */
3195		error = conn_update_label(connp, ixa, &v6dst, ipp);
3196		if (error != 0) {
3197			freemsg(mp);
3198			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3199			goto done;
3200		}
3201	}
3202
3203	/*
3204	 * Save away a copy of the IPv4 header the application passed down
3205	 * and then prepend an IPv4 header complete with any IP options
3206	 * including label.
3207	 * We need a struct copy since icmp_prepend_hdr will reuse the available
3208	 * space in the mblk.
3209	 */
3210	iphas = *ipha;
3211	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3212
3213	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3214	if (mp == NULL) {
3215		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3216		ASSERT(error != 0);
3217		goto done;
3218	}
3219	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3220		error = EMSGSIZE;
3221		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3222		freemsg(mp);
3223		goto done;
3224	}
3225	/* Restore key parts of the header that the application passed down */
3226	ipha = (ipha_t *)mp->b_rptr;
3227	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3228	ipha->ipha_ident = iphas.ipha_ident;
3229	ipha->ipha_fragment_offset_and_flags =
3230	    iphas.ipha_fragment_offset_and_flags;
3231	ipha->ipha_ttl = iphas.ipha_ttl;
3232	ipha->ipha_protocol = iphas.ipha_protocol;
3233	ipha->ipha_src = iphas.ipha_src;
3234	ipha->ipha_dst = iphas.ipha_dst;
3235
3236	ixa->ixa_protocol = ipha->ipha_protocol;
3237
3238	/*
3239	 * Make sure that the IP header plus any transport header that is
3240	 * checksumed by ip_output is in the first mblk. (ip_output assumes
3241	 * that at least the checksum field is in the first mblk.)
3242	 */
3243	switch (ipha->ipha_protocol) {
3244	case IPPROTO_UDP:
3245		tp_hdr_len = 8;
3246		break;
3247	case IPPROTO_TCP:
3248		tp_hdr_len = 20;
3249		break;
3250	default:
3251		tp_hdr_len = 0;
3252		break;
3253	}
3254	ip_hdr_length = IPH_HDR_LENGTH(ipha);
3255	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3256		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3257			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3258			if (mp->b_cont == NULL)
3259				error = EINVAL;
3260			else
3261				error = ENOMEM;
3262			freemsg(mp);
3263			goto done;
3264		}
3265	}
3266
3267	if (!do_ipsec) {
3268		/* Policy might differ for different ICMP type/code */
3269		if (ixa->ixa_ipsec_policy != NULL) {
3270			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3271			ixa->ixa_ipsec_policy = NULL;
3272			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3273		}
3274		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3275		if (mp == NULL) {
3276			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3277			error = EHOSTUNREACH;	/* IPsec policy failure */
3278			goto done;
3279		}
3280	}
3281
3282	/* We're done.  Pass the packet to ip. */
3283	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3284
3285	error = conn_ip_output(mp, ixa);
3286	/* No rawipOutErrors if an error since IP increases its error counter */
3287	switch (error) {
3288	case 0:
3289		break;
3290	case EWOULDBLOCK:
3291		(void) ixa_check_drain_insert(connp, ixa);
3292		error = 0;
3293		break;
3294	case EADDRNOTAVAIL:
3295		/*
3296		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3297		 * Don't have the application see that errno
3298		 */
3299		error = ENETUNREACH;
3300		break;
3301	}
3302done:
3303	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3304	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3305	ixa->ixa_cpid = connp->conn_cpid;
3306	ixa_refrele(ixa);
3307	ip_pkt_free(ipp);
3308	kmem_free(ipp, sizeof (*ipp));
3309	return (error);
3310}
3311
3312static mblk_t *
3313icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3314{
3315	ipha_t	*ipha = NULL;
3316	ip6_t	*ip6h = NULL;
3317
3318	if (ixa->ixa_flags & IXAF_IS_IPV4)
3319		ipha = (ipha_t *)mp->b_rptr;
3320	else
3321		ip6h = (ip6_t *)mp->b_rptr;
3322
3323	if (ixa->ixa_ipsec_policy != NULL) {
3324		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3325		ixa->ixa_ipsec_policy = NULL;
3326		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3327	}
3328	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3329}
3330
/*
 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
 * the TPI options, otherwise we take them from msg_control.
 * The destination comes from sin if it is set, else from sin6 if it is
 * set. If neither sin nor sin6 is set it is a connected socket and we use
 * conn_faddr.
 * Always consumes mp; never consumes tudr_mp.
 *
 * Returns 0 or an errno value:
 *	ENOMEM		no exclusive ixa or no memory for the ip_pkt_t copy
 *	EADDRNOTAVAIL	ip_srcid_find_id() rejected sin6's source id
 *	ENETUNREACH	reported in place of EADDRNOTAVAIL from
 *			ip_attr_connect() or conn_ip_output()
 *	EMSGSIZE	ixa_pktlen exceeds IP_MAXPACKET
 *	EHOSTUNREACH	icmp_output_attach_policy() returned NULL
 * Other errors from option parsing, ip_attr_connect(), conn_update_label(),
 * icmp_prepend_hdr() and conn_ip_output() are passed through, except that
 * ENETDOWN from ip_attr_connect() and EWOULDBLOCK from conn_ip_output()
 * are reported as 0.
 */
static int
icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
    mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		error;
	ip_xmit_attr_t	*ixa;
	ip_pkt_t	*ipp;
	in6_addr_t	v6src;
	in6_addr_t	v6dst;
	in6_addr_t	v6nexthop;
	in_port_t	dstport;
	uint32_t	flowinfo;
	int		is_absreq_failure = 0;
	conn_opt_arg_t	coas, *coa;

	ASSERT(tudr_mp != NULL || msg != NULL);

	/*
	 * Get ixa before checking state to handle a disconnect race.
	 *
	 * We need an exclusive copy of conn_ixa since the ancillary data
	 * options might modify it. That copy has no pointers hence we
	 * need to set them up once we've parsed the ancillary data.
	 */
	ixa = conn_get_ixa_exclusive(connp);
	if (ixa == NULL) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return (ENOMEM);
	}
	ASSERT(cr != NULL);
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	/* Send with the caller's cred/pid; both are put back at "done". */
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;
	if (is_system_labeled()) {
		/* We need to restart with a label based on the cred */
		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
	}

	/* In case previous destination was multicast or multirt */
	ip_attr_newdst(ixa);

	/* Get a copy of conn_xmit_ipp since the options might change it */
	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
	if (ipp == NULL) {
		/* Can't use "done" here since it would free a NULL ipp. */
		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
		ixa->ixa_cred = connp->conn_cred;	/* Restore */
		ixa->ixa_cpid = connp->conn_cpid;
		ixa_refrele(ixa);
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return (ENOMEM);
	}
	mutex_enter(&connp->conn_lock);
	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
	mutex_exit(&connp->conn_lock);
	if (error != 0) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		goto done;
	}

	/*
	 * Parse the options and update ixa and ipp as a result.
	 */

	coa = &coas;
	coa->coa_connp = connp;
	coa->coa_ixa = ixa;
	coa->coa_ipp = ipp;
	coa->coa_ancillary = B_TRUE;
	coa->coa_changed = 0;

	if (msg != NULL) {
		error = process_auxiliary_options(connp, msg->msg_control,
		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
	} else {
		struct T_unitdata_req *tudr;

		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
		    coa, &is_absreq_failure);
	}
	if (error != 0) {
		/*
		 * Note: No special action needed in this
		 * module for "is_absreq_failure"
		 */
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		goto done;
	}
	ASSERT(is_absreq_failure == 0);

	mutex_enter(&connp->conn_lock);
	/*
	 * If laddr is unspecified then we look at sin6_src_id.
	 * We will give precedence to a source address set with IPV6_PKTINFO
	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
	 * want ip_attr_connect to select a source (since it can fail) when
	 * IPV6_PKTINFO is specified.
	 * If this doesn't result in a source address then we get a source
	 * from ip_attr_connect() below.
	 */
	v6src = connp->conn_saddr_v6;
	if (sin != NULL) {
		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
		dstport = sin->sin_port;
		flowinfo = 0;
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		ixa->ixa_flags |= IXAF_IS_IPV4;
	} else if (sin6 != NULL) {
		boolean_t v4mapped;
		uint_t srcid;

		v6dst = sin6->sin6_addr;
		dstport = sin6->sin6_port;
		flowinfo = sin6->sin6_flowinfo;
		srcid = sin6->__sin6_src_id;
		/* The scope id only counts for a link-scope destination. */
		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
			ixa->ixa_scopeid = sin6->sin6_scope_id;
			ixa->ixa_flags |= IXAF_SCOPEID_SET;
		} else {
			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		}
		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
		if (v4mapped)
			ixa->ixa_flags |= IXAF_IS_IPV4;
		else
			ixa->ixa_flags &= ~IXAF_IS_IPV4;
		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    v4mapped, connp->conn_netstack)) {
				/* Mismatched v4mapped/v6 specified by srcid. */
				mutex_exit(&connp->conn_lock);
				error = EADDRNOTAVAIL;
				goto failed;	/* Does freemsg() and mib. */
			}
		}
	} else {
		/* Connected case */
		dstport = connp->conn_fport;
		v6dst = connp->conn_faddr_v6;
		flowinfo = connp->conn_flowinfo;
	}
	mutex_exit(&connp->conn_lock);
	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
	if (ipp->ipp_fields & IPPF_ADDR) {
		/* Only take it if its family matches the one we send with. */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		} else {
			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		}
	}
	/*
	 * Allow source not assigned to the system
	 * only if it is not a local address
	 */
	if (!V6_OR_V4_INADDR_ANY(v6src)) {
		ip_laddr_t laddr_type;

		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			ipaddr_t v4src;

			IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
			laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
			    is->is_netstack->netstack_ip, B_FALSE);
		} else {
			laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
			    is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
		}
		if (laddr_type != IPVL_UNICAST_UP)
			ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
	}

	/*
	 * IPDF_IPSEC is not passed here; the policy is attached per packet
	 * by icmp_output_attach_policy() further down.
	 */
	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);

	switch (error) {
	case 0:
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		goto failed;
	case ENETDOWN:
		/*
		 * Have !ipif_addr_ready address; drop packet silently
		 * until we can get applications to not send until we
		 * are ready.
		 */
		error = 0;
		goto failed;
	case EHOSTUNREACH:
	case ENETUNREACH:
		if (ixa->ixa_ire != NULL) {
			/*
			 * Let conn_ip_output/ire_send_noroute return
			 * the error and send any local ICMP error.
			 */
			error = 0;
			break;
		}
		/* FALLTHRU */
	default:
	failed:
		/* Also the target of the srcid failure above. */
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		goto done;
	}

	/*
	 * We might be going to a different destination than last time,
	 * thus check that TX allows the communication and compute any
	 * needed label.
	 *
	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
	 * don't have to worry about concurrent threads.
	 */
	if (is_system_labeled()) {
		/*
		 * Check whether Trusted Solaris policy allows communication
		 * with this host, and pretend that the destination is
		 * unreachable if not.
		 * Compute any needed label and place it in ipp_label_v4/v6.
		 *
		 * Later conn_build_hdr_template/conn_prepend_hdr takes
		 * ipp_label_v4/v6 to form the packet.
		 *
		 * Tsol note: We have ipp structure local to this thread so
		 * no locking is needed.
		 */
		error = conn_update_label(connp, ixa, &v6dst, ipp);
		if (error != 0) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			goto done;
		}
	}
	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
	    &error);
	if (mp == NULL) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		ASSERT(error != 0);
		goto done;
	}
	if (ixa->ixa_pktlen > IP_MAXPACKET) {
		error = EMSGSIZE;
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		goto done;
	}

	/* Policy might differ for different ICMP type/code */
	mp = icmp_output_attach_policy(mp, connp, ixa);
	if (mp == NULL) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		error = EHOSTUNREACH;	/* IPsec policy failure */
		goto done;
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);

	error = conn_ip_output(mp, ixa);
	/*
	 * IXAF_VERIFY_SOURCE may have been cleared above for a source that
	 * did not verify as IPVL_UNICAST_UP; set it again unless
	 * conn_unspec_src is set.
	 */
	if (!connp->conn_unspec_src)
		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
done:
	/* Common exit: undo the cred/pid override and release ixa and ipp. */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	ip_pkt_free(ipp);
	kmem_free(ipp, sizeof (*ipp));
	return (error);
}
3653
3654/*
3655 * Handle sending an M_DATA for a connected socket.
3656 * Handles both IPv4 and IPv6.
3657 */
3658int
3659icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3660{
3661	icmp_t		*icmp = connp->conn_icmp;
3662	icmp_stack_t	*is = icmp->icmp_is;
3663	int		error;
3664	ip_xmit_attr_t	*ixa;
3665	boolean_t	do_ipsec;
3666
3667	/*
3668	 * If no other thread is using conn_ixa this just gets a reference to
3669	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3670	 */
3671	ixa = conn_get_ixa(connp, B_FALSE);
3672	if (ixa == NULL) {
3673		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3674		freemsg(mp);
3675		return (ENOMEM);
3676	}
3677
3678	ASSERT(cr != NULL);
3679	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3680	ixa->ixa_cred = cr;
3681	ixa->ixa_cpid = pid;
3682
3683	/* Defer IPsec if it might need to look at ICMP type/code */
3684	switch (ixa->ixa_protocol) {
3685	case IPPROTO_ICMP:
3686	case IPPROTO_ICMPV6:
3687		do_ipsec = B_FALSE;
3688		break;
3689	default:
3690		do_ipsec = B_TRUE;
3691	}
3692
3693	mutex_enter(&connp->conn_lock);
3694	mp = icmp_prepend_header_template(connp, ixa, mp,
3695	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3696
3697	if (mp == NULL) {
3698		ASSERT(error != 0);
3699		mutex_exit(&connp->conn_lock);
3700		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3701		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3702		ixa->ixa_cpid = connp->conn_cpid;
3703		ixa_refrele(ixa);
3704		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3705		freemsg(mp);
3706		return (error);
3707	}
3708
3709	if (!do_ipsec) {
3710		/* Policy might differ for different ICMP type/code */
3711		mp = icmp_output_attach_policy(mp, connp, ixa);
3712		if (mp == NULL) {
3713			mutex_exit(&connp->conn_lock);
3714			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3715			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3716			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3717			ixa->ixa_cpid = connp->conn_cpid;
3718			ixa_refrele(ixa);
3719			return (EHOSTUNREACH);	/* IPsec policy failure */
3720		}
3721	}
3722
3723	/*
3724	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3725	 * safe copy, then we need to fill in any pointers in it.
3726	 */
3727	if (ixa->ixa_ire == NULL) {
3728		in6_addr_t	faddr, saddr;
3729		in6_addr_t	nexthop;
3730		in_port_t	fport;
3731
3732		saddr = connp->conn_saddr_v6;
3733		faddr = connp->conn_faddr_v6;
3734		fport = connp->conn_fport;
3735		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3736		mutex_exit(&connp->conn_lock);
3737
3738		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3739		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3740		    (do_ipsec ? IPDF_IPSEC : 0));
3741		switch (error) {
3742		case 0:
3743			break;
3744		case EADDRNOTAVAIL:
3745			/*
3746			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3747			 * Don't have the application see that errno
3748			 */
3749			error = ENETUNREACH;
3750			goto failed;
3751		case ENETDOWN:
3752			/*
3753			 * Have !ipif_addr_ready address; drop packet silently
3754			 * until we can get applications to not send until we
3755			 * are ready.
3756			 */
3757			error = 0;
3758			goto failed;
3759		case EHOSTUNREACH:
3760		case ENETUNREACH:
3761			if (ixa->ixa_ire != NULL) {
3762				/*
3763				 * Let conn_ip_output/ire_send_noroute return
3764				 * the error and send any local ICMP error.
3765				 */
3766				error = 0;
3767				break;
3768			}
3769			/* FALLTHRU */
3770		default:
3771		failed:
3772			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3773			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3774			ixa->ixa_cpid = connp->conn_cpid;
3775			ixa_refrele(ixa);
3776			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3777			freemsg(mp);
3778			return (error);
3779		}
3780	} else {
3781		/* Done with conn_t */
3782		mutex_exit(&connp->conn_lock);
3783	}
3784
3785	/* We're done.  Pass the packet to ip. */
3786	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3787
3788	error = conn_ip_output(mp, ixa);
3789	/* No rawipOutErrors if an error since IP increases its error counter */
3790	switch (error) {
3791	case 0:
3792		break;
3793	case EWOULDBLOCK:
3794		(void) ixa_check_drain_insert(connp, ixa);
3795		error = 0;
3796		break;
3797	case EADDRNOTAVAIL:
3798		/*
3799		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3800		 * Don't have the application see that errno
3801		 */
3802		error = ENETUNREACH;
3803		break;
3804	}
3805	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3806	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3807	ixa->ixa_cpid = connp->conn_cpid;
3808	ixa_refrele(ixa);
3809	return (error);
3810}
3811
3812/*
3813 * Handle sending an M_DATA to the last destination.
3814 * Handles both IPv4 and IPv6.
3815 *
3816 * NOTE: The caller must hold conn_lock and we drop it here.
3817 */
3818int
3819icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3820    ip_xmit_attr_t *ixa)
3821{
3822	icmp_t		*icmp = connp->conn_icmp;
3823	icmp_stack_t	*is = icmp->icmp_is;
3824	int		error;
3825	boolean_t	do_ipsec;
3826
3827	ASSERT(MUTEX_HELD(&connp->conn_lock));
3828	ASSERT(ixa != NULL);
3829
3830	ASSERT(cr != NULL);
3831	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3832	ixa->ixa_cred = cr;
3833	ixa->ixa_cpid = pid;
3834
3835	/* Defer IPsec if it might need to look at ICMP type/code */
3836	switch (ixa->ixa_protocol) {
3837	case IPPROTO_ICMP:
3838	case IPPROTO_ICMPV6:
3839		do_ipsec = B_FALSE;
3840		break;
3841	default:
3842		do_ipsec = B_TRUE;
3843	}
3844
3845
3846	mp = icmp_prepend_header_template(connp, ixa, mp,
3847	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3848
3849	if (mp == NULL) {
3850		ASSERT(error != 0);
3851		mutex_exit(&connp->conn_lock);
3852		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3853		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3854		ixa->ixa_cpid = connp->conn_cpid;
3855		ixa_refrele(ixa);
3856		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3857		freemsg(mp);
3858		return (error);
3859	}
3860
3861	if (!do_ipsec) {
3862		/* Policy might differ for different ICMP type/code */
3863		mp = icmp_output_attach_policy(mp, connp, ixa);
3864		if (mp == NULL) {
3865			mutex_exit(&connp->conn_lock);
3866			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3867			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3868			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3869			ixa->ixa_cpid = connp->conn_cpid;
3870			ixa_refrele(ixa);
3871			return (EHOSTUNREACH);	/* IPsec policy failure */
3872		}
3873	}
3874
3875	/*
3876	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3877	 * safe copy, then we need to fill in any pointers in it.
3878	 */
3879	if (ixa->ixa_ire == NULL) {
3880		in6_addr_t	lastdst, lastsrc;
3881		in6_addr_t	nexthop;
3882		in_port_t	lastport;
3883
3884		lastsrc = connp->conn_v6lastsrc;
3885		lastdst = connp->conn_v6lastdst;
3886		lastport = connp->conn_lastdstport;
3887		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3888		mutex_exit(&connp->conn_lock);
3889
3890		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3891		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3892		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3893		switch (error) {
3894		case 0:
3895			break;
3896		case EADDRNOTAVAIL:
3897			/*
3898			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3899			 * Don't have the application see that errno
3900			 */
3901			error = ENETUNREACH;
3902			goto failed;
3903		case ENETDOWN:
3904			/*
3905			 * Have !ipif_addr_ready address; drop packet silently
3906			 * until we can get applications to not send until we
3907			 * are ready.
3908			 */
3909			error = 0;
3910			goto failed;
3911		case EHOSTUNREACH:
3912		case ENETUNREACH:
3913			if (ixa->ixa_ire != NULL) {
3914				/*
3915				 * Let conn_ip_output/ire_send_noroute return
3916				 * the error and send any local ICMP error.
3917				 */
3918				error = 0;
3919				break;
3920			}
3921			/* FALLTHRU */
3922		default:
3923		failed:
3924			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3925			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3926			ixa->ixa_cpid = connp->conn_cpid;
3927			ixa_refrele(ixa);
3928			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3929			freemsg(mp);
3930			return (error);
3931		}
3932	} else {
3933		/* Done with conn_t */
3934		mutex_exit(&connp->conn_lock);
3935	}
3936
3937	/* We're done.  Pass the packet to ip. */
3938	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3939	error = conn_ip_output(mp, ixa);
3940	/* No rawipOutErrors if an error since IP increases its error counter */
3941	switch (error) {
3942	case 0:
3943		break;
3944	case EWOULDBLOCK:
3945		(void) ixa_check_drain_insert(connp, ixa);
3946		error = 0;
3947		break;
3948	case EADDRNOTAVAIL:
3949		/*
3950		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3951		 * Don't have the application see that errno
3952		 */
3953		error = ENETUNREACH;
3954		/* FALLTHRU */
3955	default:
3956		mutex_enter(&connp->conn_lock);
3957		/*
3958		 * Clear the source and v6lastdst so we call ip_attr_connect
3959		 * for the next packet and try to pick a better source.
3960		 */
3961		if (connp->conn_mcbc_bind)
3962			connp->conn_saddr_v6 = ipv6_all_zeros;
3963		else
3964			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3965		connp->conn_v6lastdst = ipv6_all_zeros;
3966		mutex_exit(&connp->conn_lock);
3967		break;
3968	}
3969	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3970	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3971	ixa->ixa_cpid = connp->conn_cpid;
3972	ixa_refrele(ixa);
3973	return (error);
3974}
3975
3976
3977/*
3978 * Prepend the header template and then fill in the source and
3979 * flowinfo. The caller needs to handle the destination address since
3980 * it's setting is different if rthdr or source route.
3981 *
3982 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3983 * When it returns NULL it sets errorp.
3984 */
3985static mblk_t *
3986icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3987    const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3988{
3989	icmp_t		*icmp = connp->conn_icmp;
3990	icmp_stack_t	*is = icmp->icmp_is;
3991	uint_t		pktlen;
3992	uint_t		copylen;
3993	uint8_t		*iph;
3994	uint_t		ip_hdr_length;
3995	uint32_t	cksum;
3996	ip_pkt_t	*ipp;
3997
3998	ASSERT(MUTEX_HELD(&connp->conn_lock));
3999
4000	/*
4001	 * Copy the header template.
4002	 */
4003	copylen = connp->conn_ht_iphc_len;
4004	pktlen = copylen + msgdsize(mp);
4005	if (pktlen > IP_MAXPACKET) {
4006		freemsg(mp);
4007		*errorp = EMSGSIZE;
4008		return (NULL);
4009	}
4010	ixa->ixa_pktlen = pktlen;
4011
4012	/* check/fix buffer config, setup pointers into it */
4013	iph = mp->b_rptr - copylen;
4014	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
4015		mblk_t *mp1;
4016
4017		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
4018		if (mp1 == NULL) {
4019			freemsg(mp);
4020			*errorp = ENOMEM;
4021			return (NULL);
4022		}
4023		mp1->b_wptr = DB_LIM(mp1);
4024		mp1->b_cont = mp;
4025		mp = mp1;
4026		iph = (mp->b_wptr - copylen);
4027	}
4028	mp->b_rptr = iph;
4029	bcopy(connp->conn_ht_iphc, iph, copylen);
4030	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
4031
4032	ixa->ixa_ip_hdr_length = ip_hdr_length;
4033
4034	/*
4035	 * Prepare for ICMPv6 checksum done in IP.
4036	 *
4037	 * icmp_build_hdr_template has already massaged any routing header
4038	 * and placed the result in conn_sum.
4039	 *
4040	 * We make it easy for IP to include our pseudo header
4041	 * by putting our length (and any routing header adjustment)
4042	 * in the ICMPv6 checksum field.
4043	 */
4044	cksum = pktlen - ip_hdr_length;
4045
4046	cksum += connp->conn_sum;
4047	cksum = (cksum >> 16) + (cksum & 0xFFFF);
4048	ASSERT(cksum < 0x10000);
4049
4050	ipp = &connp->conn_xmit_ipp;
4051	if (ixa->ixa_flags & IXAF_IS_IPV4) {
4052		ipha_t	*ipha = (ipha_t *)iph;
4053
4054		ipha->ipha_length = htons((uint16_t)pktlen);
4055
4056		/* if IP_PKTINFO specified an addres it wins over bind() */
4057		if ((ipp->ipp_fields & IPPF_ADDR) &&
4058		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4059			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4060			ipha->ipha_src = ipp->ipp_addr_v4;
4061		} else {
4062			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4063		}
4064	} else {
4065		ip6_t *ip6h = (ip6_t *)iph;
4066		uint_t	cksum_offset = 0;
4067
4068		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4069
4070		/* if IP_PKTINFO specified an addres it wins over bind() */
4071		if ((ipp->ipp_fields & IPPF_ADDR) &&
4072		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4073			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4074			ip6h->ip6_src = ipp->ipp_addr;
4075		} else {
4076			ip6h->ip6_src = *v6src;
4077		}
4078		ip6h->ip6_vcf =
4079		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4080		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4081		if (ipp->ipp_fields & IPPF_TCLASS) {
4082			/* Overrides the class part of flowinfo */
4083			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4084			    ipp->ipp_tclass);
4085		}
4086
4087		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4088			if (connp->conn_proto == IPPROTO_ICMPV6) {
4089				cksum_offset = ixa->ixa_ip_hdr_length +
4090				    offsetof(icmp6_t, icmp6_cksum);
4091			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4092				cksum_offset = ixa->ixa_ip_hdr_length +
4093				    ixa->ixa_raw_cksum_offset;
4094			}
4095		}
4096		if (cksum_offset != 0) {
4097			uint16_t *ptr;
4098
4099			/* Make sure the checksum fits in the first mblk */
4100			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4101				mblk_t *mp1;
4102
4103				mp1 = msgpullup(mp,
4104				    cksum_offset + sizeof (short));
4105				freemsg(mp);
4106				if (mp1 == NULL) {
4107					*errorp = ENOMEM;
4108					return (NULL);
4109				}
4110				mp = mp1;
4111				iph = mp->b_rptr;
4112				ip6h = (ip6_t *)iph;
4113			}
4114			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4115			*ptr = htons(cksum);
4116		}
4117	}
4118
4119	return (mp);
4120}
4121
/*
 * This routine handles all messages passed downstream.  It either
 * consumes the message or passes it downstream; it never queues a
 * message.
 */
int
icmp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;	/* only assigned and read on the AF_INET6 path */
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	icmp_stack_t	*is = icmp->icmp_is;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle several cases here: T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
	 * socket.
	 *
	 * Every path through this function returns 0; failures are reported
	 * upstream through icmp_ud_err() rather than through the return value.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		/* sockfs never sends down M_DATA */
		/* Count it as an output error and discard it. */
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return (0);

	case M_PROTO:
	case M_PCPROTO:
		/*
		 * Anything that is too short to be a T_unitdata_req, or whose
		 * primitive type is not T_UNITDATA_REQ, is handed to
		 * icmp_wput_other().
		 */
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			icmp_wput_other(q, mp);
			return (0);
		}
		break;

	default:
		icmp_wput_other(q, mp);
		return (0);
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/*
	 * Detach the payload: from here on mp is only the T_UNITDATA_REQ
	 * header block and data_mp is tracked separately.
	 */
	mp->b_cont = NULL;

	/* The destination address must lie entirely within the header block. */
	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this message, hence we ASSERT.
	 * On production kernels we return an error to be robust against
	 * random streams modules sitting on top of us.
	 */
	cr = msg_getcred(mp, &pid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto ud_error2;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (icmp->icmp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		/* The address must be aligned, sin6_t-sized and AF_INET6. */
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* An unspecified destination is rewritten to loopback. */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			/*
			 * Use the cached-destination path only when the
			 * address, the source id and the IPsec policy are
			 * all unchanged since the last send.
			 */
			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, NULL,
				    sin6, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Success: only the header block is left to free. */
			freeb(mp);
			return (0);
		}
		/*
		 * Failure: fall out of the switch to the icmp_ud_err() call
		 * below. data_mp is not freed here; NOTE(review): presumably
		 * the output routine has already disposed of it -- confirm.
		 */
		break;

	case AF_INET:
		/* The address must be aligned, sin_t-sized and AF_INET. */
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		/* INADDR_ANY as a destination is rewritten to loopback. */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* Protocol 255 contains full IP headers */
		/* Read without holding lock */
		if (icmp->icmp_hdrincl) {
			/*
			 * Make sure at least a simple IP header is contiguous
			 * in the first data block.
			 */
			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
					error = EINVAL;
					goto ud_error2;
				}
			}
			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
			if (error == 0) {
				freeb(mp);
				return (0);
			}
			/* data_mp consumed above */
			data_mp = NULL;
			goto ud_error2;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, sin,
				    NULL, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			freeb(mp);
			return (0);
		}
		/* Same failure handling as the AF_INET6 case above. */
		break;
	}
	/*
	 * Reached when an output routine returned non-zero. Unlike ud_error2
	 * below, this path neither bumps rawipOutErrors nor frees data_mp.
	 */
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return (0);

ud_error2:
	/*
	 * Failure detected in this function: count the error, free whatever
	 * payload we still hold (freemsg() is called with NULL when the
	 * payload is absent or was already consumed) and report the error
	 * using the header block.
	 */
	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return (0);
}
4376
/*
 * Handle the case of the IP address or flow label being different
 * for both IPv4 and IPv6.
 *
 * Exactly one of sin/sin6 is used: if sin is non-NULL the destination is
 * taken from it (IPv4), otherwise sin6 is dereferenced.
 *
 * cr and pid are stored in the ixa while the packet is being sent; the
 * conn's own conn_cred/conn_cpid are put back into the ixa before returning.
 *
 * The caller's reference on ixa is released on every return path. On the
 * ud_error path data_mp is freed here and rawipOutErrors is bumped.
 *
 * On the successful path the ixa replaces the conn's current one (via
 * conn_replace_ixa) and the destination, flow label, scope id, source id and
 * source address are recorded in the conn_*last* fields.
 *
 * Returns 0 or an errno value. Note that an ENETDOWN result from
 * ip_attr_connect() is turned into a return of 0 (the packet is dropped),
 * and EADDRNOTAVAIL is reported to the caller as ENETUNREACH.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
 */
static int
icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
    cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		error;
	ip_xmit_attr_t	*oldixa;
	boolean_t	do_ipsec;
	uint_t		srcid;
	uint32_t	flowinfo;
	in6_addr_t	v6src;
	in6_addr_t	v6dst;
	in6_addr_t	v6nexthop;
	in_port_t	dstport;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT(ixa != NULL);

	/*
	 * We hold conn_lock across all the use and modifications of
	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
	 * stay consistent.
	 */

	ASSERT(cr != NULL);
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;
	if (is_system_labeled()) {
		/* We need to restart with a label based on the cred */
		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
	}
	/*
	 * If we are connected then the destination needs to be the
	 * same as the connected one, which is not the case here since we
	 * checked for that above.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/* In case previous destination was multicast or multirt */
	ip_attr_newdst(ixa);

	/*
	 * If laddr is unspecified then we look at sin6_src_id.
	 * We will give precedence to a source address set with IPV6_PKTINFO
	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
	 * want ip_attr_connect to select a source (since it can fail) when
	 * IPV6_PKTINFO is specified.
	 * If this doesn't result in a source address then we get a source
	 * from ip_attr_connect() below.
	 */
	v6src = connp->conn_saddr_v6;
	if (sin != NULL) {
		/* IPv4 destination, carried internally as a v4-mapped address. */
		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
		dstport = sin->sin_port;
		flowinfo = 0;
		/* Don't bother with ip_srcid_find_id(), but indicate anyway. */
		srcid = 0;
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		ixa->ixa_flags |= IXAF_IS_IPV4;
	} else {
		boolean_t v4mapped;

		v6dst = sin6->sin6_addr;
		dstport = sin6->sin6_port;
		flowinfo = sin6->sin6_flowinfo;
		srcid = sin6->__sin6_src_id;
		/* A scope id is only honored for link-scope destinations. */
		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
			ixa->ixa_scopeid = sin6->sin6_scope_id;
			ixa->ixa_flags |= IXAF_SCOPEID_SET;
		} else {
			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		}
		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
		if (v4mapped)
			ixa->ixa_flags |= IXAF_IS_IPV4;
		else
			ixa->ixa_flags &= ~IXAF_IS_IPV4;
		/* Resolve the source id only when no local address is set. */
		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    v4mapped, connp->conn_netstack)) {
				/* Mismatched v4mapped/v6 specified by srcid. */
				mutex_exit(&connp->conn_lock);
				error = EADDRNOTAVAIL;
				goto ud_error;
			}
		}
	}
	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
	if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
		ip_pkt_t *ipp = &connp->conn_xmit_ipp;

		/* Only take ipp_addr when its family matches the destination. */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		} else {
			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		}
	}

	/* Defer IPsec if it might need to look at ICMP type/code */
	switch (ixa->ixa_protocol) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		do_ipsec = B_FALSE;
		break;
	default:
		do_ipsec = B_TRUE;
	}

	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
	/* conn_lock is dropped across ip_attr_connect() and re-taken below. */
	mutex_exit(&connp->conn_lock);

	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
	    (do_ipsec ? IPDF_IPSEC : 0));
	switch (error) {
	case 0:
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		goto failed;
	case ENETDOWN:
		/*
		 * Have !ipif_addr_ready address; drop packet silently
		 * until we can get applications to not send until we
		 * are ready.
		 */
		error = 0;
		goto failed;
	case EHOSTUNREACH:
	case ENETUNREACH:
		if (ixa->ixa_ire != NULL) {
			/*
			 * Let conn_ip_output/ire_send_noroute return
			 * the error and send any local ICMP error.
			 */
			error = 0;
			break;
		}
		/* FALLTHRU */
	default:
	failed:
		goto ud_error;
	}

	mutex_enter(&connp->conn_lock);
	/*
	 * While we dropped the lock some other thread might have connected
	 * this socket. If so we bail out with EISCONN to ensure that the
	 * connecting thread is the one that updates conn_ixa, conn_ht_*
	 * and conn_*last*.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/*
	 * We need to rebuild the headers if
	 *  - we are labeling packets (could be different for different
	 *    destinations)
	 *  - we have a source route (or routing header) since we need to
	 *    massage that to get the pseudo-header checksum
	 *  - a socket option with COA_HEADER_CHANGED has been set which
	 *    set conn_v6lastdst to zero.
	 *
	 * Otherwise the prepend function will just update the src, dst,
	 * and flow label.
	 */
	if (is_system_labeled()) {
		/* TX MLP requires SCM_UCRED and don't have that here */
		if (connp->conn_mlp_type != mlptSingle) {
			mutex_exit(&connp->conn_lock);
			error = ECONNREFUSED;
			goto ud_error;
		}
		/*
		 * Check whether Trusted Solaris policy allows communication
		 * with this host, and pretend that the destination is
		 * unreachable if not.
		 * Compute any needed label and place it in ipp_label_v4/v6.
		 *
		 * Later conn_build_hdr_template/conn_prepend_hdr takes
		 * ipp_label_v4/v6 to form the packet.
		 *
		 * Tsol note: Since we hold conn_lock we know no other
		 * thread manipulates conn_xmit_ipp.
		 */
		error = conn_update_label(connp, ixa, &v6dst,
		    &connp->conn_xmit_ipp);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else if (connp->conn_xmit_ipp.ipp_fields &
	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else {
		/* Simply update the destination address if no source route */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;

			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
			/* Keep the template's DF bit in step with the ixa. */
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
		} else {
			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
			ip6h->ip6_dst = v6dst;
		}
	}

	/*
	 * Remember the dst etc which corresponds to the built header
	 * template and conn_ixa.
	 */
	oldixa = conn_replace_ixa(connp, ixa);
	connp->conn_v6lastdst = v6dst;
	connp->conn_lastflowinfo = flowinfo;
	connp->conn_lastscopeid = ixa->ixa_scopeid;
	connp->conn_lastsrcid = srcid;
	/* Also remember a source to use together with lastdst */
	connp->conn_v6lastsrc = v6src;

	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
	    flowinfo, &error);

	/* Done with conn_t */
	mutex_exit(&connp->conn_lock);
	ixa_refrele(oldixa);

	if (data_mp == NULL) {
		ASSERT(error != 0);
		goto ud_error;
	}

	if (!do_ipsec) {
		/* Policy might differ for different ICMP type/code */
		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
		if (data_mp == NULL) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			error = EHOSTUNREACH;	/* IPsec policy failure */
			goto done;
		}
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);

	error = conn_ip_output(data_mp, ixa);
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		/* Not reported to the caller as an error. */
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
done:
	/* Common exit once data_mp is no longer ours to free. */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);

ud_error:
	/*
	 * Failure exit while we still own data_mp (freemsg() is called with
	 * NULL if the header prepend already returned NULL).
	 */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);

	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	return (error);
}
4709
4710/* ARGSUSED */
4711static int
4712icmp_wput_fallback(queue_t *q, mblk_t *mp)
4713{
4714#ifdef DEBUG
4715	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4716#endif
4717	freemsg(mp);
4718	return (0);
4719}
4720
4721static void
4722icmp_wput_other(queue_t *q, mblk_t *mp)
4723{
4724	uchar_t	*rptr = mp->b_rptr;
4725	struct iocblk *iocp;
4726	conn_t	*connp = Q_TO_CONN(q);
4727	icmp_t	*icmp = connp->conn_icmp;
4728	cred_t *cr;
4729
4730	switch (mp->b_datap->db_type) {
4731	case M_PROTO:
4732	case M_PCPROTO:
4733		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4734			/*
4735			 * If the message does not contain a PRIM_type,
4736			 * throw it away.
4737			 */
4738			freemsg(mp);
4739			return;
4740		}
4741		switch (((t_primp_t)rptr)->type) {
4742		case T_ADDR_REQ:
4743			icmp_addr_req(q, mp);
4744			return;
4745		case O_T_BIND_REQ:
4746		case T_BIND_REQ:
4747			icmp_tpi_bind(q, mp);
4748			return;
4749		case T_CONN_REQ:
4750			icmp_tpi_connect(q, mp);
4751			return;
4752		case T_CAPABILITY_REQ:
4753			icmp_capability_req(q, mp);
4754			return;
4755		case T_INFO_REQ:
4756			icmp_info_req(q, mp);
4757			return;
4758		case T_UNITDATA_REQ:
4759			/*
4760			 * If a T_UNITDATA_REQ gets here, the address must
4761			 * be bad.  Valid T_UNITDATA_REQs are handled
4762			 * in icmp_wput.
4763			 */
4764			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4765			return;
4766		case T_UNBIND_REQ:
4767			icmp_tpi_unbind(q, mp);
4768			return;
4769		case T_SVR4_OPTMGMT_REQ:
4770			/*
4771			 * All Solaris components should pass a db_credp
4772			 * for this TPI message, hence we ASSERT.
4773			 * But in case there is some other M_PROTO that looks
4774			 * like a TPI message sent by some other kernel
4775			 * component, we check and return an error.
4776			 */
4777			cr = msg_getcred(mp, NULL);
4778			ASSERT(cr != NULL);
4779			if (cr == NULL) {
4780				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4781				return;
4782			}
4783
4784			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4785			    cr)) {
4786				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4787			}
4788			return;
4789
4790		case T_OPTMGMT_REQ:
4791			/*
4792			 * All Solaris components should pass a db_credp
4793			 * for this TPI message, hence we ASSERT.
4794			 * But in case there is some other M_PROTO that looks
4795			 * like a TPI message sent by some other kernel
4796			 * component, we check and return an error.
4797			 */
4798			cr = msg_getcred(mp, NULL);
4799			ASSERT(cr != NULL);
4800			if (cr == NULL) {
4801				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4802				return;
4803			}
4804			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4805			return;
4806
4807		case T_DISCON_REQ:
4808			icmp_tpi_disconnect(q, mp);
4809			return;
4810
4811		/* The following TPI message is not supported by icmp. */
4812		case O_T_CONN_RES:
4813		case T_CONN_RES:
4814			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4815			return;
4816
4817		/* The following 3 TPI requests are illegal for icmp. */
4818		case T_DATA_REQ:
4819		case T_EXDATA_REQ:
4820		case T_ORDREL_REQ:
4821			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4822			return;
4823		default:
4824			break;
4825		}
4826		break;
4827	case M_FLUSH:
4828		if (*rptr & FLUSHW)
4829			flushq(q, FLUSHDATA);
4830		break;
4831	case M_IOCTL:
4832		iocp = (struct iocblk *)mp->b_rptr;
4833		switch (iocp->ioc_cmd) {
4834		case TI_GETPEERNAME:
4835			if (icmp->icmp_state != TS_DATA_XFER) {
4836				/*
4837				 * If a default destination address has not
4838				 * been associated with the stream, then we
4839				 * don't know the peer's name.
4840				 */
4841				iocp->ioc_error = ENOTCONN;
4842				iocp->ioc_count = 0;
4843				mp->b_datap->db_type = M_IOCACK;
4844				qreply(q, mp);
4845				return;
4846			}
4847			/* FALLTHRU */
4848		case TI_GETMYNAME:
4849			/*
4850			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4851			 * need to copyin the user's strbuf structure.
4852			 * Processing will continue in the M_IOCDATA case
4853			 * below.
4854			 */
4855			mi_copyin(q, mp, NULL,
4856			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4857			return;
4858		default:
4859			break;
4860		}
4861		break;
4862	case M_IOCDATA:
4863		icmp_wput_iocdata(q, mp);
4864		return;
4865	default:
4866		/* Unrecognized messages are passed through without change. */
4867		break;
4868	}
4869	ip_wput_nondata(q, mp);
4870}
4871
4872/*
4873 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4874 * messages.
4875 */
4876static void
4877icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4878{
4879	mblk_t		*mp1;
4880	STRUCT_HANDLE(strbuf, sb);
4881	uint_t		addrlen;
4882	conn_t		*connp = Q_TO_CONN(q);
4883	icmp_t		*icmp = connp->conn_icmp;
4884
4885	/* Make sure it is one of ours. */
4886	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4887	case TI_GETMYNAME:
4888	case TI_GETPEERNAME:
4889		break;
4890	default:
4891		ip_wput_nondata(q, mp);
4892		return;
4893	}
4894
4895	switch (mi_copy_state(q, mp, &mp1)) {
4896	case -1:
4897		return;
4898	case MI_COPY_CASE(MI_COPY_IN, 1):
4899		break;
4900	case MI_COPY_CASE(MI_COPY_OUT, 1):
4901		/*
4902		 * The address has been copied out, so now
4903		 * copyout the strbuf.
4904		 */
4905		mi_copyout(q, mp);
4906		return;
4907	case MI_COPY_CASE(MI_COPY_OUT, 2):
4908		/*
4909		 * The address and strbuf have been copied out.
4910		 * We're done, so just acknowledge the original
4911		 * M_IOCTL.
4912		 */
4913		mi_copy_done(q, mp, 0);
4914		return;
4915	default:
4916		/*
4917		 * Something strange has happened, so acknowledge
4918		 * the original M_IOCTL with an EPROTO error.
4919		 */
4920		mi_copy_done(q, mp, EPROTO);
4921		return;
4922	}
4923
4924	/*
4925	 * Now we have the strbuf structure for TI_GETMYNAME
4926	 * and TI_GETPEERNAME.  Next we copyout the requested
4927	 * address and then we'll copyout the strbuf.
4928	 */
4929	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4930	    (void *)mp1->b_rptr);
4931
4932	if (connp->conn_family == AF_INET)
4933		addrlen = sizeof (sin_t);
4934	else
4935		addrlen = sizeof (sin6_t);
4936
4937	if (STRUCT_FGET(sb, maxlen) < addrlen) {
4938		mi_copy_done(q, mp, EINVAL);
4939		return;
4940	}
4941	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4942	case TI_GETMYNAME:
4943		break;
4944	case TI_GETPEERNAME:
4945		if (icmp->icmp_state != TS_DATA_XFER) {
4946			mi_copy_done(q, mp, ENOTCONN);
4947			return;
4948		}
4949		break;
4950	default:
4951		mi_copy_done(q, mp, EPROTO);
4952		return;
4953	}
4954	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4955	if (!mp1)
4956		return;
4957
4958	STRUCT_FSET(sb, len, addrlen);
4959	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4960	case TI_GETMYNAME:
4961		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4962		    &addrlen);
4963		break;
4964	case TI_GETPEERNAME:
4965		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4966		    &addrlen);
4967		break;
4968	}
4969	mp1->b_wptr += addrlen;
4970	/* Copy out the address */
4971	mi_copyout(q, mp);
4972}
4973
4974void
4975icmp_ddi_g_init(void)
4976{
4977	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4978	    icmp_opt_obj.odb_opt_arr_cnt);
4979
4980	/*
4981	 * We want to be informed each time a stack is created or
4982	 * destroyed in the kernel, so we can maintain the
4983	 * set of icmp_stack_t's.
4984	 */
4985	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4986}
4987
4988void
4989icmp_ddi_g_destroy(void)
4990{
4991	netstack_unregister(NS_ICMP);
4992}
4993
4994#define	INET_NAME	"ip"
4995
4996/*
4997 * Initialize the ICMP stack instance.
4998 */
4999static void *
5000rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5001{
5002	icmp_stack_t	*is;
5003	int		error = 0;
5004	size_t		arrsz;
5005	major_t		major;
5006
5007	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5008	is->is_netstack = ns;
5009
5010	arrsz = sizeof (icmp_propinfo_tbl);
5011	is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
5012	bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
5013
5014	is->is_ksp = rawip_kstat_init(stackid);
5015
5016	major = mod_name_to_major(INET_NAME);
5017	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5018	ASSERT(error == 0);
5019	return (is);
5020}
5021
5022/*
5023 * Free the ICMP stack instance.
5024 */
5025static void
5026rawip_stack_fini(netstackid_t stackid, void *arg)
5027{
5028	icmp_stack_t *is = (icmp_stack_t *)arg;
5029
5030	kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
5031	is->is_propinfo_tbl = NULL;
5032
5033	rawip_kstat_fini(stackid, is->is_ksp);
5034	is->is_ksp = NULL;
5035	ldi_ident_release(is->is_ldi_ident);
5036	kmem_free(is, sizeof (*is));
5037}
5038
5039static void *
5040rawip_kstat_init(netstackid_t stackid)
5041{
5042	kstat_t	*ksp;
5043
5044	rawip_named_kstat_t template = {
5045		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5046		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5047		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5048		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5049		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5050	};
5051
5052	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5053	    KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid);
5054	if (ksp == NULL || ksp->ks_data == NULL)
5055		return (NULL);
5056
5057	bcopy(&template, ksp->ks_data, sizeof (template));
5058	ksp->ks_update = rawip_kstat_update;
5059	ksp->ks_private = (void *)(uintptr_t)stackid;
5060
5061	kstat_install(ksp);
5062	return (ksp);
5063}
5064
5065static void
5066rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5067{
5068	if (ksp != NULL) {
5069		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5070		kstat_delete_netstack(ksp, stackid);
5071	}
5072}
5073
5074static int
5075rawip_kstat_update(kstat_t *ksp, int rw)
5076{
5077	rawip_named_kstat_t *rawipkp;
5078	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5079	netstack_t	*ns;
5080	icmp_stack_t	*is;
5081
5082	if (ksp->ks_data == NULL)
5083		return (EIO);
5084
5085	if (rw == KSTAT_WRITE)
5086		return (EACCES);
5087
5088	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5089
5090	ns = netstack_find_by_stackid(stackid);
5091	if (ns == NULL)
5092		return (-1);
5093	is = ns->netstack_icmp;
5094	if (is == NULL) {
5095		netstack_rele(ns);
5096		return (-1);
5097	}
5098	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5099	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5100	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5101	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5102	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5103	netstack_rele(ns);
5104	return (0);
5105}
5106
5107/* ARGSUSED */
5108int
5109rawip_accept(sock_lower_handle_t lproto_handle,
5110    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5111    cred_t *cr)
5112{
5113	return (EOPNOTSUPP);
5114}
5115
5116/* ARGSUSED */
5117int
5118rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5119    socklen_t len, cred_t *cr)
5120{
5121	conn_t  *connp = (conn_t *)proto_handle;
5122	int	error;
5123
5124	/* All Solaris components should pass a cred for this operation. */
5125	ASSERT(cr != NULL);
5126
5127	/* Binding to a NULL address really means unbind */
5128	if (sa == NULL)
5129		error = rawip_do_unbind(connp);
5130	else
5131		error = rawip_do_bind(connp, sa, len);
5132
5133	if (error < 0) {
5134		if (error == -TOUTSTATE)
5135			error = EINVAL;
5136		else
5137			error = proto_tlitosyserr(-error);
5138	}
5139	return (error);
5140}
5141
5142static int
5143rawip_implicit_bind(conn_t *connp)
5144{
5145	sin6_t sin6addr;
5146	sin_t *sin;
5147	sin6_t *sin6;
5148	socklen_t len;
5149	int error;
5150
5151	if (connp->conn_family == AF_INET) {
5152		len = sizeof (struct sockaddr_in);
5153		sin = (sin_t *)&sin6addr;
5154		*sin = sin_null;
5155		sin->sin_family = AF_INET;
5156		sin->sin_addr.s_addr = INADDR_ANY;
5157	} else {
5158		ASSERT(connp->conn_family == AF_INET6);
5159		len = sizeof (sin6_t);
5160		sin6 = (sin6_t *)&sin6addr;
5161		*sin6 = sin6_null;
5162		sin6->sin6_family = AF_INET6;
5163		V6_SET_ZERO(sin6->sin6_addr);
5164	}
5165
5166	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5167
5168	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5169}
5170
5171static int
5172rawip_unbind(conn_t *connp)
5173{
5174	int error;
5175
5176	error = rawip_do_unbind(connp);
5177	if (error < 0) {
5178		error = proto_tlitosyserr(-error);
5179	}
5180	return (error);
5181}
5182
5183/* ARGSUSED */
5184int
5185rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5186{
5187	return (EOPNOTSUPP);
5188}
5189
5190int
5191rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5192    socklen_t len, sock_connid_t *id, cred_t *cr)
5193{
5194	conn_t	*connp = (conn_t *)proto_handle;
5195	icmp_t *icmp = connp->conn_icmp;
5196	int	error;
5197	boolean_t did_bind = B_FALSE;
5198	pid_t	pid = curproc->p_pid;
5199
5200	/* All Solaris components should pass a cred for this operation. */
5201	ASSERT(cr != NULL);
5202
5203	if (sa == NULL) {
5204		/*
5205		 * Disconnect
5206		 * Make sure we are connected
5207		 */
5208		if (icmp->icmp_state != TS_DATA_XFER)
5209			return (EINVAL);
5210
5211		error = icmp_disconnect(connp);
5212		return (error);
5213	}
5214
5215	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5216	if (error != 0)
5217		return (error);
5218
5219	/* do an implicit bind if necessary */
5220	if (icmp->icmp_state == TS_UNBND) {
5221		error = rawip_implicit_bind(connp);
5222		/*
5223		 * We could be racing with an actual bind, in which case
5224		 * we would see EPROTO. We cross our fingers and try
5225		 * to connect.
5226		 */
5227		if (!(error == 0 || error == EPROTO))
5228			return (error);
5229		did_bind = B_TRUE;
5230	}
5231
5232	/*
5233	 * set SO_DGRAM_ERRIND
5234	 */
5235	connp->conn_dgram_errind = B_TRUE;
5236
5237	error = rawip_do_connect(connp, sa, len, cr, pid);
5238	if (error != 0 && did_bind) {
5239		int unbind_err;
5240
5241		unbind_err = rawip_unbind(connp);
5242		ASSERT(unbind_err == 0);
5243	}
5244
5245	if (error == 0) {
5246		*id = 0;
5247		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5248		    0, NULL, -1);
5249	} else if (error < 0) {
5250		error = proto_tlitosyserr(-error);
5251	}
5252	return (error);
5253}
5254
/*
 * Convert the socket behind proto_handle into one that sits on the STREAMS
 * queue pair q.
 *
 * The steps, in order: point the queue pair and the conn_t at each other,
 * send an M_SETOPTS message upstream, free the helper stream, gather the
 * capability information, local/peer addresses and option flags and hand
 * them to quiesced_cb, push every message held on the icmp fallback queue
 * upstream, and finally clear IPCL_NONSTR on the conn.
 *
 * Always returns 0.
 */
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *mp, *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* Allocated up front, before any state is changed. */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 */
	/* Save the old q_ptr values before they are overwritten below. */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff = connp->conn_wroff;
	stropt->so_hiwat = connp->conn_rcvbuf;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* A failed peer-name lookup is reported as a zero-length address. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (connp->conn_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued in icmp_t. Now we push up any queued packets.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	/* Anything the callback returned goes to the front of the queue. */
	if (mp != NULL) {
		mp->b_next = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp;
	}
	while (icmp->icmp_fallback_queue_head != NULL) {
		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		/* icmp_recv_lock is not held across putnext(). */
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL once the loop exits, so this clears the tail too. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 */
	/* Done while icmp_recv_lock is still held. */
	mutex_enter(&connp->conn_lock);
	connp->conn_flags &= ~IPCL_NONSTR;
	mutex_exit(&connp->conn_lock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
5359
5360/* ARGSUSED2 */
5361sock_lower_handle_t
5362rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5363    uint_t *smodep, int *errorp, int flags, cred_t *credp)
5364{
5365	conn_t *connp;
5366
5367	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5368		*errorp = EPROTONOSUPPORT;
5369		return (NULL);
5370	}
5371
5372	connp = rawip_do_open(family, credp, errorp, flags);
5373	if (connp != NULL) {
5374		connp->conn_flags |= IPCL_NONSTR;
5375
5376		mutex_enter(&connp->conn_lock);
5377		connp->conn_state_flags &= ~CONN_INCIPIENT;
5378		mutex_exit(&connp->conn_lock);
5379		*sock_downcalls = &sock_rawip_downcalls;
5380		*smodep = SM_ATOMIC;
5381	} else {
5382		ASSERT(*errorp != 0);
5383	}
5384
5385	return ((sock_lower_handle_t)connp);
5386}
5387
5388/* ARGSUSED3 */
5389void
5390rawip_activate(sock_lower_handle_t proto_handle,
5391    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5392    cred_t *cr)
5393{
5394	conn_t *connp = (conn_t *)proto_handle;
5395	struct sock_proto_props sopp;
5396
5397	/* All Solaris components should pass a cred for this operation. */
5398	ASSERT(cr != NULL);
5399
5400	connp->conn_upcalls = sock_upcalls;
5401	connp->conn_upper_handle = sock_handle;
5402
5403	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5404	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5405	sopp.sopp_wroff = connp->conn_wroff;
5406	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5407	sopp.sopp_rxlowat = connp->conn_rcvlowat;
5408	sopp.sopp_maxblk = INFPSZ;
5409	sopp.sopp_maxpsz = IP_MAXPACKET;
5410	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5411	    icmp_mod_info.mi_minpsz;
5412
5413	(*connp->conn_upcalls->su_set_proto_props)
5414	    (connp->conn_upper_handle, &sopp);
5415
5416	icmp_bind_proto(connp->conn_icmp);
5417}
5418
5419/* ARGSUSED3 */
5420int
5421rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5422    socklen_t *salenp, cred_t *cr)
5423{
5424	conn_t  *connp = (conn_t *)proto_handle;
5425	icmp_t  *icmp = connp->conn_icmp;
5426	int	error;
5427
5428	/* All Solaris components should pass a cred for this operation. */
5429	ASSERT(cr != NULL);
5430
5431	mutex_enter(&connp->conn_lock);
5432	if (icmp->icmp_state != TS_DATA_XFER)
5433		error = ENOTCONN;
5434	else
5435		error = conn_getpeername(connp, sa, salenp);
5436	mutex_exit(&connp->conn_lock);
5437	return (error);
5438}
5439
5440/* ARGSUSED3 */
5441int
5442rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5443    socklen_t *salenp, cred_t *cr)
5444{
5445	conn_t  *connp = (conn_t *)proto_handle;
5446	int	error;
5447
5448	/* All Solaris components should pass a cred for this operation. */
5449	ASSERT(cr != NULL);
5450
5451	mutex_enter(&connp->conn_lock);
5452	error = conn_getsockname(connp, sa, salenp);
5453	mutex_exit(&connp->conn_lock);
5454	return (error);
5455}
5456
5457int
5458rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5459    const void *optvalp, socklen_t optlen, cred_t *cr)
5460{
5461	conn_t	*connp = (conn_t *)proto_handle;
5462	int error;
5463
5464	/* All Solaris components should pass a cred for this operation. */
5465	ASSERT(cr != NULL);
5466
5467	error = proto_opt_check(level, option_name, optlen, NULL,
5468	    icmp_opt_obj.odb_opt_des_arr,
5469	    icmp_opt_obj.odb_opt_arr_cnt,
5470	    B_TRUE, B_FALSE, cr);
5471
5472	if (error != 0) {
5473		/*
5474		 * option not recognized
5475		 */
5476		if (error < 0) {
5477			error = proto_tlitosyserr(-error);
5478		}
5479		return (error);
5480	}
5481
5482	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5483	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5484	    (uchar_t *)optvalp, NULL, cr);
5485
5486	ASSERT(error >= 0);
5487
5488	return (error);
5489}
5490
5491int
5492rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5493    void *optvalp, socklen_t *optlen, cred_t *cr)
5494{
5495	int		error;
5496	conn_t		*connp = (conn_t *)proto_handle;
5497	t_uscalar_t	max_optbuf_len;
5498	void		*optvalp_buf;
5499	int		len;
5500
5501	/* All Solaris components should pass a cred for this operation. */
5502	ASSERT(cr != NULL);
5503
5504	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5505	    icmp_opt_obj.odb_opt_des_arr,
5506	    icmp_opt_obj.odb_opt_arr_cnt,
5507	    B_FALSE, B_TRUE, cr);
5508
5509	if (error != 0) {
5510		if (error < 0) {
5511			error = proto_tlitosyserr(-error);
5512		}
5513		return (error);
5514	}
5515
5516	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5517	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5518	if (len == -1) {
5519		kmem_free(optvalp_buf, max_optbuf_len);
5520		return (EINVAL);
5521	}
5522
5523	/*
5524	 * update optlen and copy option value
5525	 */
5526	t_uscalar_t size = MIN(len, *optlen);
5527
5528	bcopy(optvalp_buf, optvalp, size);
5529	bcopy(&size, optlen, sizeof (size));
5530
5531	kmem_free(optvalp_buf, max_optbuf_len);
5532	return (0);
5533}
5534
5535/* ARGSUSED1 */
5536int
5537rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5538{
5539	conn_t	*connp = (conn_t *)proto_handle;
5540
5541	/* All Solaris components should pass a cred for this operation. */
5542	ASSERT(cr != NULL);
5543
5544	(void) rawip_do_close(connp);
5545	return (0);
5546}
5547
5548/* ARGSUSED2 */
5549int
5550rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5551{
5552	conn_t  *connp = (conn_t *)proto_handle;
5553
5554	/* All Solaris components should pass a cred for this operation. */
5555	ASSERT(cr != NULL);
5556
5557	/* shut down the send side */
5558	if (how != SHUT_RD)
5559		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5560		    SOCK_OPCTL_SHUT_SEND, 0);
5561	/* shut down the recv side */
5562	if (how != SHUT_WR)
5563		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5564		    SOCK_OPCTL_SHUT_RECV, 0);
5565	return (0);
5566}
5567
5568void
5569rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5570{
5571	conn_t  *connp = (conn_t *)proto_handle;
5572	icmp_t	*icmp = connp->conn_icmp;
5573
5574	mutex_enter(&icmp->icmp_recv_lock);
5575	connp->conn_flow_cntrld = B_FALSE;
5576	mutex_exit(&icmp->icmp_recv_lock);
5577}
5578
5579int
5580rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5581    int mode, int32_t *rvalp, cred_t *cr)
5582{
5583	conn_t		*connp = (conn_t *)proto_handle;
5584	int		error;
5585
5586	/* All Solaris components should pass a cred for this operation. */
5587	ASSERT(cr != NULL);
5588
5589	/*
5590	 * If we don't have a helper stream then create one.
5591	 * ip_create_helper_stream takes care of locking the conn_t,
5592	 * so this check for NULL is just a performance optimization.
5593	 */
5594	if (connp->conn_helper_info == NULL) {
5595		icmp_stack_t *is = connp->conn_icmp->icmp_is;
5596
5597		ASSERT(is->is_ldi_ident != NULL);
5598
5599		/*
5600		 * Create a helper stream for non-STREAMS socket.
5601		 */
5602		error = ip_create_helper_stream(connp, is->is_ldi_ident);
5603		if (error != 0) {
5604			ip0dbg(("rawip_ioctl: create of IP helper stream "
5605			    "failed %d\n", error));
5606			return (error);
5607		}
5608	}
5609
5610	switch (cmd) {
5611	case _SIOCSOCKFALLBACK:
5612	case TI_GETPEERNAME:
5613	case TI_GETMYNAME:
5614#ifdef DEBUG
5615		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5616		    " socket", cmd);
5617#endif
5618		error = EINVAL;
5619		break;
5620	default:
5621		/*
5622		 * Pass on to IP using helper stream
5623		 */
5624		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5625		    cmd, arg, mode, cr, rvalp);
5626		break;
5627	}
5628	return (error);
5629}
5630
5631int
5632rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5633    cred_t *cr)
5634{
5635	sin6_t		*sin6;
5636	sin_t		*sin = NULL;
5637	uint_t		srcid;
5638	conn_t		*connp = (conn_t *)proto_handle;
5639	icmp_t		*icmp = connp->conn_icmp;
5640	int		error = 0;
5641	icmp_stack_t	*is = icmp->icmp_is;
5642	pid_t		pid = curproc->p_pid;
5643	ip_xmit_attr_t	*ixa;
5644
5645	ASSERT(DB_TYPE(mp) == M_DATA);
5646
5647	/* All Solaris components should pass a cred for this operation. */
5648	ASSERT(cr != NULL);
5649
5650	/* do an implicit bind if necessary */
5651	if (icmp->icmp_state == TS_UNBND) {
5652		error = rawip_implicit_bind(connp);
5653		/*
5654		 * We could be racing with an actual bind, in which case
5655		 * we would see EPROTO. We cross our fingers and try
5656		 * to connect.
5657		 */
5658		if (!(error == 0 || error == EPROTO)) {
5659			freemsg(mp);
5660			return (error);
5661		}
5662	}
5663
5664	/* Protocol 255 contains full IP headers */
5665	/* Read without holding lock */
5666	if (icmp->icmp_hdrincl) {
5667		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5668		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5669			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5670				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5671				freemsg(mp);
5672				return (EINVAL);
5673			}
5674		}
5675		error = icmp_output_hdrincl(connp, mp, cr, pid);
5676		if (is->is_sendto_ignerr)
5677			return (0);
5678		else
5679			return (error);
5680	}
5681
5682	/* Connected? */
5683	if (msg->msg_name == NULL) {
5684		if (icmp->icmp_state != TS_DATA_XFER) {
5685			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5686			return (EDESTADDRREQ);
5687		}
5688		if (msg->msg_controllen != 0) {
5689			error = icmp_output_ancillary(connp, NULL, NULL, mp,
5690			    NULL, msg, cr, pid);
5691		} else {
5692			error = icmp_output_connected(connp, mp, cr, pid);
5693		}
5694		if (is->is_sendto_ignerr)
5695			return (0);
5696		else
5697			return (error);
5698	}
5699	if (icmp->icmp_state == TS_DATA_XFER) {
5700		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5701		return (EISCONN);
5702	}
5703	error = proto_verify_ip_addr(connp->conn_family,
5704	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5705	if (error != 0) {
5706		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5707		return (error);
5708	}
5709	switch (connp->conn_family) {
5710	case AF_INET6:
5711		sin6 = (sin6_t *)msg->msg_name;
5712
5713		/* No support for mapped addresses on raw sockets */
5714		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5715			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5716			return (EADDRNOTAVAIL);
5717		}
5718		srcid = sin6->__sin6_src_id;
5719
5720		/*
5721		 * If the local address is a mapped address return
5722		 * an error.
5723		 * It would be possible to send an IPv6 packet but the
5724		 * response would never make it back to the application
5725		 * since it is bound to a mapped address.
5726		 */
5727		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5728			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5729			return (EADDRNOTAVAIL);
5730		}
5731
5732		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5733			sin6->sin6_addr = ipv6_loopback;
5734
5735		/*
5736		 * We have to allocate an ip_xmit_attr_t before we grab
5737		 * conn_lock and we need to hold conn_lock once we've check
5738		 * conn_same_as_last_v6 to handle concurrent send* calls on a
5739		 * socket.
5740		 */
5741		if (msg->msg_controllen == 0) {
5742			ixa = conn_get_ixa(connp, B_FALSE);
5743			if (ixa == NULL) {
5744				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5745				return (ENOMEM);
5746			}
5747		} else {
5748			ixa = NULL;
5749		}
5750		mutex_enter(&connp->conn_lock);
5751		if (icmp->icmp_delayed_error != 0) {
5752			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5753
5754			error = icmp->icmp_delayed_error;
5755			icmp->icmp_delayed_error = 0;
5756
5757			/* Compare IP address and family */
5758
5759			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5760			    &sin2->sin6_addr) &&
5761			    sin6->sin6_family == sin2->sin6_family) {
5762				mutex_exit(&connp->conn_lock);
5763				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5764				if (ixa != NULL)
5765					ixa_refrele(ixa);
5766				return (error);
5767			}
5768		}
5769		if (msg->msg_controllen != 0) {
5770			mutex_exit(&connp->conn_lock);
5771			ASSERT(ixa == NULL);
5772			error = icmp_output_ancillary(connp, NULL, sin6, mp,
5773			    NULL, msg, cr, pid);
5774		} else if (conn_same_as_last_v6(connp, sin6) &&
5775		    connp->conn_lastsrcid == srcid &&
5776		    ipsec_outbound_policy_current(ixa)) {
5777			/* icmp_output_lastdst drops conn_lock */
5778			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5779		} else {
5780			/* icmp_output_newdst drops conn_lock */
5781			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5782			    pid, ixa);
5783		}
5784		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5785		if (is->is_sendto_ignerr)
5786			return (0);
5787		else
5788			return (error);
5789	case AF_INET:
5790		sin = (sin_t *)msg->msg_name;
5791
5792		if (sin->sin_addr.s_addr == INADDR_ANY)
5793			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5794
5795		/*
5796		 * We have to allocate an ip_xmit_attr_t before we grab
5797		 * conn_lock and we need to hold conn_lock once we've check
5798		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
5799		 */
5800		if (msg->msg_controllen == 0) {
5801			ixa = conn_get_ixa(connp, B_FALSE);
5802			if (ixa == NULL) {
5803				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5804				return (ENOMEM);
5805			}
5806		} else {
5807			ixa = NULL;
5808		}
5809		mutex_enter(&connp->conn_lock);
5810		if (icmp->icmp_delayed_error != 0) {
5811			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5812
5813			error = icmp->icmp_delayed_error;
5814			icmp->icmp_delayed_error = 0;
5815
5816			/* Compare IP address */
5817
5818			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5819				mutex_exit(&connp->conn_lock);
5820				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5821				if (ixa != NULL)
5822					ixa_refrele(ixa);
5823				return (error);
5824			}
5825		}
5826
5827		if (msg->msg_controllen != 0) {
5828			mutex_exit(&connp->conn_lock);
5829			ASSERT(ixa == NULL);
5830			error = icmp_output_ancillary(connp, sin, NULL, mp,
5831			    NULL, msg, cr, pid);
5832		} else if (conn_same_as_last_v4(connp, sin) &&
5833		    ipsec_outbound_policy_current(ixa)) {
5834			/* icmp_output_lastdst drops conn_lock */
5835			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5836		} else {
5837			/* icmp_output_newdst drops conn_lock */
5838			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5839			    pid, ixa);
5840		}
5841		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5842		if (is->is_sendto_ignerr)
5843			return (0);
5844		else
5845			return (error);
5846	default:
5847		return (EINVAL);
5848	}
5849}
5850
/*
 * Downcall table for raw IP sockets.  rawip_create() returns a pointer to
 * this table through its sock_downcalls out parameter.
 *
 * The initializer is positional, so the entries must stay in the same
 * order as the members of sock_downcalls_t.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	/*
	 * The next three entry points are not provided by rawip.
	 * NOTE(review): check sock_downcalls_t for which members these are.
	 */
	NULL,
	NULL,
	NULL,
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
5870