1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
25 */
26/* Copyright (c) 1990 Mentat Inc. */
27
28#include <sys/sysmacros.h>
29#include <sys/types.h>
30#include <sys/stream.h>
31#include <sys/stropts.h>
32#include <sys/strlog.h>
33#include <sys/strsun.h>
34#define	_SUN_TPI_VERSION 2
35#include <sys/tihdr.h>
36#include <sys/timod.h>
37#include <sys/ddi.h>
38#include <sys/sunddi.h>
39#include <sys/strsubr.h>
40#include <sys/suntpi.h>
41#include <sys/xti_inet.h>
42#include <sys/kmem.h>
43#include <sys/cred_impl.h>
44#include <sys/policy.h>
45#include <sys/priv.h>
46#include <sys/ucred.h>
47#include <sys/zone.h>
48
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51#include <sys/sockio.h>
52#include <sys/vtrace.h>
53#include <sys/sdt.h>
54#include <sys/debug.h>
55#include <sys/isa_defs.h>
56#include <sys/random.h>
57#include <netinet/in.h>
58#include <netinet/ip6.h>
59#include <netinet/icmp6.h>
60#include <netinet/udp.h>
61
62#include <inet/common.h>
63#include <inet/ip.h>
64#include <inet/ip_impl.h>
65#include <inet/ipsec_impl.h>
66#include <inet/ip6.h>
67#include <inet/ip_ire.h>
68#include <inet/ip_if.h>
69#include <inet/ip_multi.h>
70#include <inet/ip_ndp.h>
71#include <inet/proto_set.h>
72#include <inet/mib2.h>
73#include <inet/optcom.h>
74#include <inet/snmpcom.h>
75#include <inet/kstatcom.h>
76#include <inet/ipclassifier.h>
77#include <sys/squeue_impl.h>
78#include <inet/ipnet.h>
79#include <sys/ethernet.h>
80
81#include <sys/tsol/label.h>
82#include <sys/tsol/tnet.h>
83#include <rpc/pmap_prot.h>
84
85#include <inet/udp_impl.h>
86
87/*
88 * Synchronization notes:
89 *
90 * UDP is MT and uses the usual kernel synchronization primitives. There are 2
91 * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
92 * protects the contents of the udp_t. uf_lock protects the address and the
93 * fanout information.
94 * The lock order is conn_lock -> uf_lock.
95 *
96 * The fanout lock uf_lock:
97 * When a UDP endpoint is bound to a local port, it is inserted into
98 * a bind hash list.  The list consists of an array of udp_fanout_t buckets.
99 * The size of the array is controlled by the udp_bind_fanout_size variable.
100 * This variable can be changed in /etc/system if the default value is
101 * not large enough.  Each bind hash bucket is protected by a per bucket
102 * lock.  It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
103 * structure and a few other fields in the udp_t. A UDP endpoint is removed
104 * from the bind hash list only when it is being unbound or being closed.
105 * The per bucket lock also protects a UDP endpoint's state changes.
106 *
107 * Plumbing notes:
108 * UDP is always a device driver. For compatibility with mibopen() code
109 * it is possible to I_PUSH "udp", but that results in pushing a passthrough
110 * dummy module.
111 *
 * The above implies that we don't support any intermediate module
 * residing between /dev/ip and udp -- in fact, we never supported such
 * a scenario in the past, as the inter-layer communication semantics have
 * always been private.
116 */
117
118/* For /etc/system control */
119uint_t udp_bind_fanout_size = UDP_BIND_FANOUT_SIZE;
120
121static void	udp_addr_req(queue_t *q, mblk_t *mp);
122static void	udp_tpi_bind(queue_t *q, mblk_t *mp);
123static void	udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
124static void	udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock);
125static int	udp_build_hdr_template(conn_t *, const in6_addr_t *,
126    const in6_addr_t *, in_port_t, uint32_t);
127static void	udp_capability_req(queue_t *q, mblk_t *mp);
128static int	udp_tpi_close(queue_t *q, int flags, cred_t *);
129static void	udp_close_free(conn_t *);
130static void	udp_tpi_connect(queue_t *q, mblk_t *mp);
131static void	udp_tpi_disconnect(queue_t *q, mblk_t *mp);
132static void	udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
133    int sys_error);
134static void	udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
135    t_scalar_t tlierr, int sys_error);
136static int	udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
137		    cred_t *cr);
138static int	udp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
139		    char *value, caddr_t cp, cred_t *cr);
140static int	udp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
141		    char *value, caddr_t cp, cred_t *cr);
142static void	udp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
143static void	udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
144    ip_recv_attr_t *ira);
145static void	udp_info_req(queue_t *q, mblk_t *mp);
146static void	udp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
147static int	udp_lrput(queue_t *, mblk_t *);
148static int	udp_lwput(queue_t *, mblk_t *);
149static int	udp_open(queue_t *q, dev_t *devp, int flag, int sflag,
150		    cred_t *credp, boolean_t isv6);
151static int	udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
152		    cred_t *credp);
153static int	udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
154		    cred_t *credp);
155static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
156int		udp_opt_set(conn_t *connp, uint_t optset_context,
157		    int level, int name, uint_t inlen,
158		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
159		    void *thisdg_attrs, cred_t *cr);
160int		udp_opt_get(conn_t *connp, int level, int name,
161		    uchar_t *ptr);
162static int	udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr,
163		    pid_t pid);
164static int	udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr,
165    pid_t pid, ip_xmit_attr_t *ixa);
166static int	udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
167		    sin6_t *sin6, ushort_t ipversion, cred_t *cr, pid_t,
168		    ip_xmit_attr_t *ixa);
169static mblk_t	*udp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
170    const in6_addr_t *, const in6_addr_t *, in_port_t, uint32_t, mblk_t *,
171    int *);
172static mblk_t	*udp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
173    mblk_t *, const in6_addr_t *, in_port_t, uint32_t, int *);
174static void	udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
175static void	udp_ud_err_connected(conn_t *, t_scalar_t);
176static void	udp_tpi_unbind(queue_t *q, mblk_t *mp);
177static in_port_t udp_update_next_port(udp_t *udp, in_port_t port,
178    boolean_t random);
179static void	udp_wput_other(queue_t *q, mblk_t *mp);
180static void	udp_wput_iocdata(queue_t *q, mblk_t *mp);
181static int	udp_wput_fallback(queue_t *q, mblk_t *mp);
182static size_t	udp_set_rcv_hiwat(udp_t *udp, size_t size);
183
184static void	*udp_stack_init(netstackid_t stackid, netstack_t *ns);
185static void	udp_stack_fini(netstackid_t stackid, void *arg);
186
187/* Common routines for TPI and socket module */
188static void	udp_ulp_recv(conn_t *, mblk_t *, uint_t, ip_recv_attr_t *);
189
190/* Common routine for TPI and socket module */
191static conn_t	*udp_do_open(cred_t *, boolean_t, int, int *);
192static void	udp_do_close(conn_t *);
193static int	udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
194    boolean_t);
195static int	udp_do_unbind(conn_t *);
196
197int		udp_getsockname(sock_lower_handle_t,
198    struct sockaddr *, socklen_t *, cred_t *);
199int		udp_getpeername(sock_lower_handle_t,
200    struct sockaddr *, socklen_t *, cred_t *);
201static int	udp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
202    cred_t *, pid_t);
203
204#pragma inline(udp_output_connected, udp_output_newdst, udp_output_lastdst)
205
/*
 * Checks if the given destination addr/port is allowed out.
 * If allowed, registers the (dest_addr/port, node_ID) mapping at Cluster.
 * Called for each connect() and for sendto()/sendmsg() to a different
 * destination.
 * For connect(), called in udp_connect().
 * For sendto()/sendmsg(), called in udp_output_newdst().
 *
 * This macro assumes that the cl_inet_connect2 hook is not NULL.
 * Please check this before calling this macro.
 *
 * void
 * CL_INET_UDP_CONNECT(conn_t *cp, boolean_t is_outgoing,
 *     in6_addr_t *faddrp, in_port_t (or uint16_t) fport, int err);
 *
 * cp		conn whose netstack id, local address (conn_laddr_v4 or
 *		conn_laddr_v6, selected by conn_ipversion) and conn_lport
 *		are handed to the hook
 * is_outgoing	forwarded unchanged to the hook
 * faddrp	pointer to the foreign address; for an IPv4 conn only its
 *		V4_PART_OF_V6() portion is passed
 * fport	foreign port, cast to in_port_t
 * err		lvalue that receives the hook's return value
 *
 * Every argument is parenthesized in the expansion so that arbitrary
 * expressions may be passed.  The expansion is a bare compound statement,
 * so it must not be used as the unbraced body of an "if" that is followed
 * by an "else".
 */
#define	CL_INET_UDP_CONNECT(cp, is_outgoing, faddrp, fport, err) {	\
	(err) = 0;							\
	/*								\
	 * Running in cluster mode - check and register active		\
	 * "connection" information					\
	 */								\
	if ((cp)->conn_ipversion == IPV4_VERSION)			\
		(err) = (*cl_inet_connect2)(				\
		    (cp)->conn_netstack->netstack_stackid,		\
		    IPPROTO_UDP, (is_outgoing), AF_INET,		\
		    (uint8_t *)&((cp)->conn_laddr_v4),			\
		    (cp)->conn_lport,					\
		    (uint8_t *)&(V4_PART_OF_V6(*(faddrp))),		\
		    (in_port_t)(fport), NULL);				\
	else								\
		(err) = (*cl_inet_connect2)(				\
		    (cp)->conn_netstack->netstack_stackid,		\
		    IPPROTO_UDP, (is_outgoing), AF_INET6,		\
		    (uint8_t *)&((cp)->conn_laddr_v6),			\
		    (cp)->conn_lport,					\
		    (uint8_t *)(faddrp), (in_port_t)(fport), NULL);	\
}
243
244static struct module_info udp_mod_info =  {
245	UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER
246};
247
248/*
249 * Entry points for UDP as a device.
250 * We have separate open functions for the /dev/udp and /dev/udp6 devices.
251 */
252static struct qinit udp_rinitv4 = {
253	NULL, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info, NULL
254};
255
256static struct qinit udp_rinitv6 = {
257	NULL, NULL, udp_openv6, udp_tpi_close, NULL, &udp_mod_info, NULL
258};
259
260static struct qinit udp_winit = {
261	udp_wput, ip_wsrv, NULL, NULL, NULL, &udp_mod_info
262};
263
264/* UDP entry point during fallback */
265struct qinit udp_fallback_sock_winit = {
266	udp_wput_fallback, NULL, NULL, NULL, NULL, &udp_mod_info
267};
268
269/*
270 * UDP needs to handle I_LINK and I_PLINK since ifconfig
271 * likes to use it as a place to hang the various streams.
272 */
273static struct qinit udp_lrinit = {
274	udp_lrput, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info
275};
276
277static struct qinit udp_lwinit = {
278	udp_lwput, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info
279};
280
281/* For AF_INET aka /dev/udp */
282struct streamtab udpinfov4 = {
283	&udp_rinitv4, &udp_winit, &udp_lrinit, &udp_lwinit
284};
285
286/* For AF_INET6 aka /dev/udp6 */
287struct streamtab udpinfov6 = {
288	&udp_rinitv6, &udp_winit, &udp_lrinit, &udp_lwinit
289};
290
291#define	UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
292
293/* Default structure copied into T_INFO_ACK messages */
294static struct T_info_ack udp_g_t_info_ack_ipv4 = {
295	T_INFO_ACK,
296	UDP_MAXPACKET_IPV4,	/* TSDU_size. Excl. headers */
297	T_INVALID,	/* ETSU_size.  udp does not support expedited data. */
298	T_INVALID,	/* CDATA_size. udp does not support connect data. */
299	T_INVALID,	/* DDATA_size. udp does not support disconnect data. */
300	sizeof (sin_t),	/* ADDR_size. */
301	0,		/* OPT_size - not initialized here */
302	UDP_MAXPACKET_IPV4,	/* TIDU_size.  Excl. headers */
303	T_CLTS,		/* SERV_type.  udp supports connection-less. */
304	TS_UNBND,	/* CURRENT_state.  This is set from udp_state. */
305	(XPG4_1|SENDZERO) /* PROVIDER_flag */
306};
307
308#define	UDP_MAXPACKET_IPV6 (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
309
310static	struct T_info_ack udp_g_t_info_ack_ipv6 = {
311	T_INFO_ACK,
312	UDP_MAXPACKET_IPV6,	/* TSDU_size.  Excl. headers */
313	T_INVALID,	/* ETSU_size.  udp does not support expedited data. */
314	T_INVALID,	/* CDATA_size. udp does not support connect data. */
315	T_INVALID,	/* DDATA_size. udp does not support disconnect data. */
316	sizeof (sin6_t), /* ADDR_size. */
317	0,		/* OPT_size - not initialized here */
318	UDP_MAXPACKET_IPV6,	/* TIDU_size. Excl. headers */
319	T_CLTS,		/* SERV_type.  udp supports connection-less. */
320	TS_UNBND,	/* CURRENT_state.  This is set from udp_state. */
321	(XPG4_1|SENDZERO) /* PROVIDER_flag */
322};
323
324/*
325 * UDP tunables related declarations. Definitions are in udp_tunables.c
326 */
327extern mod_prop_info_t udp_propinfo_tbl[];
328extern int udp_propinfo_count;
329
/* Settable in /etc/system */
331/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
332uint32_t udp_random_anon_port = 1;
333
334/*
335 * Hook functions to enable cluster networking.
336 * On non-clustered systems these vectors must always be NULL
337 */
338
339void (*cl_inet_bind)(netstackid_t stack_id, uchar_t protocol,
340    sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
341    void *args) = NULL;
342void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
343    sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
344    void *args) = NULL;
345
346typedef union T_primitives *t_primp_t;
347
348/*
349 * Return the next anonymous port in the privileged port range for
350 * bind checking.
351 *
352 * Trusted Extension (TX) notes: TX allows administrator to mark or
353 * reserve ports as Multilevel ports (MLP). MLP has special function
354 * on TX systems. Once a port is made MLP, it's not available as
355 * ordinary port. This creates "holes" in the port name space. It
356 * may be necessary to skip the "holes" find a suitable anon port.
357 */
358static in_port_t
359udp_get_next_priv_port(udp_t *udp)
360{
361	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
362	in_port_t nextport;
363	boolean_t restart = B_FALSE;
364	udp_stack_t *us = udp->udp_us;
365
366retry:
367	if (next_priv_port < us->us_min_anonpriv_port ||
368	    next_priv_port >= IPPORT_RESERVED) {
369		next_priv_port = IPPORT_RESERVED - 1;
370		if (restart)
371			return (0);
372		restart = B_TRUE;
373	}
374
375	if (is_system_labeled() &&
376	    (nextport = tsol_next_port(crgetzone(udp->udp_connp->conn_cred),
377	    next_priv_port, IPPROTO_UDP, B_FALSE)) != 0) {
378		next_priv_port = nextport;
379		goto retry;
380	}
381
382	return (next_priv_port--);
383}
384
385/*
386 * Hash list removal routine for udp_t structures.
387 */
388static void
389udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock)
390{
391	udp_t		*udpnext;
392	kmutex_t	*lockp;
393	udp_stack_t	*us = udp->udp_us;
394	conn_t		*connp = udp->udp_connp;
395
396	if (udp->udp_ptpbhn == NULL)
397		return;
398
399	/*
400	 * Extract the lock pointer in case there are concurrent
401	 * hash_remove's for this instance.
402	 */
403	ASSERT(connp->conn_lport != 0);
404	if (!caller_holds_lock) {
405		lockp = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
406		    us->us_bind_fanout_size)].uf_lock;
407		ASSERT(lockp != NULL);
408		mutex_enter(lockp);
409	}
410	if (udp->udp_ptpbhn != NULL) {
411		udpnext = udp->udp_bind_hash;
412		if (udpnext != NULL) {
413			udpnext->udp_ptpbhn = udp->udp_ptpbhn;
414			udp->udp_bind_hash = NULL;
415		}
416		*udp->udp_ptpbhn = udpnext;
417		udp->udp_ptpbhn = NULL;
418	}
419	if (!caller_holds_lock) {
420		mutex_exit(lockp);
421	}
422}
423
424static void
425udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp)
426{
427	conn_t	*connp = udp->udp_connp;
428	udp_t	**udpp;
429	udp_t	*udpnext;
430	conn_t	*connext;
431
432	ASSERT(MUTEX_HELD(&uf->uf_lock));
433	ASSERT(udp->udp_ptpbhn == NULL);
434	udpp = &uf->uf_udp;
435	udpnext = udpp[0];
436	if (udpnext != NULL) {
437		/*
438		 * If the new udp bound to the INADDR_ANY address
439		 * and the first one in the list is not bound to
440		 * INADDR_ANY we skip all entries until we find the
441		 * first one bound to INADDR_ANY.
442		 * This makes sure that applications binding to a
443		 * specific address get preference over those binding to
444		 * INADDR_ANY.
445		 */
446		connext = udpnext->udp_connp;
447		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
448		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
449			while ((udpnext = udpp[0]) != NULL &&
450			    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
451				udpp = &(udpnext->udp_bind_hash);
452			}
453			if (udpnext != NULL)
454				udpnext->udp_ptpbhn = &udp->udp_bind_hash;
455		} else {
456			udpnext->udp_ptpbhn = &udp->udp_bind_hash;
457		}
458	}
459	udp->udp_bind_hash = udpnext;
460	udp->udp_ptpbhn = udpp;
461	udpp[0] = udp;
462}
463
464/*
465 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
466 * passed to udp_wput.
467 * It associates a port number and local address with the stream.
468 * It calls IP to verify the local IP address, and calls IP to insert
469 * the conn_t in the fanout table.
470 * If everything is ok it then sends the T_BIND_ACK back up.
471 *
472 * Note that UDP over IPv4 and IPv6 sockets can use the same port number
473 * without setting SO_REUSEADDR. This is needed so that they
474 * can be viewed as two independent transport protocols.
475 * However, anonymouns ports are allocated from the same range to avoid
476 * duplicating the us->us_next_port_to_try.
477 */
478static void
479udp_tpi_bind(queue_t *q, mblk_t *mp)
480{
481	sin_t		*sin;
482	sin6_t		*sin6;
483	mblk_t		*mp1;
484	struct T_bind_req *tbr;
485	conn_t		*connp;
486	udp_t		*udp;
487	int		error;
488	struct sockaddr	*sa;
489	cred_t		*cr;
490
491	/*
492	 * All Solaris components should pass a db_credp
493	 * for this TPI message, hence we ASSERT.
494	 * But in case there is some other M_PROTO that looks
495	 * like a TPI message sent by some other kernel
496	 * component, we check and return an error.
497	 */
498	cr = msg_getcred(mp, NULL);
499	ASSERT(cr != NULL);
500	if (cr == NULL) {
501		udp_err_ack(q, mp, TSYSERR, EINVAL);
502		return;
503	}
504
505	connp = Q_TO_CONN(q);
506	udp = connp->conn_udp;
507	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
508		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
509		    "udp_bind: bad req, len %u",
510		    (uint_t)(mp->b_wptr - mp->b_rptr));
511		udp_err_ack(q, mp, TPROTO, 0);
512		return;
513	}
514	if (udp->udp_state != TS_UNBND) {
515		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
516		    "udp_bind: bad state, %u", udp->udp_state);
517		udp_err_ack(q, mp, TOUTSTATE, 0);
518		return;
519	}
520	/*
521	 * Reallocate the message to make sure we have enough room for an
522	 * address.
523	 */
524	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
525	if (mp1 == NULL) {
526		udp_err_ack(q, mp, TSYSERR, ENOMEM);
527		return;
528	}
529
530	mp = mp1;
531
532	/* Reset the message type in preparation for shipping it back. */
533	DB_TYPE(mp) = M_PCPROTO;
534
535	tbr = (struct T_bind_req *)mp->b_rptr;
536	switch (tbr->ADDR_length) {
537	case 0:			/* Request for a generic port */
538		tbr->ADDR_offset = sizeof (struct T_bind_req);
539		if (connp->conn_family == AF_INET) {
540			tbr->ADDR_length = sizeof (sin_t);
541			sin = (sin_t *)&tbr[1];
542			*sin = sin_null;
543			sin->sin_family = AF_INET;
544			mp->b_wptr = (uchar_t *)&sin[1];
545			sa = (struct sockaddr *)sin;
546		} else {
547			ASSERT(connp->conn_family == AF_INET6);
548			tbr->ADDR_length = sizeof (sin6_t);
549			sin6 = (sin6_t *)&tbr[1];
550			*sin6 = sin6_null;
551			sin6->sin6_family = AF_INET6;
552			mp->b_wptr = (uchar_t *)&sin6[1];
553			sa = (struct sockaddr *)sin6;
554		}
555		break;
556
557	case sizeof (sin_t):	/* Complete IPv4 address */
558		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
559		    sizeof (sin_t));
560		if (sa == NULL || !OK_32PTR((char *)sa)) {
561			udp_err_ack(q, mp, TSYSERR, EINVAL);
562			return;
563		}
564		if (connp->conn_family != AF_INET ||
565		    sa->sa_family != AF_INET) {
566			udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
567			return;
568		}
569		break;
570
571	case sizeof (sin6_t):	/* complete IPv6 address */
572		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
573		    sizeof (sin6_t));
574		if (sa == NULL || !OK_32PTR((char *)sa)) {
575			udp_err_ack(q, mp, TSYSERR, EINVAL);
576			return;
577		}
578		if (connp->conn_family != AF_INET6 ||
579		    sa->sa_family != AF_INET6) {
580			udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
581			return;
582		}
583		break;
584
585	default:		/* Invalid request */
586		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
587		    "udp_bind: bad ADDR_length length %u", tbr->ADDR_length);
588		udp_err_ack(q, mp, TBADADDR, 0);
589		return;
590	}
591
592	error = udp_do_bind(connp, sa, tbr->ADDR_length, cr,
593	    tbr->PRIM_type != O_T_BIND_REQ);
594
595	if (error != 0) {
596		if (error > 0) {
597			udp_err_ack(q, mp, TSYSERR, error);
598		} else {
599			udp_err_ack(q, mp, -error, 0);
600		}
601	} else {
602		tbr->PRIM_type = T_BIND_ACK;
603		qreply(q, mp);
604	}
605}
606
607/*
608 * This routine handles each T_CONN_REQ message passed to udp.  It
609 * associates a default destination address with the stream.
610 *
611 * After various error checks are completed, udp_connect() lays
612 * the target address and port into the composite header template.
613 * Then we ask IP for information, including a source address if we didn't
614 * already have one. Finally we send up the T_OK_ACK reply message.
615 */
616static void
617udp_tpi_connect(queue_t *q, mblk_t *mp)
618{
619	conn_t	*connp = Q_TO_CONN(q);
620	int	error;
621	socklen_t	len;
622	struct sockaddr		*sa;
623	struct T_conn_req	*tcr;
624	cred_t		*cr;
625	pid_t		pid;
626	/*
627	 * All Solaris components should pass a db_credp
628	 * for this TPI message, hence we ASSERT.
629	 * But in case there is some other M_PROTO that looks
630	 * like a TPI message sent by some other kernel
631	 * component, we check and return an error.
632	 */
633	cr = msg_getcred(mp, &pid);
634	ASSERT(cr != NULL);
635	if (cr == NULL) {
636		udp_err_ack(q, mp, TSYSERR, EINVAL);
637		return;
638	}
639
640	tcr = (struct T_conn_req *)mp->b_rptr;
641
642	/* A bit of sanity checking */
643	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
644		udp_err_ack(q, mp, TPROTO, 0);
645		return;
646	}
647
648	if (tcr->OPT_length != 0) {
649		udp_err_ack(q, mp, TBADOPT, 0);
650		return;
651	}
652
653	/*
654	 * Determine packet type based on type of address passed in
655	 * the request should contain an IPv4 or IPv6 address.
656	 * Make sure that address family matches the type of
657	 * family of the address passed down.
658	 */
659	len = tcr->DEST_length;
660	switch (tcr->DEST_length) {
661	default:
662		udp_err_ack(q, mp, TBADADDR, 0);
663		return;
664
665	case sizeof (sin_t):
666		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
667		    sizeof (sin_t));
668		break;
669
670	case sizeof (sin6_t):
671		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
672		    sizeof (sin6_t));
673		break;
674	}
675
676	error = proto_verify_ip_addr(connp->conn_family, sa, len);
677	if (error != 0) {
678		udp_err_ack(q, mp, TSYSERR, error);
679		return;
680	}
681
682	error = udp_do_connect(connp, sa, len, cr, pid);
683	if (error != 0) {
684		if (error < 0)
685			udp_err_ack(q, mp, -error, 0);
686		else
687			udp_err_ack(q, mp, TSYSERR, error);
688	} else {
689		mblk_t	*mp1;
690		/*
691		 * We have to send a connection confirmation to
692		 * keep TLI happy.
693		 */
694		if (connp->conn_family == AF_INET) {
695			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
696			    sizeof (sin_t), NULL, 0);
697		} else {
698			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
699			    sizeof (sin6_t), NULL, 0);
700		}
701		if (mp1 == NULL) {
702			udp_err_ack(q, mp, TSYSERR, ENOMEM);
703			return;
704		}
705
706		/*
707		 * Send ok_ack for T_CONN_REQ
708		 */
709		mp = mi_tpi_ok_ack_alloc(mp);
710		if (mp == NULL) {
711			/* Unable to reuse the T_CONN_REQ for the ack. */
712			udp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
713			return;
714		}
715
716		putnext(connp->conn_rq, mp);
717		putnext(connp->conn_rq, mp1);
718	}
719}
720
721/* ARGSUSED */
722static int
723udp_tpi_close(queue_t *q, int flags, cred_t *credp __unused)
724{
725	conn_t	*connp;
726
727	if (flags & SO_FALLBACK) {
728		/*
729		 * stream is being closed while in fallback
730		 * simply free the resources that were allocated
731		 */
732		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
733		qprocsoff(q);
734		goto done;
735	}
736
737	connp = Q_TO_CONN(q);
738	udp_do_close(connp);
739done:
740	q->q_ptr = WR(q)->q_ptr = NULL;
741	return (0);
742}
743
744static void
745udp_close_free(conn_t *connp)
746{
747	udp_t *udp = connp->conn_udp;
748
749	/* If there are any options associated with the stream, free them. */
750	if (udp->udp_recv_ipp.ipp_fields != 0)
751		ip_pkt_free(&udp->udp_recv_ipp);
752
753	/*
754	 * Clear any fields which the kmem_cache constructor clears.
755	 * Only udp_connp needs to be preserved.
756	 * TBD: We should make this more efficient to avoid clearing
757	 * everything.
758	 */
759	ASSERT(udp->udp_connp == connp);
760	bzero(udp, sizeof (udp_t));
761	udp->udp_connp = connp;
762}
763
764static int
765udp_do_disconnect(conn_t *connp)
766{
767	udp_t	*udp;
768	udp_fanout_t *udpf;
769	udp_stack_t *us;
770	int	error;
771
772	udp = connp->conn_udp;
773	us = udp->udp_us;
774	mutex_enter(&connp->conn_lock);
775	if (udp->udp_state != TS_DATA_XFER) {
776		mutex_exit(&connp->conn_lock);
777		return (-TOUTSTATE);
778	}
779	udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
780	    us->us_bind_fanout_size)];
781	mutex_enter(&udpf->uf_lock);
782	if (connp->conn_mcbc_bind)
783		connp->conn_saddr_v6 = ipv6_all_zeros;
784	else
785		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
786	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
787	connp->conn_faddr_v6 = ipv6_all_zeros;
788	connp->conn_fport = 0;
789	udp->udp_state = TS_IDLE;
790	mutex_exit(&udpf->uf_lock);
791
792	/* Remove any remnants of mapped address binding */
793	if (connp->conn_family == AF_INET6)
794		connp->conn_ipversion = IPV6_VERSION;
795
796	connp->conn_v6lastdst = ipv6_all_zeros;
797	error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
798	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
799	mutex_exit(&connp->conn_lock);
800	if (error != 0)
801		return (error);
802
803	/*
804	 * Tell IP to remove the full binding and revert
805	 * to the local address binding.
806	 */
807	return (ip_laddr_fanout_insert(connp));
808}
809
810static void
811udp_tpi_disconnect(queue_t *q, mblk_t *mp)
812{
813	conn_t	*connp = Q_TO_CONN(q);
814	int	error;
815
816	/*
817	 * Allocate the largest primitive we need to send back
818	 * T_error_ack is > than T_ok_ack
819	 */
820	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
821	if (mp == NULL) {
822		/* Unable to reuse the T_DISCON_REQ for the ack. */
823		udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
824		return;
825	}
826
827	error = udp_do_disconnect(connp);
828
829	if (error != 0) {
830		if (error < 0) {
831			udp_err_ack(q, mp, -error, 0);
832		} else {
833			udp_err_ack(q, mp, TSYSERR, error);
834		}
835	} else {
836		mp = mi_tpi_ok_ack_alloc(mp);
837		ASSERT(mp != NULL);
838		qreply(q, mp);
839	}
840}
841
842int
843udp_disconnect(conn_t *connp)
844{
845	int error;
846
847	connp->conn_dgram_errind = B_FALSE;
848	error = udp_do_disconnect(connp);
849	if (error < 0)
850		error = proto_tlitosyserr(-error);
851
852	return (error);
853}
854
855/* This routine creates a T_ERROR_ACK message and passes it upstream. */
856static void
857udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
858{
859	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
860		qreply(q, mp);
861}
862
863/* Shorthand to generate and send TPI error acks to our client */
864static void
865udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
866    t_scalar_t t_error, int sys_error)
867{
868	struct T_error_ack	*teackp;
869
870	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
871	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
872		teackp = (struct T_error_ack *)mp->b_rptr;
873		teackp->ERROR_prim = primitive;
874		teackp->TLI_error = t_error;
875		teackp->UNIX_error = sys_error;
876		qreply(q, mp);
877	}
878}
879
880/* At minimum we need 4 bytes of UDP header */
881#define	ICMP_MIN_UDP_HDR	4
882
883/*
884 * udp_icmp_input is called as conn_recvicmp to process ICMP messages.
885 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
886 * Assumes that IP has pulled up everything up to and including the ICMP header.
887 */
888/* ARGSUSED2 */
889static void
890udp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
891{
892	conn_t		*connp = (conn_t *)arg1;
893	icmph_t		*icmph;
894	ipha_t		*ipha;
895	int		iph_hdr_length;
896	udpha_t		*udpha;
897	sin_t		sin;
898	sin6_t		sin6;
899	mblk_t		*mp1;
900	int		error = 0;
901	udp_t		*udp = connp->conn_udp;
902
903	ipha = (ipha_t *)mp->b_rptr;
904
905	ASSERT(OK_32PTR(mp->b_rptr));
906
907	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
908		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
909		udp_icmp_error_ipv6(connp, mp, ira);
910		return;
911	}
912	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
913
914	/* Skip past the outer IP and ICMP headers */
915	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
916	iph_hdr_length = ira->ira_ip_hdr_length;
917	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
918	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
919
920	/* Skip past the inner IP and find the ULP header */
921	iph_hdr_length = IPH_HDR_LENGTH(ipha);
922	udpha = (udpha_t *)((char *)ipha + iph_hdr_length);
923
924	switch (icmph->icmph_type) {
925	case ICMP_DEST_UNREACHABLE:
926		switch (icmph->icmph_code) {
927		case ICMP_FRAGMENTATION_NEEDED: {
928			ipha_t		*ipha;
929			ip_xmit_attr_t	*ixa;
930			/*
931			 * IP has already adjusted the path MTU.
932			 * But we need to adjust DF for IPv4.
933			 */
934			if (connp->conn_ipversion != IPV4_VERSION)
935				break;
936
937			ixa = conn_get_ixa(connp, B_FALSE);
938			if (ixa == NULL || ixa->ixa_ire == NULL) {
939				/*
940				 * Some other thread holds conn_ixa. We will
941				 * redo this on the next ICMP too big.
942				 */
943				if (ixa != NULL)
944					ixa_refrele(ixa);
945				break;
946			}
947			(void) ip_get_pmtu(ixa);
948
949			mutex_enter(&connp->conn_lock);
950			ipha = (ipha_t *)connp->conn_ht_iphc;
951			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
952				ipha->ipha_fragment_offset_and_flags |=
953				    IPH_DF_HTONS;
954			} else {
955				ipha->ipha_fragment_offset_and_flags &=
956				    ~IPH_DF_HTONS;
957			}
958			mutex_exit(&connp->conn_lock);
959			ixa_refrele(ixa);
960			break;
961		}
962		case ICMP_PORT_UNREACHABLE:
963		case ICMP_PROTOCOL_UNREACHABLE:
964			error = ECONNREFUSED;
965			break;
966		default:
967			/* Transient errors */
968			break;
969		}
970		break;
971	default:
972		/* Transient errors */
973		break;
974	}
975	if (error == 0) {
976		freemsg(mp);
977		return;
978	}
979
980	/*
981	 * Deliver T_UDERROR_IND when the application has asked for it.
982	 * The socket layer enables this automatically when connected.
983	 */
984	if (!connp->conn_dgram_errind) {
985		freemsg(mp);
986		return;
987	}
988
989	switch (connp->conn_family) {
990	case AF_INET:
991		sin = sin_null;
992		sin.sin_family = AF_INET;
993		sin.sin_addr.s_addr = ipha->ipha_dst;
994		sin.sin_port = udpha->uha_dst_port;
995		if (IPCL_IS_NONSTR(connp)) {
996			mutex_enter(&connp->conn_lock);
997			if (udp->udp_state == TS_DATA_XFER) {
998				if (sin.sin_port == connp->conn_fport &&
999				    sin.sin_addr.s_addr ==
1000				    connp->conn_faddr_v4) {
1001					mutex_exit(&connp->conn_lock);
1002					(*connp->conn_upcalls->su_set_error)
1003					    (connp->conn_upper_handle, error);
1004					goto done;
1005				}
1006			} else {
1007				udp->udp_delayed_error = error;
1008				*((sin_t *)&udp->udp_delayed_addr) = sin;
1009			}
1010			mutex_exit(&connp->conn_lock);
1011		} else {
1012			mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t),
1013			    NULL, 0, error);
1014			if (mp1 != NULL)
1015				putnext(connp->conn_rq, mp1);
1016		}
1017		break;
1018	case AF_INET6:
1019		sin6 = sin6_null;
1020		sin6.sin6_family = AF_INET6;
1021		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr);
1022		sin6.sin6_port = udpha->uha_dst_port;
1023		if (IPCL_IS_NONSTR(connp)) {
1024			mutex_enter(&connp->conn_lock);
1025			if (udp->udp_state == TS_DATA_XFER) {
1026				if (sin6.sin6_port == connp->conn_fport &&
1027				    IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1028				    &connp->conn_faddr_v6)) {
1029					mutex_exit(&connp->conn_lock);
1030					(*connp->conn_upcalls->su_set_error)
1031					    (connp->conn_upper_handle, error);
1032					goto done;
1033				}
1034			} else {
1035				udp->udp_delayed_error = error;
1036				*((sin6_t *)&udp->udp_delayed_addr) = sin6;
1037			}
1038			mutex_exit(&connp->conn_lock);
1039		} else {
1040			mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1041			    NULL, 0, error);
1042			if (mp1 != NULL)
1043				putnext(connp->conn_rq, mp1);
1044		}
1045		break;
1046	}
1047done:
1048	freemsg(mp);
1049}
1050
1051/*
1052 * udp_icmp_error_ipv6 is called by udp_icmp_error to process ICMP for IPv6.
1053 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1054 * Assumes that IP has pulled up all the extension headers as well as the
1055 * ICMPv6 header.
1056 */
1057static void
1058udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1059{
1060	icmp6_t		*icmp6;
1061	ip6_t		*ip6h, *outer_ip6h;
1062	uint16_t	iph_hdr_length;
1063	uint8_t		*nexthdrp;
1064	udpha_t		*udpha;
1065	sin6_t		sin6;
1066	mblk_t		*mp1;
1067	int		error = 0;
1068	udp_t		*udp = connp->conn_udp;
1069	udp_stack_t	*us = udp->udp_us;
1070
1071	outer_ip6h = (ip6_t *)mp->b_rptr;
1072#ifdef DEBUG
1073	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1074		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1075	else
1076		iph_hdr_length = IPV6_HDR_LEN;
1077	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1078#endif
1079	/* Skip past the outer IP and ICMP headers */
1080	iph_hdr_length = ira->ira_ip_hdr_length;
1081	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1082
1083	/* Skip past the inner IP and find the ULP header */
1084	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
1085	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1086		freemsg(mp);
1087		return;
1088	}
1089	udpha = (udpha_t *)((char *)ip6h + iph_hdr_length);
1090
1091	switch (icmp6->icmp6_type) {
1092	case ICMP6_DST_UNREACH:
1093		switch (icmp6->icmp6_code) {
1094		case ICMP6_DST_UNREACH_NOPORT:
1095			error = ECONNREFUSED;
1096			break;
1097		case ICMP6_DST_UNREACH_ADMIN:
1098		case ICMP6_DST_UNREACH_NOROUTE:
1099		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1100		case ICMP6_DST_UNREACH_ADDR:
1101			/* Transient errors */
1102			break;
1103		default:
1104			break;
1105		}
1106		break;
1107	case ICMP6_PACKET_TOO_BIG: {
1108		struct T_unitdata_ind	*tudi;
1109		struct T_opthdr		*toh;
1110		size_t			udi_size;
1111		mblk_t			*newmp;
1112		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1113		    sizeof (struct ip6_mtuinfo);
1114		sin6_t			*sin6;
1115		struct ip6_mtuinfo	*mtuinfo;
1116
1117		/*
1118		 * If the application has requested to receive path mtu
1119		 * information, send up an empty message containing an
1120		 * IPV6_PATHMTU ancillary data item.
1121		 */
1122		if (!connp->conn_ipv6_recvpathmtu)
1123			break;
1124
1125		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1126		    opt_length;
1127		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1128			UDPS_BUMP_MIB(us, udpInErrors);
1129			break;
1130		}
1131
1132		/*
1133		 * newmp->b_cont is left to NULL on purpose.  This is an
1134		 * empty message containing only ancillary data.
1135		 */
1136		newmp->b_datap->db_type = M_PROTO;
1137		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1138		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1139		tudi->PRIM_type = T_UNITDATA_IND;
1140		tudi->SRC_length = sizeof (sin6_t);
1141		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1142		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1143		tudi->OPT_length = opt_length;
1144
1145		sin6 = (sin6_t *)&tudi[1];
1146		bzero(sin6, sizeof (sin6_t));
1147		sin6->sin6_family = AF_INET6;
1148		sin6->sin6_addr = connp->conn_faddr_v6;
1149
1150		toh = (struct T_opthdr *)&sin6[1];
1151		toh->level = IPPROTO_IPV6;
1152		toh->name = IPV6_PATHMTU;
1153		toh->len = opt_length;
1154		toh->status = 0;
1155
1156		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1157		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1158		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1159		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1160		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1161		/*
1162		 * We've consumed everything we need from the original
1163		 * message.  Free it, then send our empty message.
1164		 */
1165		freemsg(mp);
1166		udp_ulp_recv(connp, newmp, msgdsize(newmp), ira);
1167		return;
1168	}
1169	case ICMP6_TIME_EXCEEDED:
1170		/* Transient errors */
1171		break;
1172	case ICMP6_PARAM_PROB:
1173		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1174		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1175		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1176		    (uchar_t *)nexthdrp) {
1177			error = ECONNREFUSED;
1178			break;
1179		}
1180		break;
1181	}
1182	if (error == 0) {
1183		freemsg(mp);
1184		return;
1185	}
1186
1187	/*
1188	 * Deliver T_UDERROR_IND when the application has asked for it.
1189	 * The socket layer enables this automatically when connected.
1190	 */
1191	if (!connp->conn_dgram_errind) {
1192		freemsg(mp);
1193		return;
1194	}
1195
1196	sin6 = sin6_null;
1197	sin6.sin6_family = AF_INET6;
1198	sin6.sin6_addr = ip6h->ip6_dst;
1199	sin6.sin6_port = udpha->uha_dst_port;
1200	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1201
1202	if (IPCL_IS_NONSTR(connp)) {
1203		mutex_enter(&connp->conn_lock);
1204		if (udp->udp_state == TS_DATA_XFER) {
1205			if (sin6.sin6_port == connp->conn_fport &&
1206			    IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1207			    &connp->conn_faddr_v6)) {
1208				mutex_exit(&connp->conn_lock);
1209				(*connp->conn_upcalls->su_set_error)
1210				    (connp->conn_upper_handle, error);
1211				goto done;
1212			}
1213		} else {
1214			udp->udp_delayed_error = error;
1215			*((sin6_t *)&udp->udp_delayed_addr) = sin6;
1216		}
1217		mutex_exit(&connp->conn_lock);
1218	} else {
1219		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1220		    NULL, 0, error);
1221		if (mp1 != NULL)
1222			putnext(connp->conn_rq, mp1);
1223	}
1224done:
1225	freemsg(mp);
1226}
1227
1228/*
1229 * This routine responds to T_ADDR_REQ messages.  It is called by udp_wput.
1230 * The local address is filled in if endpoint is bound. The remote address
1231 * is filled in if remote address has been precified ("connected endpoint")
1232 * (The concept of connected CLTS sockets is alien to published TPI
1233 *  but we support it anyway).
1234 */
1235static void
1236udp_addr_req(queue_t *q, mblk_t *mp)
1237{
1238	struct sockaddr *sa;
1239	mblk_t	*ackmp;
1240	struct T_addr_ack *taa;
1241	udp_t	*udp = Q_TO_UDP(q);
1242	conn_t	*connp = udp->udp_connp;
1243	uint_t	addrlen;
1244
1245	/* Make it large enough for worst case */
1246	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1247	    2 * sizeof (sin6_t), 1);
1248	if (ackmp == NULL) {
1249		udp_err_ack(q, mp, TSYSERR, ENOMEM);
1250		return;
1251	}
1252	taa = (struct T_addr_ack *)ackmp->b_rptr;
1253
1254	bzero(taa, sizeof (struct T_addr_ack));
1255	ackmp->b_wptr = (uchar_t *)&taa[1];
1256
1257	taa->PRIM_type = T_ADDR_ACK;
1258	ackmp->b_datap->db_type = M_PCPROTO;
1259
1260	if (connp->conn_family == AF_INET)
1261		addrlen = sizeof (sin_t);
1262	else
1263		addrlen = sizeof (sin6_t);
1264
1265	mutex_enter(&connp->conn_lock);
1266	/*
1267	 * Note: Following code assumes 32 bit alignment of basic
1268	 * data structures like sin_t and struct T_addr_ack.
1269	 */
1270	if (udp->udp_state != TS_UNBND) {
1271		/*
1272		 * Fill in local address first
1273		 */
1274		taa->LOCADDR_offset = sizeof (*taa);
1275		taa->LOCADDR_length = addrlen;
1276		sa = (struct sockaddr *)&taa[1];
1277		(void) conn_getsockname(connp, sa, &addrlen);
1278		ackmp->b_wptr += addrlen;
1279	}
1280	if (udp->udp_state == TS_DATA_XFER) {
1281		/*
1282		 * connected, fill remote address too
1283		 */
1284		taa->REMADDR_length = addrlen;
1285		/* assumed 32-bit alignment */
1286		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1287		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1288		(void) conn_getpeername(connp, sa, &addrlen);
1289		ackmp->b_wptr += addrlen;
1290	}
1291	mutex_exit(&connp->conn_lock);
1292	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1293	qreply(q, ackmp);
1294}
1295
1296static void
1297udp_copy_info(struct T_info_ack *tap, udp_t *udp)
1298{
1299	conn_t		*connp = udp->udp_connp;
1300
1301	if (connp->conn_family == AF_INET) {
1302		*tap = udp_g_t_info_ack_ipv4;
1303	} else {
1304		*tap = udp_g_t_info_ack_ipv6;
1305	}
1306	tap->CURRENT_state = udp->udp_state;
1307	tap->OPT_size = udp_max_optsize;
1308}
1309
1310static void
1311udp_do_capability_ack(udp_t *udp, struct T_capability_ack *tcap,
1312    t_uscalar_t cap_bits1)
1313{
1314	tcap->CAP_bits1 = 0;
1315
1316	if (cap_bits1 & TC1_INFO) {
1317		udp_copy_info(&tcap->INFO_ack, udp);
1318		tcap->CAP_bits1 |= TC1_INFO;
1319	}
1320}
1321
1322/*
1323 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1324 * udp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1325 * udp_g_t_info_ack.  The current state of the stream is copied from
1326 * udp_state.
1327 */
1328static void
1329udp_capability_req(queue_t *q, mblk_t *mp)
1330{
1331	t_uscalar_t		cap_bits1;
1332	struct T_capability_ack	*tcap;
1333	udp_t	*udp = Q_TO_UDP(q);
1334
1335	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1336
1337	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1338	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1339	if (!mp)
1340		return;
1341
1342	tcap = (struct T_capability_ack *)mp->b_rptr;
1343	udp_do_capability_ack(udp, tcap, cap_bits1);
1344
1345	qreply(q, mp);
1346}
1347
1348/*
1349 * This routine responds to T_INFO_REQ messages.  It is called by udp_wput.
1350 * Most of the T_INFO_ACK information is copied from udp_g_t_info_ack.
1351 * The current state of the stream is copied from udp_state.
1352 */
1353static void
1354udp_info_req(queue_t *q, mblk_t *mp)
1355{
1356	udp_t *udp = Q_TO_UDP(q);
1357
1358	/* Create a T_INFO_ACK message. */
1359	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1360	    T_INFO_ACK);
1361	if (!mp)
1362		return;
1363	udp_copy_info((struct T_info_ack *)mp->b_rptr, udp);
1364	qreply(q, mp);
1365}
1366
1367/* For /dev/udp aka AF_INET open */
1368static int
1369udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1370{
1371	return (udp_open(q, devp, flag, sflag, credp, B_FALSE));
1372}
1373
1374/* For /dev/udp6 aka AF_INET6 open */
1375static int
1376udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1377{
1378	return (udp_open(q, devp, flag, sflag, credp, B_TRUE));
1379}
1380
/*
 * This is the open routine for udp, shared by udp_openv4() and
 * udp_openv6() (isv6 selects the address family).
 *
 * A minor number is allocated for the stream, a conn_t/udp_t pair is
 * created through udp_do_open() and attached to both queues, the queue
 * watermarks are taken from the conn, and finally CONN_INCIPIENT is
 * cleared.
 *
 * Returns 0 on success (including a re-open of an already open stream),
 * EINVAL for a module open, EBUSY when no minor number is available, or
 * the error reported by udp_do_open().
 */
static int
udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
    boolean_t isv6)
{
	udp_t		*udp;
	conn_t		*connp;
	dev_t		conn_dev;
	vmem_t		*minor_arena;
	int		err;

	/* If the stream is already open, return immediately. */
	if (q->q_ptr != NULL)
		return (0);

	/* UDP cannot be pushed as a module. */
	if (sflag == MODOPEN)
		return (EINVAL);

	/* Socket opens try the large minor arena first, when it exists. */
	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
	    ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
		minor_arena = ip_minor_arena_la;
	} else {
		/*
		 * Either minor numbers in the large arena were exhausted
		 * or a non socket application is doing the open.
		 * Try to allocate from the small arena.
		 */
		if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0)
			return (EBUSY);

		minor_arena = ip_minor_arena_sa;
	}

	if (flag & SO_FALLBACK) {
		/*
		 * Non streams socket needs a stream to fallback to
		 *
		 * No conn_t is created here: the minor number and its
		 * arena are stashed in the queue pointers and the write
		 * side is switched to udp_fallback_sock_winit.
		 */
		RD(q)->q_ptr = (void *)conn_dev;
		WR(q)->q_qinfo = &udp_fallback_sock_winit;
		WR(q)->q_ptr = (void *)minor_arena;
		qprocson(q);
		return (0);
	}

	connp = udp_do_open(credp, isv6, KM_SLEEP, &err);
	if (connp == NULL) {
		/* Give the minor number back before failing the open. */
		inet_minor_free(minor_arena, conn_dev);
		return (err);
	}
	udp = connp->conn_udp;

	/* Report the allocated minor number back through *devp. */
	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
	connp->conn_dev = conn_dev;
	connp->conn_minor_arena = minor_arena;

	/*
	 * Initialize the udp_t structure for this stream.
	 */
	q->q_ptr = connp;
	WR(q)->q_ptr = connp;
	connp->conn_rq = q;
	connp->conn_wq = WR(q);

	/*
	 * Since this conn_t/udp_t is not yet visible to anybody else we don't
	 * need to lock anything.
	 */
	ASSERT(connp->conn_proto == IPPROTO_UDP);
	ASSERT(connp->conn_udp == udp);
	ASSERT(udp->udp_connp == connp);

	if (flag & SO_SOCKSTR) {
		udp->udp_issocket = B_TRUE;
	}

	WR(q)->q_hiwat = connp->conn_sndbuf;
	WR(q)->q_lowat = connp->conn_sndlowat;

	qprocson(q);

	/* Set the Stream head write offset and high watermark. */
	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
	(void) proto_set_rx_hiwat(q, connp,
	    udp_set_rcv_hiwat(udp, connp->conn_rcvbuf));

	/* Setup is complete; the conn is no longer incipient. */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags &= ~CONN_INCIPIENT;
	mutex_exit(&connp->conn_lock);
	return (0);
}
1474
1475/*
1476 * Which UDP options OK to set through T_UNITDATA_REQ...
1477 */
1478/* ARGSUSED */
1479static boolean_t
1480udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1481{
1482	return (B_TRUE);
1483}
1484
1485/*
1486 * This routine gets default values of certain options whose default
1487 * values are maintained by protcol specific code
1488 */
1489int
1490udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1491{
1492	udp_t		*udp = Q_TO_UDP(q);
1493	udp_stack_t *us = udp->udp_us;
1494	int *i1 = (int *)ptr;
1495
1496	switch (level) {
1497	case IPPROTO_IP:
1498		switch (name) {
1499		case IP_MULTICAST_TTL:
1500			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1501			return (sizeof (uchar_t));
1502		case IP_MULTICAST_LOOP:
1503			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1504			return (sizeof (uchar_t));
1505		}
1506		break;
1507	case IPPROTO_IPV6:
1508		switch (name) {
1509		case IPV6_MULTICAST_HOPS:
1510			*i1 = IP_DEFAULT_MULTICAST_TTL;
1511			return (sizeof (int));
1512		case IPV6_MULTICAST_LOOP:
1513			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1514			return (sizeof (int));
1515		case IPV6_UNICAST_HOPS:
1516			*i1 = us->us_ipv6_hoplimit;
1517			return (sizeof (int));
1518		}
1519		break;
1520	}
1521	return (-1);
1522}
1523
1524/*
1525 * This routine retrieves the current status of socket options.
1526 * It returns the size of the option retrieved, or -1.
1527 */
1528int
1529udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
1530    uchar_t *ptr)
1531{
1532	int		*i1 = (int *)ptr;
1533	udp_t		*udp = connp->conn_udp;
1534	int		len;
1535	conn_opt_arg_t	coas;
1536	int		retval;
1537
1538	coas.coa_connp = connp;
1539	coas.coa_ixa = connp->conn_ixa;
1540	coas.coa_ipp = &connp->conn_xmit_ipp;
1541	coas.coa_ancillary = B_FALSE;
1542	coas.coa_changed = 0;
1543
1544	/*
1545	 * We assume that the optcom framework has checked for the set
1546	 * of levels and names that are supported, hence we don't worry
1547	 * about rejecting based on that.
1548	 * First check for UDP specific handling, then pass to common routine.
1549	 */
1550	switch (level) {
1551	case IPPROTO_IP:
1552		/*
1553		 * Only allow IPv4 option processing on IPv4 sockets.
1554		 */
1555		if (connp->conn_family != AF_INET)
1556			return (-1);
1557
1558		switch (name) {
1559		case IP_OPTIONS:
1560		case T_IP_OPTIONS:
1561			mutex_enter(&connp->conn_lock);
1562			if (!(udp->udp_recv_ipp.ipp_fields &
1563			    IPPF_IPV4_OPTIONS)) {
1564				mutex_exit(&connp->conn_lock);
1565				return (0);
1566			}
1567
1568			len = udp->udp_recv_ipp.ipp_ipv4_options_len;
1569			ASSERT(len != 0);
1570			bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
1571			mutex_exit(&connp->conn_lock);
1572			return (len);
1573		}
1574		break;
1575	case IPPROTO_UDP:
1576		switch (name) {
1577		case UDP_NAT_T_ENDPOINT:
1578			mutex_enter(&connp->conn_lock);
1579			*i1 = udp->udp_nat_t_endpoint;
1580			mutex_exit(&connp->conn_lock);
1581			return (sizeof (int));
1582		case UDP_RCVHDR:
1583			mutex_enter(&connp->conn_lock);
1584			*i1 = udp->udp_rcvhdr ? 1 : 0;
1585			mutex_exit(&connp->conn_lock);
1586			return (sizeof (int));
1587		}
1588	}
1589	mutex_enter(&connp->conn_lock);
1590	retval = conn_opt_get(&coas, level, name, ptr);
1591	mutex_exit(&connp->conn_lock);
1592	return (retval);
1593}
1594
1595/*
1596 * This routine retrieves the current status of socket options.
1597 * It returns the size of the option retrieved, or -1.
1598 */
1599int
1600udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1601{
1602	conn_t		*connp = Q_TO_CONN(q);
1603	int		err;
1604
1605	err = udp_opt_get(connp, level, name, ptr);
1606	return (err);
1607}
1608
1609/*
1610 * This routine sets socket options.
1611 */
1612int
1613udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1614    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1615{
1616	conn_t		*connp = coa->coa_connp;
1617	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1618	udp_t		*udp = connp->conn_udp;
1619	udp_stack_t	*us = udp->udp_us;
1620	int		*i1 = (int *)invalp;
1621	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1622	int		error;
1623
1624	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1625	/*
1626	 * First do UDP specific sanity checks and handle UDP specific
1627	 * options. Note that some IPPROTO_UDP options are handled
1628	 * by conn_opt_set.
1629	 */
1630	switch (level) {
1631	case SOL_SOCKET:
1632		switch (name) {
1633		case SO_SNDBUF:
1634			if (*i1 > us->us_max_buf) {
1635				return (ENOBUFS);
1636			}
1637			break;
1638		case SO_RCVBUF:
1639			if (*i1 > us->us_max_buf) {
1640				return (ENOBUFS);
1641			}
1642			break;
1643
1644		case SCM_UCRED: {
1645			struct ucred_s *ucr;
1646			cred_t *newcr;
1647			ts_label_t *tsl;
1648
1649			/*
1650			 * Only sockets that have proper privileges and are
1651			 * bound to MLPs will have any other value here, so
1652			 * this implicitly tests for privilege to set label.
1653			 */
1654			if (connp->conn_mlp_type == mlptSingle)
1655				break;
1656
1657			ucr = (struct ucred_s *)invalp;
1658			if (inlen < sizeof (*ucr) + sizeof (bslabel_t) ||
1659			    ucr->uc_labeloff < sizeof (*ucr) ||
1660			    ucr->uc_labeloff + sizeof (bslabel_t) > inlen)
1661				return (EINVAL);
1662			if (!checkonly) {
1663				/*
1664				 * Set ixa_tsl to the new label.
1665				 * We assume that crgetzoneid doesn't change
1666				 * as part of the SCM_UCRED.
1667				 */
1668				ASSERT(cr != NULL);
1669				if ((tsl = crgetlabel(cr)) == NULL)
1670					return (EINVAL);
1671				newcr = copycred_from_bslabel(cr, UCLABEL(ucr),
1672				    tsl->tsl_doi, KM_NOSLEEP);
1673				if (newcr == NULL)
1674					return (ENOSR);
1675				ASSERT(newcr->cr_label != NULL);
1676				/*
1677				 * Move the hold on the cr_label to ixa_tsl by
1678				 * setting cr_label to NULL. Then release newcr.
1679				 */
1680				ip_xmit_attr_replace_tsl(ixa, newcr->cr_label);
1681				ixa->ixa_flags |= IXAF_UCRED_TSL;
1682				newcr->cr_label = NULL;
1683				crfree(newcr);
1684				coa->coa_changed |= COA_HEADER_CHANGED;
1685				coa->coa_changed |= COA_WROFF_CHANGED;
1686			}
1687			/* Fully handled this option. */
1688			return (0);
1689		}
1690		}
1691		break;
1692	case IPPROTO_UDP:
1693		switch (name) {
1694		case UDP_NAT_T_ENDPOINT:
1695			if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
1696				return (error);
1697			}
1698
1699			/*
1700			 * Use conn_family instead so we can avoid ambiguitites
1701			 * with AF_INET6 sockets that may switch from IPv4
1702			 * to IPv6.
1703			 */
1704			if (connp->conn_family != AF_INET) {
1705				return (EAFNOSUPPORT);
1706			}
1707
1708			if (!checkonly) {
1709				mutex_enter(&connp->conn_lock);
1710				udp->udp_nat_t_endpoint = onoff;
1711				mutex_exit(&connp->conn_lock);
1712				coa->coa_changed |= COA_HEADER_CHANGED;
1713				coa->coa_changed |= COA_WROFF_CHANGED;
1714			}
1715			/* Fully handled this option. */
1716			return (0);
1717		case UDP_RCVHDR:
1718			mutex_enter(&connp->conn_lock);
1719			udp->udp_rcvhdr = onoff;
1720			mutex_exit(&connp->conn_lock);
1721			return (0);
1722		}
1723		break;
1724	}
1725	error = conn_opt_set(coa, level, name, inlen, invalp,
1726	    checkonly, cr);
1727	return (error);
1728}
1729
/*
 * This routine sets socket options.
 *
 * optset_context selects check-only versus negotiate semantics.  When
 * thisdg_attrs is non-NULL it is the conn_opt_arg_t for a single datagram
 * (options supplied with T_UNITDATA_REQ) and only that is modified;
 * otherwise the conn's own transmit attributes are referenced and
 * modified, and afterwards the cached route, header template, buffer
 * limits and write offset are refreshed according to coa_changed.
 *
 * On success the option value is copied to outvalp (unless it is the same
 * buffer as invalp) and *outlenp is set to inlen.  On the error paths that
 * go through errout, and on the early argument checks, *outlenp is set to
 * 0.  Returns 0 or an errno value.
 */
int
udp_opt_set(conn_t *connp, uint_t optset_context, int level,
    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
{
	udp_t		*udp = connp->conn_udp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	udp_stack_t	*us = udp->udp_us;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 *	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 *	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		/*
		 * Negotiating local and "association-related" options
		 * through T_UNITDATA_REQ.
		 *
		 * Following routine can filter out ones we do not
		 * want to be "set" this way.
		 */
		if (!udp_opt_allow_udr_set(level, name)) {
			*outlenp = 0;
			return (EINVAL);
		}
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	if (thisdg_attrs != NULL) {
		/* Options from T_UNITDATA_REQ */
		coa = (conn_opt_arg_t *)thisdg_attrs;
		ASSERT(coa->coa_connp == connp);
		ASSERT(coa->coa_ixa != NULL);
		ASSERT(coa->coa_ipp != NULL);
		ASSERT(coa->coa_ancillary);
	} else {
		coa = &coas;
		coas.coa_connp = connp;
		/* Get a reference on conn_ixa to prevent concurrent mods */
		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
		if (coas.coa_ixa == NULL) {
			*outlenp = 0;
			return (ENOMEM);
		}
		coas.coa_ipp = &connp->conn_xmit_ipp;
		coas.coa_ancillary = B_FALSE;
		coas.coa_changed = 0;
	}

	err = udp_do_opt_set(coa, level, name, inlen, invalp,
	    cr, checkonly);
	if (err != 0) {
		/*
		 * Shared error exit, also reached by goto from the DHCPINIT
		 * handling below.  The ixa reference is only dropped when it
		 * was taken above (the non-ancillary case).
		 */
errout:
		if (!coa->coa_ancillary)
			ixa_refrele(coa->coa_ixa);
		*outlenp = 0;
		return (err);
	}
	/* Handle DHCPINIT here outside of lock */
	if (level == IPPROTO_IP && name == IP_DHCPINIT_IF) {
		uint_t	ifindex;
		ill_t	*ill;

		/* An ifindex of 0 clears any existing association. */
		ifindex = *(uint_t *)invalp;
		if (ifindex == 0) {
			ill = NULL;
		} else {
			ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
			    coa->coa_ixa->ixa_ipst);
			if (ill == NULL) {
				err = ENXIO;
				goto errout;
			}

			/*
			 * ill_lock stays held from here until the ill has
			 * been installed on the conn below.
			 */
			mutex_enter(&ill->ill_lock);
			if (ill->ill_state_flags & ILL_CONDEMNED) {
				mutex_exit(&ill->ill_lock);
				ill_refrele(ill);
				err = ENXIO;
				goto errout;
			}
			if (IS_VNI(ill)) {
				mutex_exit(&ill->ill_lock);
				ill_refrele(ill);
				err = EINVAL;
				goto errout;
			}
		}
		mutex_enter(&connp->conn_lock);

		if (connp->conn_dhcpinit_ill != NULL) {
			/*
			 * We've locked the conn so conn_cleanup_ill()
			 * cannot clear conn_dhcpinit_ill -- so it's
			 * safe to access the ill.
			 */
			ill_t *oill = connp->conn_dhcpinit_ill;

			ASSERT(oill->ill_dhcpinit != 0);
			atomic_dec_32(&oill->ill_dhcpinit);
			ill_set_inputfn(connp->conn_dhcpinit_ill);
			connp->conn_dhcpinit_ill = NULL;
		}

		if (ill != NULL) {
			connp->conn_dhcpinit_ill = ill;
			atomic_inc_32(&ill->ill_dhcpinit);
			ill_set_inputfn(ill);
			mutex_exit(&connp->conn_lock);
			mutex_exit(&ill->ill_lock);
			ill_refrele(ill);
		} else {
			mutex_exit(&connp->conn_lock);
		}
	}

	/*
	 * Common case of OK return with outval same as inval.
	 */
	if (invalp != outvalp) {
		/* don't trust bcopy for identical src/dst */
		(void) bcopy(invalp, outvalp, inlen);
	}
	*outlenp = inlen;

	/*
	 * If this was not ancillary data, then we rebuild the headers,
	 * update the IRE/NCE, and IPsec as needed.
	 * Since the label depends on the destination we go through
	 * ip_set_destination first.
	 */
	if (coa->coa_ancillary) {
		return (0);
	}

	if (coa->coa_changed & COA_ROUTE_CHANGED) {
		in6_addr_t saddr, faddr, nexthop;
		in_port_t fport;

		/*
		 * We clear lastdst to make sure we pick up the change
		 * next time sending.
		 * If we are connected we re-cache the information.
		 * We ignore errors to preserve BSD behavior.
		 * Note that we don't redo IPsec policy lookup here
		 * since the final destination (or source) didn't change.
		 */
		mutex_enter(&connp->conn_lock);
		connp->conn_v6lastdst = ipv6_all_zeros;

		/* Snapshot the addresses so conn_lock can be dropped. */
		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
		    &connp->conn_faddr_v6, &nexthop);
		saddr = connp->conn_saddr_v6;
		faddr = connp->conn_faddr_v6;
		fport = connp->conn_fport;
		mutex_exit(&connp->conn_lock);

		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
			(void) ip_attr_connect(connp, coa->coa_ixa,
			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
		}
	}

	/* Done with the ixa; the remaining steps only touch the conn. */
	ixa_refrele(coa->coa_ixa);

	if (coa->coa_changed & COA_HEADER_CHANGED) {
		/*
		 * Rebuild the header template if we are connected.
		 * Otherwise clear conn_v6lastdst so we rebuild the header
		 * in the data path.
		 */
		mutex_enter(&connp->conn_lock);
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			err = udp_build_hdr_template(connp,
			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
			    connp->conn_fport, connp->conn_flowinfo);
			if (err != 0) {
				/*
				 * NOTE(review): *outlenp was already set to
				 * inlen above and is not reset on this path.
				 */
				mutex_exit(&connp->conn_lock);
				return (err);
			}
		} else {
			connp->conn_v6lastdst = ipv6_all_zeros;
		}
		mutex_exit(&connp->conn_lock);
	}
	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
		    connp->conn_rcvbuf);
	}
	/* The write queue high-water mark is only set for STREAMS conns. */
	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
	}
	if (coa->coa_changed & COA_WROFF_CHANGED) {
		/* Increase wroff if needed */
		uint_t wroff;

		mutex_enter(&connp->conn_lock);
		wroff = connp->conn_ht_iphc_allocated + us->us_wroff_extra;
		/* Leave room for the 32-bit SPI of a NAT-T endpoint. */
		if (udp->udp_nat_t_endpoint)
			wroff += sizeof (uint32_t);
		if (wroff > connp->conn_wroff) {
			connp->conn_wroff = wroff;
			mutex_exit(&connp->conn_lock);
			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
		} else {
			mutex_exit(&connp->conn_lock);
		}
	}
	return (err);
}
1974
1975/* This routine sets socket options. */
1976int
1977udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
1978    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
1979    void *thisdg_attrs, cred_t *cr)
1980{
1981	conn_t	*connp = Q_TO_CONN(q);
1982	int error;
1983
1984	error = udp_opt_set(connp, optset_context, level, name, inlen, invalp,
1985	    outlenp, outvalp, thisdg_attrs, cr);
1986	return (error);
1987}
1988
1989/*
1990 * Setup IP and UDP headers.
1991 * Returns NULL on allocation failure, in which case data_mp is freed.
1992 */
1993mblk_t *
1994udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
1995    const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
1996    uint32_t flowinfo, mblk_t *data_mp, int *errorp)
1997{
1998	mblk_t		*mp;
1999	udpha_t		*udpha;
2000	udp_stack_t	*us = connp->conn_netstack->netstack_udp;
2001	uint_t		data_len;
2002	uint32_t	cksum;
2003	udp_t		*udp = connp->conn_udp;
2004	boolean_t	insert_spi = udp->udp_nat_t_endpoint;
2005	uint_t		ulp_hdr_len;
2006
2007	data_len = msgdsize(data_mp);
2008	ulp_hdr_len = UDPH_SIZE;
2009	if (insert_spi)
2010		ulp_hdr_len += sizeof (uint32_t);
2011
2012	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
2013	    ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
2014	if (mp == NULL) {
2015		ASSERT(*errorp != 0);
2016		return (NULL);
2017	}
2018
2019	data_len += ulp_hdr_len;
2020	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2021
2022	udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
2023	udpha->uha_src_port = connp->conn_lport;
2024	udpha->uha_dst_port = dstport;
2025	udpha->uha_checksum = 0;
2026	udpha->uha_length = htons(data_len);
2027
2028	/*
2029	 * If there was a routing option/header then conn_prepend_hdr
2030	 * has massaged it and placed the pseudo-header checksum difference
2031	 * in the cksum argument.
2032	 *
2033	 * Setup header length and prepare for ULP checksum done in IP.
2034	 *
2035	 * We make it easy for IP to include our pseudo header
2036	 * by putting our length in uha_checksum.
2037	 * The IP source, destination, and length have already been set by
2038	 * conn_prepend_hdr.
2039	 */
2040	cksum += data_len;
2041	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2042	ASSERT(cksum < 0x10000);
2043
2044	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2045		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2046
2047		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2048
2049		/* IP does the checksum if uha_checksum is non-zero */
2050		if (us->us_do_checksum) {
2051			if (cksum == 0)
2052				udpha->uha_checksum = 0xffff;
2053			else
2054				udpha->uha_checksum = htons(cksum);
2055		} else {
2056			udpha->uha_checksum = 0;
2057		}
2058	} else {
2059		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2060
2061		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2062		if (cksum == 0)
2063			udpha->uha_checksum = 0xffff;
2064		else
2065			udpha->uha_checksum = htons(cksum);
2066	}
2067
2068	/* Insert all-0s SPI now. */
2069	if (insert_spi)
2070		*((uint32_t *)(udpha + 1)) = 0;
2071
2072	return (mp);
2073}
2074
2075static int
2076udp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2077    const in6_addr_t *v6dst, in_port_t dstport, uint32_t flowinfo)
2078{
2079	udpha_t		*udpha;
2080	int		error;
2081
2082	ASSERT(MUTEX_HELD(&connp->conn_lock));
2083	/*
2084	 * We clear lastdst to make sure we don't use the lastdst path
2085	 * next time sending since we might not have set v6dst yet.
2086	 */
2087	connp->conn_v6lastdst = ipv6_all_zeros;
2088
2089	error = conn_build_hdr_template(connp, UDPH_SIZE, 0, v6src, v6dst,
2090	    flowinfo);
2091	if (error != 0)
2092		return (error);
2093
2094	/*
2095	 * Any routing header/option has been massaged. The checksum difference
2096	 * is stored in conn_sum.
2097	 */
2098	udpha = (udpha_t *)connp->conn_ht_ulp;
2099	udpha->uha_src_port = connp->conn_lport;
2100	udpha->uha_dst_port = dstport;
2101	udpha->uha_checksum = 0;
2102	udpha->uha_length = htons(UDPH_SIZE);	/* Filled in later */
2103	return (0);
2104}
2105
2106static mblk_t *
2107udp_queue_fallback(udp_t *udp, mblk_t *mp)
2108{
2109	ASSERT(MUTEX_HELD(&udp->udp_recv_lock));
2110	if (IPCL_IS_NONSTR(udp->udp_connp)) {
2111		/*
2112		 * fallback has started but messages have not been moved yet
2113		 */
2114		if (udp->udp_fallback_queue_head == NULL) {
2115			ASSERT(udp->udp_fallback_queue_tail == NULL);
2116			udp->udp_fallback_queue_head = mp;
2117			udp->udp_fallback_queue_tail = mp;
2118		} else {
2119			ASSERT(udp->udp_fallback_queue_tail != NULL);
2120			udp->udp_fallback_queue_tail->b_next = mp;
2121			udp->udp_fallback_queue_tail = mp;
2122		}
2123		return (NULL);
2124	} else {
2125		/*
2126		 * Fallback completed, let the caller putnext() the mblk.
2127		 */
2128		return (mp);
2129	}
2130}
2131
2132/*
2133 * Deliver data to ULP. In case we have a socket, and it's falling back to
2134 * TPI, then we'll queue the mp for later processing.
2135 */
2136static void
2137udp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len, ip_recv_attr_t *ira)
2138{
2139	if (IPCL_IS_NONSTR(connp)) {
2140		udp_t *udp = connp->conn_udp;
2141		int error;
2142
2143		ASSERT(len == msgdsize(mp));
2144		if ((*connp->conn_upcalls->su_recv)
2145		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2146			mutex_enter(&udp->udp_recv_lock);
2147			if (error == ENOSPC) {
2148				/*
2149				 * let's confirm while holding the lock
2150				 */
2151				if ((*connp->conn_upcalls->su_recv)
2152				    (connp->conn_upper_handle, NULL, 0, 0,
2153				    &error, NULL) < 0) {
2154					ASSERT(error == ENOSPC);
2155					if (error == ENOSPC) {
2156						connp->conn_flow_cntrld =
2157						    B_TRUE;
2158					}
2159				}
2160				mutex_exit(&udp->udp_recv_lock);
2161			} else {
2162				ASSERT(error == EOPNOTSUPP);
2163				mp = udp_queue_fallback(udp, mp);
2164				mutex_exit(&udp->udp_recv_lock);
2165				if (mp != NULL)
2166					putnext(connp->conn_rq, mp);
2167			}
2168		}
2169		ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
2170	} else {
2171		if (is_system_labeled()) {
2172			ASSERT(ira->ira_cred != NULL);
2173			/*
2174			 * Provide for protocols above UDP such as RPC
2175			 * NOPID leaves db_cpid unchanged.
2176			 */
2177			mblk_setcred(mp, ira->ira_cred, NOPID);
2178		}
2179
2180		putnext(connp->conn_rq, mp);
2181	}
2182}
2183
2184/*
2185 * This is the inbound data path.
2186 * IP has already pulled up the IP plus UDP headers and verified alignment
2187 * etc.
2188 */
2189/* ARGSUSED2 */
2190static void
2191udp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2192{
2193	conn_t			*connp = (conn_t *)arg1;
2194	struct T_unitdata_ind	*tudi;
2195	uchar_t			*rptr;		/* Pointer to IP header */
2196	int			hdr_length;	/* Length of IP+UDP headers */
2197	int			udi_size;	/* Size of T_unitdata_ind */
2198	int			pkt_len;
2199	udp_t			*udp;
2200	udpha_t			*udpha;
2201	ip_pkt_t		ipps;
2202	ip6_t			*ip6h;
2203	mblk_t			*mp1;
2204	uint32_t		udp_ipv4_options_len;
2205	crb_t			recv_ancillary;
2206	udp_stack_t		*us;
2207
2208	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2209
2210	udp = connp->conn_udp;
2211	us = udp->udp_us;
2212	rptr = mp->b_rptr;
2213
2214	ASSERT(DB_TYPE(mp) == M_DATA);
2215	ASSERT(OK_32PTR(rptr));
2216	ASSERT(ira->ira_pktlen == msgdsize(mp));
2217	pkt_len = ira->ira_pktlen;
2218
2219	/*
2220	 * Get a snapshot of these and allow other threads to change
2221	 * them after that. We need the same recv_ancillary when determining
2222	 * the size as when adding the ancillary data items.
2223	 */
2224	mutex_enter(&connp->conn_lock);
2225	udp_ipv4_options_len = udp->udp_recv_ipp.ipp_ipv4_options_len;
2226	recv_ancillary = connp->conn_recv_ancillary;
2227	mutex_exit(&connp->conn_lock);
2228
2229	hdr_length = ira->ira_ip_hdr_length;
2230
2231	/*
2232	 * IP inspected the UDP header thus all of it must be in the mblk.
2233	 * UDP length check is performed for IPv6 packets and IPv4 packets
2234	 * to check if the size of the packet as specified
2235	 * by the UDP header is the same as the length derived from the IP
2236	 * header.
2237	 */
2238	udpha = (udpha_t *)(rptr + hdr_length);
2239	if (pkt_len != ntohs(udpha->uha_length) + hdr_length)
2240		goto tossit;
2241
2242	hdr_length += UDPH_SIZE;
2243	ASSERT(MBLKL(mp) >= hdr_length);	/* IP did a pullup */
2244
2245	/* Initialize regardless of IP version */
2246	ipps.ipp_fields = 0;
2247
2248	if (((ira->ira_flags & IRAF_IPV4_OPTIONS) ||
2249	    udp_ipv4_options_len > 0) &&
2250	    connp->conn_family == AF_INET) {
2251		int	err;
2252
2253		/*
2254		 * Record/update udp_recv_ipp with the lock
2255		 * held. Not needed for AF_INET6 sockets
2256		 * since they don't support a getsockopt of IP_OPTIONS.
2257		 */
2258		mutex_enter(&connp->conn_lock);
2259		err = ip_find_hdr_v4((ipha_t *)rptr, &udp->udp_recv_ipp,
2260		    B_TRUE);
2261		if (err != 0) {
2262			/* Allocation failed. Drop packet */
2263			mutex_exit(&connp->conn_lock);
2264			freemsg(mp);
2265			UDPS_BUMP_MIB(us, udpInErrors);
2266			return;
2267		}
2268		mutex_exit(&connp->conn_lock);
2269	}
2270
2271	if (recv_ancillary.crb_all != 0) {
2272		/*
2273		 * Record packet information in the ip_pkt_t
2274		 */
2275		if (ira->ira_flags & IRAF_IS_IPV4) {
2276			ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2277			ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2278			ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
2279			ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2280
2281			(void) ip_find_hdr_v4((ipha_t *)rptr, &ipps, B_FALSE);
2282		} else {
2283			uint8_t nexthdrp;
2284
2285			ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2286			/*
2287			 * IPv6 packets can only be received by applications
2288			 * that are prepared to receive IPv6 addresses.
2289			 * The IP fanout must ensure this.
2290			 */
2291			ASSERT(connp->conn_family == AF_INET6);
2292
2293			ip6h = (ip6_t *)rptr;
2294
2295			/* We don't care about the length, but need the ipp */
2296			hdr_length = ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps,
2297			    &nexthdrp);
2298			ASSERT(hdr_length == ira->ira_ip_hdr_length);
2299			/* Restore */
2300			hdr_length = ira->ira_ip_hdr_length + UDPH_SIZE;
2301			ASSERT(nexthdrp == IPPROTO_UDP);
2302		}
2303	}
2304
2305	/*
2306	 * This is the inbound data path.  Packets are passed upstream as
2307	 * T_UNITDATA_IND messages.
2308	 */
2309	if (connp->conn_family == AF_INET) {
2310		sin_t *sin;
2311
2312		ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION);
2313
2314		/*
2315		 * Normally only send up the source address.
2316		 * If any ancillary data items are wanted we add those.
2317		 */
2318		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2319		if (recv_ancillary.crb_all != 0) {
2320			udi_size += conn_recvancillary_size(connp,
2321			    recv_ancillary, ira, mp, &ipps);
2322		}
2323
2324		/* Allocate a message block for the T_UNITDATA_IND structure. */
2325		mp1 = allocb(udi_size, BPRI_MED);
2326		if (mp1 == NULL) {
2327			freemsg(mp);
2328			UDPS_BUMP_MIB(us, udpInErrors);
2329			return;
2330		}
2331		mp1->b_cont = mp;
2332		mp1->b_datap->db_type = M_PROTO;
2333		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2334		mp1->b_wptr = (uchar_t *)tudi + udi_size;
2335		tudi->PRIM_type = T_UNITDATA_IND;
2336		tudi->SRC_length = sizeof (sin_t);
2337		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2338		tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
2339		    sizeof (sin_t);
2340		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2341		tudi->OPT_length = udi_size;
2342		sin = (sin_t *)&tudi[1];
2343		sin->sin_addr.s_addr = ((ipha_t *)rptr)->ipha_src;
2344		sin->sin_port =	udpha->uha_src_port;
2345		sin->sin_family = connp->conn_family;
2346		*(uint32_t *)&sin->sin_zero[0] = 0;
2347		*(uint32_t *)&sin->sin_zero[4] = 0;
2348
2349		/*
2350		 * Add options if IP_RECVDSTADDR, IP_RECVIF, IP_RECVSLLA or
2351		 * IP_RECVTTL has been set.
2352		 */
2353		if (udi_size != 0) {
2354			conn_recvancillary_add(connp, recv_ancillary, ira,
2355			    &ipps, (uchar_t *)&sin[1], udi_size);
2356		}
2357	} else {
2358		sin6_t *sin6;
2359
2360		/*
2361		 * Handle both IPv4 and IPv6 packets for IPv6 sockets.
2362		 *
2363		 * Normally we only send up the address. If receiving of any
2364		 * optional receive side information is enabled, we also send
2365		 * that up as options.
2366		 */
2367		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2368
2369		if (recv_ancillary.crb_all != 0) {
2370			udi_size += conn_recvancillary_size(connp,
2371			    recv_ancillary, ira, mp, &ipps);
2372		}
2373
2374		mp1 = allocb(udi_size, BPRI_MED);
2375		if (mp1 == NULL) {
2376			freemsg(mp);
2377			UDPS_BUMP_MIB(us, udpInErrors);
2378			return;
2379		}
2380		mp1->b_cont = mp;
2381		mp1->b_datap->db_type = M_PROTO;
2382		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2383		mp1->b_wptr = (uchar_t *)tudi + udi_size;
2384		tudi->PRIM_type = T_UNITDATA_IND;
2385		tudi->SRC_length = sizeof (sin6_t);
2386		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2387		tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
2388		    sizeof (sin6_t);
2389		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2390		tudi->OPT_length = udi_size;
2391		sin6 = (sin6_t *)&tudi[1];
2392		if (ira->ira_flags & IRAF_IS_IPV4) {
2393			in6_addr_t v6dst;
2394
2395			IN6_IPADDR_TO_V4MAPPED(((ipha_t *)rptr)->ipha_src,
2396			    &sin6->sin6_addr);
2397			IN6_IPADDR_TO_V4MAPPED(((ipha_t *)rptr)->ipha_dst,
2398			    &v6dst);
2399			sin6->sin6_flowinfo = 0;
2400			sin6->sin6_scope_id = 0;
2401			sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst,
2402			    IPCL_ZONEID(connp), us->us_netstack);
2403		} else {
2404			ip6h = (ip6_t *)rptr;
2405
2406			sin6->sin6_addr = ip6h->ip6_src;
2407			/* No sin6_flowinfo per API */
2408			sin6->sin6_flowinfo = 0;
2409			/* For link-scope pass up scope id */
2410			if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2411				sin6->sin6_scope_id = ira->ira_ruifindex;
2412			else
2413				sin6->sin6_scope_id = 0;
2414			sin6->__sin6_src_id = ip_srcid_find_addr(
2415			    &ip6h->ip6_dst, IPCL_ZONEID(connp),
2416			    us->us_netstack);
2417		}
2418		sin6->sin6_port = udpha->uha_src_port;
2419		sin6->sin6_family = connp->conn_family;
2420
2421		if (udi_size != 0) {
2422			conn_recvancillary_add(connp, recv_ancillary, ira,
2423			    &ipps, (uchar_t *)&sin6[1], udi_size);
2424		}
2425	}
2426
2427	/*
2428	 * DTrace this UDP input as udp:::receive (this is for IPv4, IPv6 and
2429	 * loopback traffic).
2430	 */
2431	DTRACE_UDP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
2432	    void_ip_t *, rptr, udp_t *, udp, udpha_t *, udpha);
2433
2434	/* Walk past the headers unless IP_RECVHDR was set. */
2435	if (!udp->udp_rcvhdr) {
2436		mp->b_rptr = rptr + hdr_length;
2437		pkt_len -= hdr_length;
2438	}
2439
2440	UDPS_BUMP_MIB(us, udpHCInDatagrams);
2441	udp_ulp_recv(connp, mp1, pkt_len, ira);
2442	return;
2443
2444tossit:
2445	freemsg(mp);
2446	UDPS_BUMP_MIB(us, udpInErrors);
2447}
2448
2449/*
2450 * This routine creates a T_UDERROR_IND message and passes it upstream.
2451 * The address and options are copied from the T_UNITDATA_REQ message
2452 * passed in mp.  This message is freed.
2453 */
2454static void
2455udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2456{
2457	struct T_unitdata_req *tudr;
2458	mblk_t	*mp1;
2459	uchar_t *destaddr;
2460	t_scalar_t destlen;
2461	uchar_t	*optaddr;
2462	t_scalar_t optlen;
2463
2464	if ((mp->b_wptr < mp->b_rptr) ||
2465	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2466		goto done;
2467	}
2468	tudr = (struct T_unitdata_req *)mp->b_rptr;
2469	destaddr = mp->b_rptr + tudr->DEST_offset;
2470	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2471	    destaddr + tudr->DEST_length < mp->b_rptr ||
2472	    destaddr + tudr->DEST_length > mp->b_wptr) {
2473		goto done;
2474	}
2475	optaddr = mp->b_rptr + tudr->OPT_offset;
2476	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2477	    optaddr + tudr->OPT_length < mp->b_rptr ||
2478	    optaddr + tudr->OPT_length > mp->b_wptr) {
2479		goto done;
2480	}
2481	destlen = tudr->DEST_length;
2482	optlen = tudr->OPT_length;
2483
2484	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2485	    (char *)optaddr, optlen, err);
2486	if (mp1 != NULL)
2487		qreply(q, mp1);
2488
2489done:
2490	freemsg(mp);
2491}
2492
2493/*
2494 * This routine removes a port number association from a stream.  It
2495 * is called by udp_wput to handle T_UNBIND_REQ messages.
2496 */
2497static void
2498udp_tpi_unbind(queue_t *q, mblk_t *mp)
2499{
2500	conn_t	*connp = Q_TO_CONN(q);
2501	int	error;
2502
2503	error = udp_do_unbind(connp);
2504	if (error) {
2505		if (error < 0)
2506			udp_err_ack(q, mp, -error, 0);
2507		else
2508			udp_err_ack(q, mp, TSYSERR, error);
2509		return;
2510	}
2511
2512	mp = mi_tpi_ok_ack_alloc(mp);
2513	ASSERT(mp != NULL);
2514	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2515	qreply(q, mp);
2516}
2517
2518/*
2519 * Don't let port fall into the privileged range.
2520 * Since the extra privileged ports can be arbitrary we also
2521 * ensure that we exclude those from consideration.
2522 * us->us_epriv_ports is not sorted thus we loop over it until
2523 * there are no changes.
2524 */
2525static in_port_t
2526udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random)
2527{
2528	int i, bump;
2529	in_port_t nextport;
2530	boolean_t restart = B_FALSE;
2531	udp_stack_t *us = udp->udp_us;
2532
2533	if (random && udp_random_anon_port != 0) {
2534		(void) random_get_pseudo_bytes((uint8_t *)&port,
2535		    sizeof (in_port_t));
2536		/*
2537		 * Unless changed by a sys admin, the smallest anon port
2538		 * is 32768 and the largest anon port is 65535.  It is
2539		 * very likely (50%) for the random port to be smaller
2540		 * than the smallest anon port.  When that happens,
2541		 * add port % (anon port range) to the smallest anon
2542		 * port to get the random port.  It should fall into the
2543		 * valid anon port range.
2544		 */
2545		if ((port < us->us_smallest_anon_port) ||
2546		    (port > us->us_largest_anon_port)) {
2547			if (us->us_smallest_anon_port ==
2548			    us->us_largest_anon_port) {
2549				bump = 0;
2550			} else {
2551				bump = port % (us->us_largest_anon_port -
2552				    us->us_smallest_anon_port);
2553			}
2554
2555			port = us->us_smallest_anon_port + bump;
2556		}
2557	}
2558
2559retry:
2560	if (port < us->us_smallest_anon_port)
2561		port = us->us_smallest_anon_port;
2562
2563	if (port > us->us_largest_anon_port) {
2564		port = us->us_smallest_anon_port;
2565		if (restart)
2566			return (0);
2567		restart = B_TRUE;
2568	}
2569
2570	if (port < us->us_smallest_nonpriv_port)
2571		port = us->us_smallest_nonpriv_port;
2572
2573	for (i = 0; i < us->us_num_epriv_ports; i++) {
2574		if (port == us->us_epriv_ports[i]) {
2575			port++;
2576			/*
2577			 * Make sure that the port is in the
2578			 * valid range.
2579			 */
2580			goto retry;
2581		}
2582	}
2583
2584	if (is_system_labeled() &&
2585	    (nextport = tsol_next_port(crgetzone(udp->udp_connp->conn_cred),
2586	    port, IPPROTO_UDP, B_TRUE)) != 0) {
2587		port = nextport;
2588		goto retry;
2589	}
2590
2591	return (port);
2592}
2593
2594/*
2595 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
2596 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
2597 * the TPI options, otherwise we take them from msg_control.
2598 * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
2599 * Always consumes mp; never consumes tudr_mp.
2600 */
2601static int
2602udp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
2603    mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
2604{
2605	udp_t		*udp = connp->conn_udp;
2606	udp_stack_t	*us = udp->udp_us;
2607	int		error;
2608	ip_xmit_attr_t	*ixa;
2609	ip_pkt_t	*ipp;
2610	in6_addr_t	v6src;
2611	in6_addr_t	v6dst;
2612	in6_addr_t	v6nexthop;
2613	in_port_t	dstport;
2614	uint32_t	flowinfo;
2615	uint_t		srcid;
2616	int		is_absreq_failure = 0;
2617	conn_opt_arg_t	coas, *coa;
2618
2619	ASSERT(tudr_mp != NULL || msg != NULL);
2620
2621	/*
2622	 * Get ixa before checking state to handle a disconnect race.
2623	 *
2624	 * We need an exclusive copy of conn_ixa since the ancillary data
2625	 * options might modify it. That copy has no pointers hence we
2626	 * need to set them up once we've parsed the ancillary data.
2627	 */
2628	ixa = conn_get_ixa_exclusive(connp);
2629	if (ixa == NULL) {
2630		UDPS_BUMP_MIB(us, udpOutErrors);
2631		freemsg(mp);
2632		return (ENOMEM);
2633	}
2634	ASSERT(cr != NULL);
2635	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2636	ixa->ixa_cred = cr;
2637	ixa->ixa_cpid = pid;
2638	if (is_system_labeled()) {
2639		/* We need to restart with a label based on the cred */
2640		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
2641	}
2642
2643	/* In case previous destination was multicast or multirt */
2644	ip_attr_newdst(ixa);
2645
2646	/* Get a copy of conn_xmit_ipp since the options might change it */
2647	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
2648	if (ipp == NULL) {
2649		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2650		ixa->ixa_cred = connp->conn_cred;	/* Restore */
2651		ixa->ixa_cpid = connp->conn_cpid;
2652		ixa_refrele(ixa);
2653		UDPS_BUMP_MIB(us, udpOutErrors);
2654		freemsg(mp);
2655		return (ENOMEM);
2656	}
2657	mutex_enter(&connp->conn_lock);
2658	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
2659	mutex_exit(&connp->conn_lock);
2660	if (error != 0) {
2661		UDPS_BUMP_MIB(us, udpOutErrors);
2662		freemsg(mp);
2663		goto done;
2664	}
2665
2666	/*
2667	 * Parse the options and update ixa and ipp as a result.
2668	 * Note that ixa_tsl can be updated if SCM_UCRED.
2669	 * ixa_refrele/ixa_inactivate will release any reference on ixa_tsl.
2670	 */
2671
2672	coa = &coas;
2673	coa->coa_connp = connp;
2674	coa->coa_ixa = ixa;
2675	coa->coa_ipp = ipp;
2676	coa->coa_ancillary = B_TRUE;
2677	coa->coa_changed = 0;
2678
2679	if (msg != NULL) {
2680		error = process_auxiliary_options(connp, msg->msg_control,
2681		    msg->msg_controllen, coa, &udp_opt_obj, udp_opt_set, cr);
2682	} else {
2683		struct T_unitdata_req *tudr;
2684
2685		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
2686		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
2687		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
2688		    &tudr->OPT_length, tudr->OPT_offset, cr, &udp_opt_obj,
2689		    coa, &is_absreq_failure);
2690	}
2691	if (error != 0) {
2692		/*
2693		 * Note: No special action needed in this
2694		 * module for "is_absreq_failure"
2695		 */
2696		freemsg(mp);
2697		UDPS_BUMP_MIB(us, udpOutErrors);
2698		goto done;
2699	}
2700	ASSERT(is_absreq_failure == 0);
2701
2702	mutex_enter(&connp->conn_lock);
2703	/*
2704	 * If laddr is unspecified then we look at sin6_src_id.
2705	 * We will give precedence to a source address set with IPV6_PKTINFO
2706	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
2707	 * want ip_attr_connect to select a source (since it can fail) when
2708	 * IPV6_PKTINFO is specified.
2709	 * If this doesn't result in a source address then we get a source
2710	 * from ip_attr_connect() below.
2711	 */
2712	v6src = connp->conn_saddr_v6;
2713	if (sin != NULL) {
2714		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
2715		dstport = sin->sin_port;
2716		flowinfo = 0;
2717		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
2718		ixa->ixa_flags |= IXAF_IS_IPV4;
2719	} else if (sin6 != NULL) {
2720		boolean_t v4mapped;
2721
2722		v6dst = sin6->sin6_addr;
2723		dstport = sin6->sin6_port;
2724		flowinfo = sin6->sin6_flowinfo;
2725		srcid = sin6->__sin6_src_id;
2726		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
2727			ixa->ixa_scopeid = sin6->sin6_scope_id;
2728			ixa->ixa_flags |= IXAF_SCOPEID_SET;
2729		} else {
2730			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
2731		}
2732		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
2733		if (v4mapped)
2734			ixa->ixa_flags |= IXAF_IS_IPV4;
2735		else
2736			ixa->ixa_flags &= ~IXAF_IS_IPV4;
2737		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
2738			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
2739			    v4mapped, connp->conn_netstack)) {
2740				/* Mismatch - v4mapped/v6 specified by srcid. */
2741				mutex_exit(&connp->conn_lock);
2742				error = EADDRNOTAVAIL;
2743				goto failed;	/* Does freemsg() and mib. */
2744			}
2745		}
2746	} else {
2747		/* Connected case */
2748		v6dst = connp->conn_faddr_v6;
2749		dstport = connp->conn_fport;
2750		flowinfo = connp->conn_flowinfo;
2751	}
2752	mutex_exit(&connp->conn_lock);
2753
2754	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
2755	if (ipp->ipp_fields & IPPF_ADDR) {
2756		if (ixa->ixa_flags & IXAF_IS_IPV4) {
2757			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
2758				v6src = ipp->ipp_addr;
2759		} else {
2760			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
2761				v6src = ipp->ipp_addr;
2762		}
2763	}
2764
2765	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
2766	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
2767	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC);
2768
2769	switch (error) {
2770	case 0:
2771		break;
2772	case EADDRNOTAVAIL:
2773		/*
2774		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
2775		 * Don't have the application see that errno
2776		 */
2777		error = ENETUNREACH;
2778		goto failed;
2779	case ENETDOWN:
2780		/*
2781		 * Have !ipif_addr_ready address; drop packet silently
2782		 * until we can get applications to not send until we
2783		 * are ready.
2784		 */
2785		error = 0;
2786		goto failed;
2787	case EHOSTUNREACH:
2788	case ENETUNREACH:
2789		if (ixa->ixa_ire != NULL) {
2790			/*
2791			 * Let conn_ip_output/ire_send_noroute return
2792			 * the error and send any local ICMP error.
2793			 */
2794			error = 0;
2795			break;
2796		}
2797		/* FALLTHRU */
2798	default:
2799	failed:
2800		freemsg(mp);
2801		UDPS_BUMP_MIB(us, udpOutErrors);
2802		goto done;
2803	}
2804
2805	/*
2806	 * We might be going to a different destination than last time,
2807	 * thus check that TX allows the communication and compute any
2808	 * needed label.
2809	 *
2810	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
2811	 * don't have to worry about concurrent threads.
2812	 */
2813	if (is_system_labeled()) {
2814		/* Using UDP MLP requires SCM_UCRED from user */
2815		if (connp->conn_mlp_type != mlptSingle &&
2816		    !((ixa->ixa_flags & IXAF_UCRED_TSL))) {
2817			UDPS_BUMP_MIB(us, udpOutErrors);
2818			error = ECONNREFUSED;
2819			freemsg(mp);
2820			goto done;
2821		}
2822		/*
2823		 * Check whether Trusted Solaris policy allows communication
2824		 * with this host, and pretend that the destination is
2825		 * unreachable if not.
2826		 * Compute any needed label and place it in ipp_label_v4/v6.
2827		 *
2828		 * Later conn_build_hdr_template/conn_prepend_hdr takes
2829		 * ipp_label_v4/v6 to form the packet.
2830		 *
2831		 * Tsol note: We have ipp structure local to this thread so
2832		 * no locking is needed.
2833		 */
2834		error = conn_update_label(connp, ixa, &v6dst, ipp);
2835		if (error != 0) {
2836			freemsg(mp);
2837			UDPS_BUMP_MIB(us, udpOutErrors);
2838			goto done;
2839		}
2840	}
2841	mp = udp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, dstport,
2842	    flowinfo, mp, &error);
2843	if (mp == NULL) {
2844		ASSERT(error != 0);
2845		UDPS_BUMP_MIB(us, udpOutErrors);
2846		goto done;
2847	}
2848	if (ixa->ixa_pktlen > IP_MAXPACKET) {
2849		error = EMSGSIZE;
2850		UDPS_BUMP_MIB(us, udpOutErrors);
2851		freemsg(mp);
2852		goto done;
2853	}
2854	/* We're done.  Pass the packet to ip. */
2855	UDPS_BUMP_MIB(us, udpHCOutDatagrams);
2856
2857	DTRACE_UDP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
2858	    void_ip_t *, mp->b_rptr, udp_t *, udp, udpha_t *,
2859	    &mp->b_rptr[ixa->ixa_ip_hdr_length]);
2860
2861	error = conn_ip_output(mp, ixa);
2862	/* No udpOutErrors if an error since IP increases its error counter */
2863	switch (error) {
2864	case 0:
2865		break;
2866	case EWOULDBLOCK:
2867		(void) ixa_check_drain_insert(connp, ixa);
2868		error = 0;
2869		break;
2870	case EADDRNOTAVAIL:
2871		/*
2872		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
2873		 * Don't have the application see that errno
2874		 */
2875		error = ENETUNREACH;
2876		/* FALLTHRU */
2877	default:
2878		mutex_enter(&connp->conn_lock);
2879		/*
2880		 * Clear the source and v6lastdst so we call ip_attr_connect
2881		 * for the next packet and try to pick a better source.
2882		 */
2883		if (connp->conn_mcbc_bind)
2884			connp->conn_saddr_v6 = ipv6_all_zeros;
2885		else
2886			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
2887		connp->conn_v6lastdst = ipv6_all_zeros;
2888		mutex_exit(&connp->conn_lock);
2889		break;
2890	}
2891done:
2892	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2893	ixa->ixa_cred = connp->conn_cred;	/* Restore */
2894	ixa->ixa_cpid = connp->conn_cpid;
2895	ixa_refrele(ixa);
2896	ip_pkt_free(ipp);
2897	kmem_free(ipp, sizeof (*ipp));
2898	return (error);
2899}
2900
2901/*
2902 * Handle sending an M_DATA for a connected socket.
2903 * Handles both IPv4 and IPv6.
2904 */
2905static int
2906udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
2907{
2908	udp_t		*udp = connp->conn_udp;
2909	udp_stack_t	*us = udp->udp_us;
2910	int		error;
2911	ip_xmit_attr_t	*ixa;
2912
2913	/*
2914	 * If no other thread is using conn_ixa this just gets a reference to
2915	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
2916	 */
2917	ixa = conn_get_ixa(connp, B_FALSE);
2918	if (ixa == NULL) {
2919		UDPS_BUMP_MIB(us, udpOutErrors);
2920		freemsg(mp);
2921		return (ENOMEM);
2922	}
2923
2924	ASSERT(cr != NULL);
2925	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2926	ixa->ixa_cred = cr;
2927	ixa->ixa_cpid = pid;
2928
2929	mutex_enter(&connp->conn_lock);
2930	mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_saddr_v6,
2931	    connp->conn_fport, connp->conn_flowinfo, &error);
2932
2933	if (mp == NULL) {
2934		ASSERT(error != 0);
2935		mutex_exit(&connp->conn_lock);
2936		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2937		ixa->ixa_cred = connp->conn_cred;	/* Restore */
2938		ixa->ixa_cpid = connp->conn_cpid;
2939		ixa_refrele(ixa);
2940		UDPS_BUMP_MIB(us, udpOutErrors);
2941		freemsg(mp);
2942		return (error);
2943	}
2944
2945	/*
2946	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
2947	 * safe copy, then we need to fill in any pointers in it.
2948	 */
2949	if (ixa->ixa_ire == NULL) {
2950		in6_addr_t	faddr, saddr;
2951		in6_addr_t	nexthop;
2952		in_port_t	fport;
2953
2954		saddr = connp->conn_saddr_v6;
2955		faddr = connp->conn_faddr_v6;
2956		fport = connp->conn_fport;
2957		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
2958		mutex_exit(&connp->conn_lock);
2959
2960		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
2961		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
2962		    IPDF_IPSEC);
2963		switch (error) {
2964		case 0:
2965			break;
2966		case EADDRNOTAVAIL:
2967			/*
2968			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
2969			 * Don't have the application see that errno
2970			 */
2971			error = ENETUNREACH;
2972			goto failed;
2973		case ENETDOWN:
2974			/*
2975			 * Have !ipif_addr_ready address; drop packet silently
2976			 * until we can get applications to not send until we
2977			 * are ready.
2978			 */
2979			error = 0;
2980			goto failed;
2981		case EHOSTUNREACH:
2982		case ENETUNREACH:
2983			if (ixa->ixa_ire != NULL) {
2984				/*
2985				 * Let conn_ip_output/ire_send_noroute return
2986				 * the error and send any local ICMP error.
2987				 */
2988				error = 0;
2989				break;
2990			}
2991			/* FALLTHRU */
2992		default:
2993		failed:
2994			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
2995			ixa->ixa_cred = connp->conn_cred;	/* Restore */
2996			ixa->ixa_cpid = connp->conn_cpid;
2997			ixa_refrele(ixa);
2998			freemsg(mp);
2999			UDPS_BUMP_MIB(us, udpOutErrors);
3000			return (error);
3001		}
3002	} else {
3003		/* Done with conn_t */
3004		mutex_exit(&connp->conn_lock);
3005	}
3006	ASSERT(ixa->ixa_ire != NULL);
3007
3008	/* We're done.  Pass the packet to ip. */
3009	UDPS_BUMP_MIB(us, udpHCOutDatagrams);
3010
3011	DTRACE_UDP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
3012	    void_ip_t *, mp->b_rptr, udp_t *, udp, udpha_t *,
3013	    &mp->b_rptr[ixa->ixa_ip_hdr_length]);
3014
3015	error = conn_ip_output(mp, ixa);
3016	/* No udpOutErrors if an error since IP increases its error counter */
3017	switch (error) {
3018	case 0:
3019		break;
3020	case EWOULDBLOCK:
3021		(void) ixa_check_drain_insert(connp, ixa);
3022		error = 0;
3023		break;
3024	case EADDRNOTAVAIL:
3025		/*
3026		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3027		 * Don't have the application see that errno
3028		 */
3029		error = ENETUNREACH;
3030		break;
3031	}
3032	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3033	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3034	ixa->ixa_cpid = connp->conn_cpid;
3035	ixa_refrele(ixa);
3036	return (error);
3037}
3038
3039/*
3040 * Handle sending an M_DATA to the last destination.
3041 * Handles both IPv4 and IPv6.
3042 *
3043 * NOTE: The caller must hold conn_lock and we drop it here.
3044 */
3045static int
3046udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3047    ip_xmit_attr_t *ixa)
3048{
3049	udp_t		*udp = connp->conn_udp;
3050	udp_stack_t	*us = udp->udp_us;
3051	int		error;
3052
3053	ASSERT(MUTEX_HELD(&connp->conn_lock));
3054	ASSERT(ixa != NULL);
3055
3056	ASSERT(cr != NULL);
3057	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3058	ixa->ixa_cred = cr;
3059	ixa->ixa_cpid = pid;
3060
3061	mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_v6lastsrc,
3062	    connp->conn_lastdstport, connp->conn_lastflowinfo, &error);
3063
3064	if (mp == NULL) {
3065		ASSERT(error != 0);
3066		mutex_exit(&connp->conn_lock);
3067		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3068		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3069		ixa->ixa_cpid = connp->conn_cpid;
3070		ixa_refrele(ixa);
3071		UDPS_BUMP_MIB(us, udpOutErrors);
3072		freemsg(mp);
3073		return (error);
3074	}
3075
3076	/*
3077	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3078	 * safe copy, then we need to fill in any pointers in it.
3079	 */
3080	if (ixa->ixa_ire == NULL) {
3081		in6_addr_t	lastdst, lastsrc;
3082		in6_addr_t	nexthop;
3083		in_port_t	lastport;
3084
3085		lastsrc = connp->conn_v6lastsrc;
3086		lastdst = connp->conn_v6lastdst;
3087		lastport = connp->conn_lastdstport;
3088		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3089		mutex_exit(&connp->conn_lock);
3090
3091		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3092		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3093		    IPDF_VERIFY_DST | IPDF_IPSEC);
3094		switch (error) {
3095		case 0:
3096			break;
3097		case EADDRNOTAVAIL:
3098			/*
3099			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3100			 * Don't have the application see that errno
3101			 */
3102			error = ENETUNREACH;
3103			goto failed;
3104		case ENETDOWN:
3105			/*
3106			 * Have !ipif_addr_ready address; drop packet silently
3107			 * until we can get applications to not send until we
3108			 * are ready.
3109			 */
3110			error = 0;
3111			goto failed;
3112		case EHOSTUNREACH:
3113		case ENETUNREACH:
3114			if (ixa->ixa_ire != NULL) {
3115				/*
3116				 * Let conn_ip_output/ire_send_noroute return
3117				 * the error and send any local ICMP error.
3118				 */
3119				error = 0;
3120				break;
3121			}
3122			/* FALLTHRU */
3123		default:
3124		failed:
3125			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3126			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3127			ixa->ixa_cpid = connp->conn_cpid;
3128			ixa_refrele(ixa);
3129			freemsg(mp);
3130			UDPS_BUMP_MIB(us, udpOutErrors);
3131			return (error);
3132		}
3133	} else {
3134		/* Done with conn_t */
3135		mutex_exit(&connp->conn_lock);
3136	}
3137
3138	/* We're done.  Pass the packet to ip. */
3139	UDPS_BUMP_MIB(us, udpHCOutDatagrams);
3140
3141	DTRACE_UDP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
3142	    void_ip_t *, mp->b_rptr, udp_t *, udp, udpha_t *,
3143	    &mp->b_rptr[ixa->ixa_ip_hdr_length]);
3144
3145	error = conn_ip_output(mp, ixa);
3146	/* No udpOutErrors if an error since IP increases its error counter */
3147	switch (error) {
3148	case 0:
3149		break;
3150	case EWOULDBLOCK:
3151		(void) ixa_check_drain_insert(connp, ixa);
3152		error = 0;
3153		break;
3154	case EADDRNOTAVAIL:
3155		/*
3156		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3157		 * Don't have the application see that errno
3158		 */
3159		error = ENETUNREACH;
3160		/* FALLTHRU */
3161	default:
3162		mutex_enter(&connp->conn_lock);
3163		/*
3164		 * Clear the source and v6lastdst so we call ip_attr_connect
3165		 * for the next packet and try to pick a better source.
3166		 */
3167		if (connp->conn_mcbc_bind)
3168			connp->conn_saddr_v6 = ipv6_all_zeros;
3169		else
3170			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3171		connp->conn_v6lastdst = ipv6_all_zeros;
3172		mutex_exit(&connp->conn_lock);
3173		break;
3174	}
3175	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3176	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3177	ixa->ixa_cpid = connp->conn_cpid;
3178	ixa_refrele(ixa);
3179	return (error);
3180}
3181
3182
3183/*
3184 * Prepend the header template and then fill in the source and
3185 * flowinfo. The caller needs to handle the destination address since
3186 * it's setting is different if rthdr or source route.
3187 *
3188 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3189 * When it returns NULL it sets errorp.
3190 */
3191static mblk_t *
3192udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3193    const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
3194{
3195	udp_t		*udp = connp->conn_udp;
3196	udp_stack_t	*us = udp->udp_us;
3197	boolean_t	insert_spi = udp->udp_nat_t_endpoint;
3198	uint_t		pktlen;
3199	uint_t		alloclen;
3200	uint_t		copylen;
3201	uint8_t		*iph;
3202	uint_t		ip_hdr_length;
3203	udpha_t		*udpha;
3204	uint32_t	cksum;
3205	ip_pkt_t	*ipp;
3206
3207	ASSERT(MUTEX_HELD(&connp->conn_lock));
3208
3209	/*
3210	 * Copy the header template and leave space for an SPI
3211	 */
3212	copylen = connp->conn_ht_iphc_len;
3213	alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
3214	pktlen = alloclen + msgdsize(mp);
3215	if (pktlen > IP_MAXPACKET) {
3216		freemsg(mp);
3217		*errorp = EMSGSIZE;
3218		return (NULL);
3219	}
3220	ixa->ixa_pktlen = pktlen;
3221
3222	/* check/fix buffer config, setup pointers into it */
3223	iph = mp->b_rptr - alloclen;
3224	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3225		mblk_t *mp1;
3226
3227		mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
3228		if (mp1 == NULL) {
3229			freemsg(mp);
3230			*errorp = ENOMEM;
3231			return (NULL);
3232		}
3233		mp1->b_wptr = DB_LIM(mp1);
3234		mp1->b_cont = mp;
3235		mp = mp1;
3236		iph = (mp->b_wptr - alloclen);
3237	}
3238	mp->b_rptr = iph;
3239	bcopy(connp->conn_ht_iphc, iph, copylen);
3240	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3241
3242	ixa->ixa_ip_hdr_length = ip_hdr_length;
3243	udpha = (udpha_t *)(iph + ip_hdr_length);
3244
3245	/*
3246	 * Setup header length and prepare for ULP checksum done in IP.
3247	 * udp_build_hdr_template has already massaged any routing header
3248	 * and placed the result in conn_sum.
3249	 *
3250	 * We make it easy for IP to include our pseudo header
3251	 * by putting our length in uha_checksum.
3252	 */
3253	cksum = pktlen - ip_hdr_length;
3254	udpha->uha_length = htons(cksum);
3255
3256	cksum += connp->conn_sum;
3257	cksum = (cksum >> 16) + (cksum & 0xFFFF);
3258	ASSERT(cksum < 0x10000);
3259
3260	ipp = &connp->conn_xmit_ipp;
3261	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3262		ipha_t	*ipha = (ipha_t *)iph;
3263
3264		ipha->ipha_length = htons((uint16_t)pktlen);
3265
3266		/* IP does the checksum if uha_checksum is non-zero */
3267		if (us->us_do_checksum)
3268			udpha->uha_checksum = htons(cksum);
3269
3270		/* if IP_PKTINFO specified an addres it wins over bind() */
3271		if ((ipp->ipp_fields & IPPF_ADDR) &&
3272		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3273			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
3274			ipha->ipha_src = ipp->ipp_addr_v4;
3275		} else {
3276			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
3277		}
3278	} else {
3279		ip6_t *ip6h = (ip6_t *)iph;
3280
3281		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
3282		udpha->uha_checksum = htons(cksum);
3283
3284		/* if IP_PKTINFO specified an addres it wins over bind() */
3285		if ((ipp->ipp_fields & IPPF_ADDR) &&
3286		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3287			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3288			ip6h->ip6_src = ipp->ipp_addr;
3289		} else {
3290			ip6h->ip6_src = *v6src;
3291		}
3292		ip6h->ip6_vcf =
3293		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3294		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3295		if (ipp->ipp_fields & IPPF_TCLASS) {
3296			/* Overrides the class part of flowinfo */
3297			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3298			    ipp->ipp_tclass);
3299		}
3300	}
3301
3302	/* Insert all-0s SPI now. */
3303	if (insert_spi)
3304		*((uint32_t *)(udpha + 1)) = 0;
3305
3306	udpha->uha_dst_port = dstport;
3307	return (mp);
3308}
3309
3310/*
3311 * Send a T_UDERR_IND in response to an M_DATA
3312 */
3313static void
3314udp_ud_err_connected(conn_t *connp, t_scalar_t error)
3315{
3316	struct sockaddr_storage ss;
3317	sin_t		*sin;
3318	sin6_t		*sin6;
3319	struct sockaddr	*addr;
3320	socklen_t	addrlen;
3321	mblk_t		*mp1;
3322
3323	mutex_enter(&connp->conn_lock);
3324	/* Initialize addr and addrlen as if they're passed in */
3325	if (connp->conn_family == AF_INET) {
3326		sin = (sin_t *)&ss;
3327		*sin = sin_null;
3328		sin->sin_family = AF_INET;
3329		sin->sin_port = connp->conn_fport;
3330		sin->sin_addr.s_addr = connp->conn_faddr_v4;
3331		addr = (struct sockaddr *)sin;
3332		addrlen = sizeof (*sin);
3333	} else {
3334		sin6 = (sin6_t *)&ss;
3335		*sin6 = sin6_null;
3336		sin6->sin6_family = AF_INET6;
3337		sin6->sin6_port = connp->conn_fport;
3338		sin6->sin6_flowinfo = connp->conn_flowinfo;
3339		sin6->sin6_addr = connp->conn_faddr_v6;
3340		if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6) &&
3341		    (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
3342			sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid;
3343		} else {
3344			sin6->sin6_scope_id = 0;
3345		}
3346		sin6->__sin6_src_id = 0;
3347		addr = (struct sockaddr *)sin6;
3348		addrlen = sizeof (*sin6);
3349	}
3350	mutex_exit(&connp->conn_lock);
3351
3352	mp1 = mi_tpi_uderror_ind((char *)addr, addrlen, NULL, 0, error);
3353	if (mp1 != NULL)
3354		putnext(connp->conn_rq, mp1);
3355}
3356
/*
 * This routine handles all messages passed downstream.  It either
 * consumes the message or passes it downstream; it never queues
 * a message.
 *
 * Also entry point for sockfs when udp is in "direct sockfs" mode.  This mode
 * is valid when we are directly beneath the stream head, and thus sockfs
 * is able to bypass STREAMS and directly call us, passing along the sockaddr
 * structure without the cumbersome T_UNITDATA_REQ interface for the case of
 * connected endpoints.
 *
 * Three kinds of input are distinguished:
 *  - M_DATA: only valid on a connected socket; sent with
 *    udp_output_connected.
 *  - M_PROTO/M_PCPROTO carrying a complete T_UNITDATA_REQ: handled below.
 *  - everything else: passed to udp_wput_other.
 *
 * Always returns 0.  Send failures are not returned to the caller; they are
 * reported through udp_ud_err (T_UNITDATA_REQ) or udp_ud_err_connected
 * (M_DATA).
 */
int
udp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = Q_TO_CONN(q);
	udp_t		*udp = connp->conn_udp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	udp_stack_t	*us = udp->udp_us;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	ushort_t	ipversion;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle several cases here: T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
	 * socket.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		if (!udp->udp_issocket || udp->udp_state != TS_DATA_XFER) {
			/* Not connected; address is required */
			UDPS_BUMP_MIB(us, udpOutErrors);
			UDP_DBGSTAT(us, udp_data_notconn);
			UDP_STAT(us, udp_out_err_notconn);
			freemsg(mp);
			return (0);
		}
		/*
		 * All Solaris components should pass a db_credp
		 * for this message, hence we ASSERT.
		 * On production kernels we return an error to be robust against
		 * random streams modules sitting on top of us.
		 */
		cr = msg_getcred(mp, &pid);
		ASSERT(cr != NULL);
		if (cr == NULL) {
			UDPS_BUMP_MIB(us, udpOutErrors);
			freemsg(mp);
			return (0);
		}
		ASSERT(udp->udp_issocket);
		UDP_DBGSTAT(us, udp_data_conn);
		error = udp_output_connected(connp, mp, cr, pid);
		if (error != 0) {
			UDP_STAT(us, udp_out_err_output);
			/* Report the failure upstream if there is a read queue */
			if (connp->conn_rq != NULL)
				udp_ud_err_connected(connp, (t_scalar_t)error);
#ifdef DEBUG
			printf("udp_output_connected returned %d\n", error);
#endif
		}
		return (0);

	case M_PROTO:
	case M_PCPROTO:
		/*
		 * Only a block large enough to hold a whole T_unitdata_req
		 * and whose primitive type is T_UNITDATA_REQ is handled
		 * here.  The length test comes first so the type field is
		 * only read when it is known to be present.
		 */
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			udp_wput_other(q, mp);
			return (0);
		}
		break;

	default:
		udp_wput_other(q, mp);
		return (0);
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/*
	 * Detach the data from the TPI header.  From here on mp (the header)
	 * and data_mp (the payload) are disposed of separately.
	 */
	mp->b_cont = NULL;

	/* The destination address must lie entirely within the header block */
	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we should ASSERT.
	 * However, RPC (svc_clts_ksend) does this odd thing where it
	 * passes the options from a T_UNITDATA_IND unchanged in a
	 * T_UNITDATA_REQ. While that is the right thing to do for
	 * some options, SCM_UCRED being the key one, this also makes it
	 * pass down IP_RECVDSTADDR. Hence we can't ASSERT here.
	 */
	cr = msg_getcred(mp, &pid);
	if (cr == NULL) {
		/* Fall back to the credential and pid saved in the conn_t */
		cr = connp->conn_cred;
		pid = connp->conn_cpid;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (udp->udp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		srcid = sin6->__sin6_src_id;
		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/*
			 * Destination is a non-IPv4-compatible IPv6 address.
			 * Send out an IPv6 format packet.
			 */

			/*
			 * If the local address is a mapped address return
			 * an error.
			 * It would be possible to send an IPv6 packet but the
			 * response would never make it back to the application
			 * since it is bound to a mapped address.
			 */
			if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
				error = EADDRNOTAVAIL;
				goto ud_error2;
			}

			UDP_DBGSTAT(us, udp_out_ipv6);

			/* An unspecified destination is sent to loopback */
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				sin6->sin6_addr = ipv6_loopback;
			ipversion = IPV6_VERSION;
		} else {
			if (connp->conn_ipv6_v6only) {
				error = EADDRNOTAVAIL;
				goto ud_error2;
			}

			/*
			 * If the local address is not zero or a mapped address
			 * return an error.  It would be possible to send an
			 * IPv4 packet but the response would never make it
			 * back to the application since it is bound to a
			 * non-mapped address.
			 */
			if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) &&
			    !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
				error = EADDRNOTAVAIL;
				goto ud_error2;
			}
			UDP_DBGSTAT(us, udp_out_mapped);

			/* An INADDR_ANY destination is sent to loopback */
			if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) {
				V4_PART_OF_V6(sin6->sin6_addr) =
				    htonl(INADDR_LOOPBACK);
			}
			ipversion = IPV4_VERSION;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (udp->udp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			UDP_STAT(us, udp_out_opt);
			error = udp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				UDP_DBGSTAT(us, udp_out_lastdst);
				/* udp_output_lastdst drops conn_lock */
				error = udp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				UDP_DBGSTAT(us, udp_out_diffdst);
				/* udp_output_newdst drops conn_lock */
				error = udp_output_newdst(connp, data_mp, NULL,
				    sin6, ipversion, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Only the detached TPI header block is left to free */
			freeb(mp);
			return (0);
		}
		break;

	case AF_INET:
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		UDP_DBGSTAT(us, udp_out_ipv4);
		/* An INADDR_ANY destination is sent to loopback */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		ipversion = IPV4_VERSION;

		srcid = 0;
		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (udp->udp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			UDP_STAT(us, udp_out_opt);
			error = udp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				UDP_DBGSTAT(us, udp_out_lastdst);
				/* udp_output_lastdst drops conn_lock */
				error = udp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				UDP_DBGSTAT(us, udp_out_diffdst);
				/* udp_output_newdst drops conn_lock */
				error = udp_output_newdst(connp, data_mp, sin,
				    NULL, ipversion, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Only the detached TPI header block is left to free */
			freeb(mp);
			return (0);
		}
		break;
	}
	/*
	 * An output routine was called and failed.  data_mp is not freed
	 * here; udp_output_lastdst and udp_output_newdst dispose of it on
	 * their failure paths (udp_output_ancillary is presumed to do the
	 * same -- confirm against its definition).
	 */
	UDP_STAT(us, udp_out_err_output);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	udp_ud_err(q, mp, (t_scalar_t)error);
	return (0);

ud_error2:
	/*
	 * Failure before data_mp was handed to any output routine, so it is
	 * still ours to free (it may be NULL, which freemsg is given as-is).
	 */
	UDPS_BUMP_MIB(us, udpOutErrors);
	freemsg(data_mp);
	UDP_STAT(us, udp_out_err_output);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	udp_ud_err(q, mp, (t_scalar_t)error);
	return (0);
}
3670
/*
 * Handle the case of the IP address, port, flow label being different
 * for both IPv4 and IPv6.
 *
 * connp     - the connection; conn_lock must be held on entry.
 * data_mp   - the payload.  On every failure path it is freed here (or has
 *             already been freed by udp_prepend_header_template); otherwise
 *             it is handed to conn_ip_output.
 * sin/sin6  - the destination; sin is checked first and sin6 is only used
 *             when sin is NULL.
 * ipversion - IP version of the outgoing packet, compared against and then
 *             saved in conn_lastipversion.
 * cr/pid    - credential and pid installed in ixa for the duration of the
 *             send; conn_cred/conn_cpid are put back before return.
 * ixa       - transmit attributes.  The caller's reference is released on
 *             every return path.  On the success path it also replaces
 *             conn_ixa (via conn_replace_ixa).
 *
 * Returns 0 or an errno.  Note that 0 is also returned when ip_attr_connect
 * fails with ENETDOWN (the packet is dropped silently) and when
 * conn_ip_output returns EWOULDBLOCK.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
 */
static int
udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
    ushort_t ipversion, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
{
	uint_t		srcid;
	uint32_t	flowinfo;
	udp_t		*udp = connp->conn_udp;
	int		error = 0;
	ip_xmit_attr_t	*oldixa;
	udp_stack_t	*us = udp->udp_us;
	in6_addr_t	v6src;
	in6_addr_t	v6dst;
	in6_addr_t	v6nexthop;
	in_port_t	dstport;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT(ixa != NULL);
	/*
	 * We hold conn_lock across all the use and modifications of
	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
	 * stay consistent.
	 */

	ASSERT(cr != NULL);
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;
	if (is_system_labeled()) {
		/* We need to restart with a label based on the cred */
		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
	}

	/*
	 * If we are connected then the destination needs to be the
	 * same as the connected one, which is not the case here since we
	 * checked for that above.
	 */
	if (udp->udp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/* In case previous destination was multicast or multirt */
	ip_attr_newdst(ixa);

	/*
	 * If laddr is unspecified then we look at sin6_src_id.
	 * We will give precedence to a source address set with IPV6_PKTINFO
	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
	 * want ip_attr_connect to select a source (since it can fail) when
	 * IPV6_PKTINFO is specified.
	 * If this doesn't result in a source address then we get a source
	 * from ip_attr_connect() below.
	 */
	v6src = connp->conn_saddr_v6;
	if (sin != NULL) {
		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
		dstport = sin->sin_port;
		flowinfo = 0;
		/* Don't bother with ip_srcid_find_id(), but indicate anyway. */
		srcid = 0;
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		ixa->ixa_flags |= IXAF_IS_IPV4;
	} else {
		boolean_t v4mapped;

		v6dst = sin6->sin6_addr;
		dstport = sin6->sin6_port;
		flowinfo = sin6->sin6_flowinfo;
		srcid = sin6->__sin6_src_id;
		/* A scope id is only honored for a link-scope destination */
		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
			ixa->ixa_scopeid = sin6->sin6_scope_id;
			ixa->ixa_flags |= IXAF_SCOPEID_SET;
		} else {
			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		}
		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
		if (v4mapped)
			ixa->ixa_flags |= IXAF_IS_IPV4;
		else
			ixa->ixa_flags &= ~IXAF_IS_IPV4;
		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    v4mapped, connp->conn_netstack)) {
				/* Mismatched v4mapped/v6 specified by srcid. */
				mutex_exit(&connp->conn_lock);
				error = EADDRNOTAVAIL;
				goto ud_error;
			}
		}
	}
	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
	if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
		ip_pkt_t *ipp = &connp->conn_xmit_ipp;

		/* Only use it when its family matches the outgoing packet */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		} else {
			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		}
	}

	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
	/* ip_attr_connect is called without conn_lock held */
	mutex_exit(&connp->conn_lock);

	/* v6src is both the requested source and where the result is stored */
	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC);
	switch (error) {
	case 0:
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		goto failed;
	case ENETDOWN:
		/*
		 * Have !ipif_addr_ready address; drop packet silently
		 * until we can get applications to not send until we
		 * are ready.
		 */
		error = 0;
		goto failed;
	case EHOSTUNREACH:
	case ENETUNREACH:
		if (ixa->ixa_ire != NULL) {
			/*
			 * Let conn_ip_output/ire_send_noroute return
			 * the error and send any local ICMP error.
			 */
			error = 0;
			break;
		}
		/* FALLTHRU */
	failed:
	default:
		goto ud_error;
	}


	/*
	 * Cluster note: we let the cluster hook know that we are sending to a
	 * new address and/or port.
	 */
	if (cl_inet_connect2 != NULL) {
		CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error);
		if (error != 0) {
			error = EHOSTUNREACH;
			goto ud_error;
		}
	}

	mutex_enter(&connp->conn_lock);
	/*
	 * While we dropped the lock some other thread might have connected
	 * this socket. If so we bail out with EISCONN to ensure that the
	 * connecting thread is the one that updates conn_ixa, conn_ht_*
	 * and conn_*last*.
	 */
	if (udp->udp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/*
	 * We need to rebuild the headers if
	 *  - we are labeling packets (could be different for different
	 *    destinations)
	 *  - we have a source route (or routing header) since we need to
	 *    massage that to get the pseudo-header checksum
	 *  - the IP version is different than the last time
	 *  - a socket option with COA_HEADER_CHANGED has been set which
	 *    set conn_v6lastdst to zero.
	 *
	 * Otherwise the prepend function will just update the src, dst,
	 * dstport, and flow label.
	 */
	if (is_system_labeled()) {
		/* TX MLP requires SCM_UCRED and don't have that here */
		if (connp->conn_mlp_type != mlptSingle) {
			mutex_exit(&connp->conn_lock);
			error = ECONNREFUSED;
			goto ud_error;
		}
		/*
		 * Check whether Trusted Solaris policy allows communication
		 * with this host, and pretend that the destination is
		 * unreachable if not.
		 * Compute any needed label and place it in ipp_label_v4/v6.
		 *
		 * Later conn_build_hdr_template/conn_prepend_hdr takes
		 * ipp_label_v4/v6 to form the packet.
		 *
		 * Tsol note: Since we hold conn_lock we know no other
		 * thread manipulates conn_xmit_ipp.
		 */
		error = conn_update_label(connp, ixa, &v6dst,
		    &connp->conn_xmit_ipp);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
		/* Rebuild the header template */
		error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else if ((connp->conn_xmit_ipp.ipp_fields &
	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR)) ||
	    ipversion != connp->conn_lastipversion ||
	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
		/* Rebuild the header template */
		error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else {
		/* Simply update the destination address if no source route */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;

			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
			/* Keep the template's DF bit in step with the ixa */
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
		} else {
			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
			ip6h->ip6_dst = v6dst;
		}
	}

	/*
	 * Remember the dst/dstport etc which corresponds to the built header
	 * template and conn_ixa.
	 */
	oldixa = conn_replace_ixa(connp, ixa);
	connp->conn_v6lastdst = v6dst;
	connp->conn_lastipversion = ipversion;
	connp->conn_lastdstport = dstport;
	connp->conn_lastflowinfo = flowinfo;
	connp->conn_lastscopeid = ixa->ixa_scopeid;
	connp->conn_lastsrcid = srcid;
	/* Also remember a source to use together with lastdst */
	connp->conn_v6lastsrc = v6src;

	/* On failure this frees data_mp, returns NULL and sets error */
	data_mp = udp_prepend_header_template(connp, ixa, data_mp, &v6src,
	    dstport, flowinfo, &error);

	/* Done with conn_t */
	mutex_exit(&connp->conn_lock);
	ixa_refrele(oldixa);

	if (data_mp == NULL) {
		ASSERT(error != 0);
		goto ud_error;
	}

	/* We're done.  Pass the packet to ip. */
	UDPS_BUMP_MIB(us, udpHCOutDatagrams);

	DTRACE_UDP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
	    void_ip_t *, data_mp->b_rptr, udp_t *, udp, udpha_t *,
	    &data_mp->b_rptr[ixa->ixa_ip_hdr_length]);

	error = conn_ip_output(data_mp, ixa);
	/* No udpOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);

ud_error:
	/*
	 * Common failure exit.  conn_lock has already been dropped by
	 * whoever jumped here.  data_mp may be NULL if
	 * udp_prepend_header_template already freed it.
	 */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);

	freemsg(data_mp);
	UDPS_BUMP_MIB(us, udpOutErrors);
	UDP_STAT(us, udp_out_err_output);
	return (error);
}
4002
/*
 * Discard a message: the message is freed unconditionally and 0 is
 * returned.  On DEBUG kernels a console message is logged first.  The queue
 * argument is not used.
 * NOTE(review): judging by the name this is the write put routine used
 * while a socket is falling back to TPI -- confirm against where it is
 * installed.
 */
/* ARGSUSED */
static int
udp_wput_fallback(queue_t *wq, mblk_t *mp)
{
#ifdef DEBUG
	cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n");
#endif
	freemsg(mp);
	return (0);
}
4013
4014
4015/*
4016 * Handle special out-of-band ioctl requests (see PSARC/2008/265).
4017 */
4018static void
4019udp_wput_cmdblk(queue_t *q, mblk_t *mp)
4020{
4021	void	*data;
4022	mblk_t	*datamp = mp->b_cont;
4023	conn_t	*connp = Q_TO_CONN(q);
4024	udp_t	*udp = connp->conn_udp;
4025	cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
4026
4027	if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
4028		cmdp->cb_error = EPROTO;
4029		qreply(q, mp);
4030		return;
4031	}
4032	data = datamp->b_rptr;
4033
4034	mutex_enter(&connp->conn_lock);
4035	switch (cmdp->cb_cmd) {
4036	case TI_GETPEERNAME:
4037		if (udp->udp_state != TS_DATA_XFER)
4038			cmdp->cb_error = ENOTCONN;
4039		else
4040			cmdp->cb_error = conn_getpeername(connp, data,
4041			    &cmdp->cb_len);
4042		break;
4043	case TI_GETMYNAME:
4044		cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
4045		break;
4046	default:
4047		cmdp->cb_error = EINVAL;
4048		break;
4049	}
4050	mutex_exit(&connp->conn_lock);
4051
4052	qreply(q, mp);
4053}
4054
4055static void
4056udp_use_pure_tpi(udp_t *udp)
4057{
4058	conn_t	*connp = udp->udp_connp;
4059
4060	mutex_enter(&connp->conn_lock);
4061	udp->udp_issocket = B_FALSE;
4062	mutex_exit(&connp->conn_lock);
4063	UDP_STAT(udp->udp_us, udp_sock_fallback);
4064}
4065
4066static void
4067udp_wput_other(queue_t *q, mblk_t *mp)
4068{
4069	uchar_t	*rptr = mp->b_rptr;
4070	struct iocblk *iocp;
4071	conn_t	*connp = Q_TO_CONN(q);
4072	udp_t	*udp = connp->conn_udp;
4073	cred_t	*cr;
4074
4075	switch (mp->b_datap->db_type) {
4076	case M_CMD:
4077		udp_wput_cmdblk(q, mp);
4078		return;
4079
4080	case M_PROTO:
4081	case M_PCPROTO:
4082		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4083			/*
4084			 * If the message does not contain a PRIM_type,
4085			 * throw it away.
4086			 */
4087			freemsg(mp);
4088			return;
4089		}
4090		switch (((t_primp_t)rptr)->type) {
4091		case T_ADDR_REQ:
4092			udp_addr_req(q, mp);
4093			return;
4094		case O_T_BIND_REQ:
4095		case T_BIND_REQ:
4096			udp_tpi_bind(q, mp);
4097			return;
4098		case T_CONN_REQ:
4099			udp_tpi_connect(q, mp);
4100			return;
4101		case T_CAPABILITY_REQ:
4102			udp_capability_req(q, mp);
4103			return;
4104		case T_INFO_REQ:
4105			udp_info_req(q, mp);
4106			return;
4107		case T_UNITDATA_REQ:
4108			/*
4109			 * If a T_UNITDATA_REQ gets here, the address must
4110			 * be bad.  Valid T_UNITDATA_REQs are handled
4111			 * in udp_wput.
4112			 */
4113			udp_ud_err(q, mp, EADDRNOTAVAIL);
4114			return;
4115		case T_UNBIND_REQ:
4116			udp_tpi_unbind(q, mp);
4117			return;
4118		case T_SVR4_OPTMGMT_REQ:
4119			/*
4120			 * All Solaris components should pass a db_credp
4121			 * for this TPI message, hence we ASSERT.
4122			 * But in case there is some other M_PROTO that looks
4123			 * like a TPI message sent by some other kernel
4124			 * component, we check and return an error.
4125			 */
4126			cr = msg_getcred(mp, NULL);
4127			ASSERT(cr != NULL);
4128			if (cr == NULL) {
4129				udp_err_ack(q, mp, TSYSERR, EINVAL);
4130				return;
4131			}
4132			if (!snmpcom_req(q, mp, udp_snmp_set, ip_snmp_get,
4133			    cr)) {
4134				svr4_optcom_req(q, mp, cr, &udp_opt_obj);
4135			}
4136			return;
4137
4138		case T_OPTMGMT_REQ:
4139			/*
4140			 * All Solaris components should pass a db_credp
4141			 * for this TPI message, hence we ASSERT.
4142			 * But in case there is some other M_PROTO that looks
4143			 * like a TPI message sent by some other kernel
4144			 * component, we check and return an error.
4145			 */
4146			cr = msg_getcred(mp, NULL);
4147			ASSERT(cr != NULL);
4148			if (cr == NULL) {
4149				udp_err_ack(q, mp, TSYSERR, EINVAL);
4150				return;
4151			}
4152			tpi_optcom_req(q, mp, cr, &udp_opt_obj);
4153			return;
4154
4155		case T_DISCON_REQ:
4156			udp_tpi_disconnect(q, mp);
4157			return;
4158
4159		/* The following TPI message is not supported by udp. */
4160		case O_T_CONN_RES:
4161		case T_CONN_RES:
4162			udp_err_ack(q, mp, TNOTSUPPORT, 0);
4163			return;
4164
4165		/* The following 3 TPI requests are illegal for udp. */
4166		case T_DATA_REQ:
4167		case T_EXDATA_REQ:
4168		case T_ORDREL_REQ:
4169			udp_err_ack(q, mp, TNOTSUPPORT, 0);
4170			return;
4171		default:
4172			break;
4173		}
4174		break;
4175	case M_FLUSH:
4176		if (*rptr & FLUSHW)
4177			flushq(q, FLUSHDATA);
4178		break;
4179	case M_IOCTL:
4180		iocp = (struct iocblk *)mp->b_rptr;
4181		switch (iocp->ioc_cmd) {
4182		case TI_GETPEERNAME:
4183			if (udp->udp_state != TS_DATA_XFER) {
4184				/*
4185				 * If a default destination address has not
4186				 * been associated with the stream, then we
4187				 * don't know the peer's name.
4188				 */
4189				iocp->ioc_error = ENOTCONN;
4190				iocp->ioc_count = 0;
4191				mp->b_datap->db_type = M_IOCACK;
4192				qreply(q, mp);
4193				return;
4194			}
4195			/* FALLTHRU */
4196		case TI_GETMYNAME:
4197			/*
4198			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4199			 * need to copyin the user's strbuf structure.
4200			 * Processing will continue in the M_IOCDATA case
4201			 * below.
4202			 */
4203			mi_copyin(q, mp, NULL,
4204			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4205			return;
4206		case _SIOCSOCKFALLBACK:
4207			/*
4208			 * Either sockmod is about to be popped and the
4209			 * socket would now be treated as a plain stream,
4210			 * or a module is about to be pushed so we have
4211			 * to follow pure TPI semantics.
4212			 */
4213			if (!udp->udp_issocket) {
4214				DB_TYPE(mp) = M_IOCNAK;
4215				iocp->ioc_error = EINVAL;
4216			} else {
4217				udp_use_pure_tpi(udp);
4218
4219				DB_TYPE(mp) = M_IOCACK;
4220				iocp->ioc_error = 0;
4221			}
4222			iocp->ioc_count = 0;
4223			iocp->ioc_rval = 0;
4224			qreply(q, mp);
4225			return;
4226		default:
4227			break;
4228		}
4229		break;
4230	case M_IOCDATA:
4231		udp_wput_iocdata(q, mp);
4232		return;
4233	default:
4234		/* Unrecognized messages are passed through without change. */
4235		break;
4236	}
4237	ip_wput_nondata(q, mp);
4238}
4239
4240/*
4241 * udp_wput_iocdata is called by udp_wput_other to handle all M_IOCDATA
4242 * messages.
4243 */
4244static void
4245udp_wput_iocdata(queue_t *q, mblk_t *mp)
4246{
4247	mblk_t		*mp1;
4248	struct	iocblk *iocp = (struct iocblk *)mp->b_rptr;
4249	STRUCT_HANDLE(strbuf, sb);
4250	uint_t		addrlen;
4251	conn_t		*connp = Q_TO_CONN(q);
4252	udp_t		*udp = connp->conn_udp;
4253
4254	/* Make sure it is one of ours. */
4255	switch (iocp->ioc_cmd) {
4256	case TI_GETMYNAME:
4257	case TI_GETPEERNAME:
4258		break;
4259	default:
4260		ip_wput_nondata(q, mp);
4261		return;
4262	}
4263
4264	switch (mi_copy_state(q, mp, &mp1)) {
4265	case -1:
4266		return;
4267	case MI_COPY_CASE(MI_COPY_IN, 1):
4268		break;
4269	case MI_COPY_CASE(MI_COPY_OUT, 1):
4270		/*
4271		 * The address has been copied out, so now
4272		 * copyout the strbuf.
4273		 */
4274		mi_copyout(q, mp);
4275		return;
4276	case MI_COPY_CASE(MI_COPY_OUT, 2):
4277		/*
4278		 * The address and strbuf have been copied out.
4279		 * We're done, so just acknowledge the original
4280		 * M_IOCTL.
4281		 */
4282		mi_copy_done(q, mp, 0);
4283		return;
4284	default:
4285		/*
4286		 * Something strange has happened, so acknowledge
4287		 * the original M_IOCTL with an EPROTO error.
4288		 */
4289		mi_copy_done(q, mp, EPROTO);
4290		return;
4291	}
4292
4293	/*
4294	 * Now we have the strbuf structure for TI_GETMYNAME
4295	 * and TI_GETPEERNAME.  Next we copyout the requested
4296	 * address and then we'll copyout the strbuf.
4297	 */
4298	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
4299
4300	if (connp->conn_family == AF_INET)
4301		addrlen = sizeof (sin_t);
4302	else
4303		addrlen = sizeof (sin6_t);
4304
4305	if (STRUCT_FGET(sb, maxlen) < addrlen) {
4306		mi_copy_done(q, mp, EINVAL);
4307		return;
4308	}
4309
4310	switch (iocp->ioc_cmd) {
4311	case TI_GETMYNAME:
4312		break;
4313	case TI_GETPEERNAME:
4314		if (udp->udp_state != TS_DATA_XFER) {
4315			mi_copy_done(q, mp, ENOTCONN);
4316			return;
4317		}
4318		break;
4319	}
4320	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4321	if (!mp1)
4322		return;
4323
4324	STRUCT_FSET(sb, len, addrlen);
4325	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4326	case TI_GETMYNAME:
4327		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4328		    &addrlen);
4329		break;
4330	case TI_GETPEERNAME:
4331		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4332		    &addrlen);
4333		break;
4334	}
4335	mp1->b_wptr += addrlen;
4336	/* Copy out the address */
4337	mi_copyout(q, mp);
4338}
4339
4340void
4341udp_ddi_g_init(void)
4342{
4343	udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr,
4344	    udp_opt_obj.odb_opt_arr_cnt);
4345
4346	/*
4347	 * We want to be informed each time a stack is created or
4348	 * destroyed in the kernel, so we can maintain the
4349	 * set of udp_stack_t's.
4350	 */
4351	netstack_register(NS_UDP, udp_stack_init, NULL, udp_stack_fini);
4352}
4353
4354void
4355udp_ddi_g_destroy(void)
4356{
4357	netstack_unregister(NS_UDP);
4358}
4359
4360#define	INET_NAME	"ip"
4361
4362/*
4363 * Initialize the UDP stack instance.
4364 */
4365static void *
4366udp_stack_init(netstackid_t stackid, netstack_t *ns)
4367{
4368	udp_stack_t	*us;
4369	int		i;
4370	int		error = 0;
4371	major_t		major;
4372	size_t		arrsz;
4373
4374	us = (udp_stack_t *)kmem_zalloc(sizeof (*us), KM_SLEEP);
4375	us->us_netstack = ns;
4376
4377	mutex_init(&us->us_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL);
4378	us->us_num_epriv_ports = UDP_NUM_EPRIV_PORTS;
4379	us->us_epriv_ports[0] = ULP_DEF_EPRIV_PORT1;
4380	us->us_epriv_ports[1] = ULP_DEF_EPRIV_PORT2;
4381
4382	/*
4383	 * The smallest anonymous port in the priviledged port range which UDP
4384	 * looks for free port.  Use in the option UDP_ANONPRIVBIND.
4385	 */
4386	us->us_min_anonpriv_port = 512;
4387
4388	us->us_bind_fanout_size = udp_bind_fanout_size;
4389
4390	/* Roundup variable that might have been modified in /etc/system */
4391	if (!ISP2(us->us_bind_fanout_size)) {
4392		/* Not a power of two. Round up to nearest power of two */
4393		for (i = 0; i < 31; i++) {
4394			if (us->us_bind_fanout_size < (1 << i))
4395				break;
4396		}
4397		us->us_bind_fanout_size = 1 << i;
4398	}
4399	us->us_bind_fanout = kmem_zalloc(us->us_bind_fanout_size *
4400	    sizeof (udp_fanout_t), KM_SLEEP);
4401	for (i = 0; i < us->us_bind_fanout_size; i++) {
4402		mutex_init(&us->us_bind_fanout[i].uf_lock, NULL, MUTEX_DEFAULT,
4403		    NULL);
4404	}
4405
4406	arrsz = udp_propinfo_count * sizeof (mod_prop_info_t);
4407	us->us_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz,
4408	    KM_SLEEP);
4409	bcopy(udp_propinfo_tbl, us->us_propinfo_tbl, arrsz);
4410
4411	/* Allocate the per netstack stats */
4412	mutex_enter(&cpu_lock);
4413	us->us_sc_cnt = MAX(ncpus, boot_ncpus);
4414	mutex_exit(&cpu_lock);
4415	us->us_sc = kmem_zalloc(max_ncpus  * sizeof (udp_stats_cpu_t *),
4416	    KM_SLEEP);
4417	for (i = 0; i < us->us_sc_cnt; i++) {
4418		us->us_sc[i] = kmem_zalloc(sizeof (udp_stats_cpu_t),
4419		    KM_SLEEP);
4420	}
4421
4422	us->us_kstat = udp_kstat2_init(stackid);
4423	us->us_mibkp = udp_kstat_init(stackid);
4424
4425	major = mod_name_to_major(INET_NAME);
4426	error = ldi_ident_from_major(major, &us->us_ldi_ident);
4427	ASSERT(error == 0);
4428	return (us);
4429}
4430
4431/*
4432 * Free the UDP stack instance.
4433 */
4434static void
4435udp_stack_fini(netstackid_t stackid, void *arg)
4436{
4437	udp_stack_t *us = (udp_stack_t *)arg;
4438	int i;
4439
4440	for (i = 0; i < us->us_bind_fanout_size; i++) {
4441		mutex_destroy(&us->us_bind_fanout[i].uf_lock);
4442	}
4443
4444	kmem_free(us->us_bind_fanout, us->us_bind_fanout_size *
4445	    sizeof (udp_fanout_t));
4446
4447	us->us_bind_fanout = NULL;
4448
4449	for (i = 0; i < us->us_sc_cnt; i++)
4450		kmem_free(us->us_sc[i], sizeof (udp_stats_cpu_t));
4451	kmem_free(us->us_sc, max_ncpus * sizeof (udp_stats_cpu_t *));
4452
4453	kmem_free(us->us_propinfo_tbl,
4454	    udp_propinfo_count * sizeof (mod_prop_info_t));
4455	us->us_propinfo_tbl = NULL;
4456
4457	udp_kstat_fini(stackid, us->us_mibkp);
4458	us->us_mibkp = NULL;
4459
4460	udp_kstat2_fini(stackid, us->us_kstat);
4461	us->us_kstat = NULL;
4462
4463	mutex_destroy(&us->us_epriv_port_lock);
4464	ldi_ident_release(us->us_ldi_ident);
4465	kmem_free(us, sizeof (*us));
4466}
4467
4468static size_t
4469udp_set_rcv_hiwat(udp_t *udp, size_t size)
4470{
4471	udp_stack_t *us = udp->udp_us;
4472
4473	/* We add a bit of extra buffering */
4474	size += size >> 1;
4475	if (size > us->us_max_buf)
4476		size = us->us_max_buf;
4477
4478	udp->udp_rcv_hiwat = size;
4479	return (size);
4480}
4481
4482/*
4483 * For the lower queue so that UDP can be a dummy mux.
4484 * Nobody should be sending
4485 * packets up this stream
4486 */
4487static int
4488udp_lrput(queue_t *q, mblk_t *mp)
4489{
4490	switch (mp->b_datap->db_type) {
4491	case M_FLUSH:
4492		/* Turn around */
4493		if (*mp->b_rptr & FLUSHW) {
4494			*mp->b_rptr &= ~FLUSHR;
4495			qreply(q, mp);
4496			return (0);
4497		}
4498		break;
4499	}
4500	freemsg(mp);
4501	return (0);
4502}
4503
4504/*
4505 * For the lower queue so that UDP can be a dummy mux.
4506 * Nobody should be sending packets down this stream.
4507 */
4508/* ARGSUSED */
4509int
4510udp_lwput(queue_t *q, mblk_t *mp)
4511{
4512	freemsg(mp);
4513	return (0);
4514}
4515
4516/*
4517 * When a CPU is added, we need to allocate the per CPU stats struct.
4518 */
4519void
4520udp_stack_cpu_add(udp_stack_t *us, processorid_t cpu_seqid)
4521{
4522	int i;
4523
4524	if (cpu_seqid < us->us_sc_cnt)
4525		return;
4526	for (i = us->us_sc_cnt; i <= cpu_seqid; i++) {
4527		ASSERT(us->us_sc[i] == NULL);
4528		us->us_sc[i] = kmem_zalloc(sizeof (udp_stats_cpu_t),
4529		    KM_SLEEP);
4530	}
4531	membar_producer();
4532	us->us_sc_cnt = cpu_seqid + 1;
4533}
4534
4535/*
 * The routines below are for the UDP socket module.
4537 */
4538
4539static conn_t *
4540udp_do_open(cred_t *credp, boolean_t isv6, int flags, int *errorp)
4541{
4542	udp_t		*udp;
4543	conn_t		*connp;
4544	zoneid_t	zoneid;
4545	netstack_t	*ns;
4546	udp_stack_t	*us;
4547	int		len;
4548
4549	ASSERT(errorp != NULL);
4550
4551	if ((*errorp = secpolicy_basic_net_access(credp)) != 0)
4552		return (NULL);
4553
4554	ns = netstack_find_by_cred(credp);
4555	ASSERT(ns != NULL);
4556	us = ns->netstack_udp;
4557	ASSERT(us != NULL);
4558
4559	/*
4560	 * For exclusive stacks we set the zoneid to zero
4561	 * to make UDP operate as if in the global zone.
4562	 */
4563	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
4564		zoneid = GLOBAL_ZONEID;
4565	else
4566		zoneid = crgetzoneid(credp);
4567
4568	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
4569
4570	connp = ipcl_conn_create(IPCL_UDPCONN, flags, ns);
4571	if (connp == NULL) {
4572		netstack_rele(ns);
4573		*errorp = ENOMEM;
4574		return (NULL);
4575	}
4576	udp = connp->conn_udp;
4577
4578	/*
4579	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
4580	 * done by netstack_find_by_cred()
4581	 */
4582	netstack_rele(ns);
4583
4584	/*
4585	 * Since this conn_t/udp_t is not yet visible to anybody else we don't
4586	 * need to lock anything.
4587	 */
4588	ASSERT(connp->conn_proto == IPPROTO_UDP);
4589	ASSERT(connp->conn_udp == udp);
4590	ASSERT(udp->udp_connp == connp);
4591
4592	/* Set the initial state of the stream and the privilege status. */
4593	udp->udp_state = TS_UNBND;
4594	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
4595	if (isv6) {
4596		connp->conn_family = AF_INET6;
4597		connp->conn_ipversion = IPV6_VERSION;
4598		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
4599		connp->conn_default_ttl = us->us_ipv6_hoplimit;
4600		len = sizeof (ip6_t) + UDPH_SIZE;
4601	} else {
4602		connp->conn_family = AF_INET;
4603		connp->conn_ipversion = IPV4_VERSION;
4604		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
4605		connp->conn_default_ttl = us->us_ipv4_ttl;
4606		len = sizeof (ipha_t) + UDPH_SIZE;
4607	}
4608
4609	ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
4610	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
4611
4612	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
4613	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
4614	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
4615	connp->conn_ixa->ixa_zoneid = zoneid;
4616
4617	connp->conn_zoneid = zoneid;
4618
4619	/*
4620	 * If the caller has the process-wide flag set, then default to MAC
4621	 * exempt mode.  This allows read-down to unlabeled hosts.
4622	 */
4623	if (getpflags(NET_MAC_AWARE, credp) != 0)
4624		connp->conn_mac_mode = CONN_MAC_AWARE;
4625
4626	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
4627
4628	udp->udp_us = us;
4629
4630	connp->conn_rcvbuf = us->us_recv_hiwat;
4631	connp->conn_sndbuf = us->us_xmit_hiwat;
4632	connp->conn_sndlowat = us->us_xmit_lowat;
4633	connp->conn_rcvlowat = udp_mod_info.mi_lowat;
4634
4635	connp->conn_wroff = len + us->us_wroff_extra;
4636	connp->conn_so_type = SOCK_DGRAM;
4637
4638	connp->conn_recv = udp_input;
4639	connp->conn_recvicmp = udp_icmp_input;
4640	crhold(credp);
4641	connp->conn_cred = credp;
4642	connp->conn_cpid = curproc->p_pid;
4643	connp->conn_open_time = ddi_get_lbolt64();
4644	/* Cache things in ixa without an extra refhold */
4645	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
4646	connp->conn_ixa->ixa_cred = connp->conn_cred;
4647	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
4648	if (is_system_labeled())
4649		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
4650
4651	*((sin6_t *)&udp->udp_delayed_addr) = sin6_null;
4652
4653	if (us->us_pmtu_discovery)
4654		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
4655
4656	return (connp);
4657}
4658
4659sock_lower_handle_t
4660udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
4661    uint_t *smodep, int *errorp, int flags, cred_t *credp)
4662{
4663	udp_t		*udp = NULL;
4664	udp_stack_t	*us;
4665	conn_t		*connp;
4666	boolean_t	isv6;
4667
4668	if (type != SOCK_DGRAM || (family != AF_INET && family != AF_INET6) ||
4669	    (proto != 0 && proto != IPPROTO_UDP)) {
4670		*errorp = EPROTONOSUPPORT;
4671		return (NULL);
4672	}
4673
4674	if (family == AF_INET6)
4675		isv6 = B_TRUE;
4676	else
4677		isv6 = B_FALSE;
4678
4679	connp = udp_do_open(credp, isv6, flags, errorp);
4680	if (connp == NULL)
4681		return (NULL);
4682
4683	udp = connp->conn_udp;
4684	ASSERT(udp != NULL);
4685	us = udp->udp_us;
4686	ASSERT(us != NULL);
4687
4688	udp->udp_issocket = B_TRUE;
4689	connp->conn_flags |= IPCL_NONSTR;
4690
4691	/*
4692	 * Set flow control
4693	 * Since this conn_t/udp_t is not yet visible to anybody else we don't
4694	 * need to lock anything.
4695	 */
4696	(void) udp_set_rcv_hiwat(udp, connp->conn_rcvbuf);
4697	udp->udp_rcv_disply_hiwat = connp->conn_rcvbuf;
4698
4699	connp->conn_flow_cntrld = B_FALSE;
4700
4701	mutex_enter(&connp->conn_lock);
4702	connp->conn_state_flags &= ~CONN_INCIPIENT;
4703	mutex_exit(&connp->conn_lock);
4704
4705	*errorp = 0;
4706	*smodep = SM_ATOMIC;
4707	*sock_downcalls = &sock_udp_downcalls;
4708	return ((sock_lower_handle_t)connp);
4709}
4710
4711/* ARGSUSED3 */
4712void
4713udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
4714    sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
4715{
4716	conn_t		*connp = (conn_t *)proto_handle;
4717	struct sock_proto_props sopp;
4718
4719	/* All Solaris components should pass a cred for this operation. */
4720	ASSERT(cr != NULL);
4721
4722	connp->conn_upcalls = sock_upcalls;
4723	connp->conn_upper_handle = sock_handle;
4724
4725	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
4726	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
4727	sopp.sopp_wroff = connp->conn_wroff;
4728	sopp.sopp_maxblk = INFPSZ;
4729	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
4730	sopp.sopp_rxlowat = connp->conn_rcvlowat;
4731	sopp.sopp_maxaddrlen = sizeof (sin6_t);
4732	sopp.sopp_maxpsz =
4733	    (connp->conn_family == AF_INET) ? UDP_MAXPACKET_IPV4 :
4734	    UDP_MAXPACKET_IPV6;
4735	sopp.sopp_minpsz = (udp_mod_info.mi_minpsz == 1) ? 0 :
4736	    udp_mod_info.mi_minpsz;
4737
4738	(*connp->conn_upcalls->su_set_proto_props)(connp->conn_upper_handle,
4739	    &sopp);
4740}
4741
4742static void
4743udp_do_close(conn_t *connp)
4744{
4745	udp_t	*udp;
4746
4747	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
4748	udp = connp->conn_udp;
4749
4750	if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
4751		/*
4752		 * Running in cluster mode - register unbind information
4753		 */
4754		if (connp->conn_ipversion == IPV4_VERSION) {
4755			(*cl_inet_unbind)(
4756			    connp->conn_netstack->netstack_stackid,
4757			    IPPROTO_UDP, AF_INET,
4758			    (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)),
4759			    (in_port_t)connp->conn_lport, NULL);
4760		} else {
4761			(*cl_inet_unbind)(
4762			    connp->conn_netstack->netstack_stackid,
4763			    IPPROTO_UDP, AF_INET6,
4764			    (uint8_t *)&(connp->conn_laddr_v6),
4765			    (in_port_t)connp->conn_lport, NULL);
4766		}
4767	}
4768
4769	udp_bind_hash_remove(udp, B_FALSE);
4770
4771	ip_quiesce_conn(connp);
4772
4773	if (!IPCL_IS_NONSTR(connp)) {
4774		ASSERT(connp->conn_wq != NULL);
4775		ASSERT(connp->conn_rq != NULL);
4776		qprocsoff(connp->conn_rq);
4777	}
4778
4779	udp_close_free(connp);
4780
4781	/*
4782	 * Now we are truly single threaded on this stream, and can
4783	 * delete the things hanging off the connp, and finally the connp.
4784	 * We removed this connp from the fanout list, it cannot be
4785	 * accessed thru the fanouts, and we already waited for the
4786	 * conn_ref to drop to 0. We are already in close, so
4787	 * there cannot be any other thread from the top. qprocsoff
4788	 * has completed, and service has completed or won't run in
4789	 * future.
4790	 */
4791	ASSERT(connp->conn_ref == 1);
4792
4793	if (!IPCL_IS_NONSTR(connp)) {
4794		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4795	} else {
4796		ip_free_helper_stream(connp);
4797	}
4798
4799	connp->conn_ref--;
4800	ipcl_conn_destroy(connp);
4801}
4802
4803/* ARGSUSED1 */
4804int
4805udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
4806{
4807	conn_t	*connp = (conn_t *)proto_handle;
4808
4809	/* All Solaris components should pass a cred for this operation. */
4810	ASSERT(cr != NULL);
4811
4812	udp_do_close(connp);
4813	return (0);
4814}
4815
4816static int
4817udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
4818    boolean_t bind_to_req_port_only)
4819{
4820	sin_t		*sin;
4821	sin6_t		*sin6;
4822	udp_t		*udp = connp->conn_udp;
4823	int		error = 0;
4824	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
4825	in_port_t	port;		/* Host byte order */
4826	in_port_t	requested_port;	/* Host byte order */
4827	int		count;
4828	ipaddr_t	v4src;		/* Set if AF_INET */
4829	in6_addr_t	v6src;
4830	int		loopmax;
4831	udp_fanout_t	*udpf;
4832	in_port_t	lport;		/* Network byte order */
4833	uint_t		scopeid = 0;
4834	zoneid_t	zoneid = IPCL_ZONEID(connp);
4835	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
4836	boolean_t	is_inaddr_any;
4837	mlp_type_t	addrtype, mlptype;
4838	udp_stack_t	*us = udp->udp_us;
4839
4840	sin = NULL;
4841	sin6 = NULL;
4842	switch (len) {
4843	case sizeof (sin_t):	/* Complete IPv4 address */
4844		sin = (sin_t *)sa;
4845
4846		if (sin == NULL || !OK_32PTR((char *)sin))
4847			return (EINVAL);
4848
4849		if (connp->conn_family != AF_INET ||
4850		    sin->sin_family != AF_INET) {
4851			return (EAFNOSUPPORT);
4852		}
4853		v4src = sin->sin_addr.s_addr;
4854		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
4855		if (v4src != INADDR_ANY) {
4856			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
4857			    B_TRUE);
4858		}
4859		port = ntohs(sin->sin_port);
4860		break;
4861
4862	case sizeof (sin6_t):	/* complete IPv6 address */
4863		sin6 = (sin6_t *)sa;
4864
4865		if (sin6 == NULL || !OK_32PTR((char *)sin6))
4866			return (EINVAL);
4867
4868		if (connp->conn_family != AF_INET6 ||
4869		    sin6->sin6_family != AF_INET6) {
4870			return (EAFNOSUPPORT);
4871		}
4872		v6src = sin6->sin6_addr;
4873		if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
4874			if (connp->conn_ipv6_v6only)
4875				return (EADDRNOTAVAIL);
4876
4877			IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
4878			if (v4src != INADDR_ANY) {
4879				laddr_type = ip_laddr_verify_v4(v4src,
4880				    zoneid, ipst, B_FALSE);
4881			}
4882		} else {
4883			if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4884				if (IN6_IS_ADDR_LINKSCOPE(&v6src))
4885					scopeid = sin6->sin6_scope_id;
4886				laddr_type = ip_laddr_verify_v6(&v6src,
4887				    zoneid, ipst, B_TRUE, scopeid);
4888			}
4889		}
4890		port = ntohs(sin6->sin6_port);
4891		break;
4892
4893	default:		/* Invalid request */
4894		(void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
4895		    "udp_bind: bad ADDR_length length %u", len);
4896		return (-TBADADDR);
4897	}
4898
4899	/* Is the local address a valid unicast, multicast, or broadcast? */
4900	if (laddr_type == IPVL_BAD)
4901		return (EADDRNOTAVAIL);
4902
4903	requested_port = port;
4904
4905	if (requested_port == 0 || !bind_to_req_port_only)
4906		bind_to_req_port_only = B_FALSE;
4907	else		/* T_BIND_REQ and requested_port != 0 */
4908		bind_to_req_port_only = B_TRUE;
4909
4910	if (requested_port == 0) {
4911		/*
4912		 * If the application passed in zero for the port number, it
4913		 * doesn't care which port number we bind to. Get one in the
4914		 * valid range.
4915		 */
4916		if (connp->conn_anon_priv_bind) {
4917			port = udp_get_next_priv_port(udp);
4918		} else {
4919			port = udp_update_next_port(udp,
4920			    us->us_next_port_to_try, B_TRUE);
4921		}
4922	} else {
4923		/*
4924		 * If the port is in the well-known privileged range,
4925		 * make sure the caller was privileged.
4926		 */
4927		int i;
4928		boolean_t priv = B_FALSE;
4929
4930		if (port < us->us_smallest_nonpriv_port) {
4931			priv = B_TRUE;
4932		} else {
4933			for (i = 0; i < us->us_num_epriv_ports; i++) {
4934				if (port == us->us_epriv_ports[i]) {
4935					priv = B_TRUE;
4936					break;
4937				}
4938			}
4939		}
4940
4941		if (priv) {
4942			if (secpolicy_net_privaddr(cr, port, IPPROTO_UDP) != 0)
4943				return (-TACCES);
4944		}
4945	}
4946
4947	if (port == 0)
4948		return (-TNOADDR);
4949
4950	/*
4951	 * The state must be TS_UNBND. TPI mandates that users must send
4952	 * TPI primitives only 1 at a time and wait for the response before
4953	 * sending the next primitive.
4954	 */
4955	mutex_enter(&connp->conn_lock);
4956	if (udp->udp_state != TS_UNBND) {
4957		mutex_exit(&connp->conn_lock);
4958		(void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
4959		    "udp_bind: bad state, %u", udp->udp_state);
4960		return (-TOUTSTATE);
4961	}
4962	/*
4963	 * Copy the source address into our udp structure. This address
4964	 * may still be zero; if so, IP will fill in the correct address
4965	 * each time an outbound packet is passed to it. Since the udp is
4966	 * not yet in the bind hash list, we don't grab the uf_lock to
4967	 * change conn_ipversion
4968	 */
4969	if (connp->conn_family == AF_INET) {
4970		ASSERT(sin != NULL);
4971		ASSERT(connp->conn_ixa->ixa_flags & IXAF_IS_IPV4);
4972	} else {
4973		if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
4974			/*
4975			 * no need to hold the uf_lock to set the conn_ipversion
4976			 * since we are not yet in the fanout list
4977			 */
4978			connp->conn_ipversion = IPV4_VERSION;
4979			connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
4980		} else {
4981			connp->conn_ipversion = IPV6_VERSION;
4982			connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
4983		}
4984	}
4985
4986	/*
4987	 * If conn_reuseaddr is not set, then we have to make sure that
4988	 * the IP address and port number the application requested
4989	 * (or we selected for the application) is not being used by
4990	 * another stream.  If another stream is already using the
4991	 * requested IP address and port, the behavior depends on
4992	 * "bind_to_req_port_only". If set the bind fails; otherwise we
4993	 * search for any unused port to bind to the stream.
4994	 *
4995	 * As per the BSD semantics, as modified by the Deering multicast
4996	 * changes, if conn_reuseaddr is set, then we allow multiple binds
4997	 * to the same port independent of the local IP address.
4998	 *
4999	 * This is slightly different than in SunOS 4.X which did not
5000	 * support IP multicast. Note that the change implemented by the
5001	 * Deering multicast code effects all binds - not only binding
5002	 * to IP multicast addresses.
5003	 *
5004	 * Note that when binding to port zero we ignore SO_REUSEADDR in
5005	 * order to guarantee a unique port.
5006	 */
5007
5008	count = 0;
5009	if (connp->conn_anon_priv_bind) {
5010		/*
5011		 * loopmax = (IPPORT_RESERVED-1) -
5012		 *    us->us_min_anonpriv_port + 1
5013		 */
5014		loopmax = IPPORT_RESERVED - us->us_min_anonpriv_port;
5015	} else {
5016		loopmax = us->us_largest_anon_port -
5017		    us->us_smallest_anon_port + 1;
5018	}
5019
5020	is_inaddr_any = V6_OR_V4_INADDR_ANY(v6src);
5021
5022	for (;;) {
5023		udp_t		*udp1;
5024		boolean_t	found_exclbind = B_FALSE;
5025		conn_t		*connp1;
5026
5027		/*
5028		 * Walk through the list of udp streams bound to
5029		 * requested port with the same IP address.
5030		 */
5031		lport = htons(port);
5032		udpf = &us->us_bind_fanout[UDP_BIND_HASH(lport,
5033		    us->us_bind_fanout_size)];
5034		mutex_enter(&udpf->uf_lock);
5035		for (udp1 = udpf->uf_udp; udp1 != NULL;
5036		    udp1 = udp1->udp_bind_hash) {
5037			connp1 = udp1->udp_connp;
5038
5039			if (lport != connp1->conn_lport)
5040				continue;
5041
5042			/*
5043			 * On a labeled system, we must treat bindings to ports
5044			 * on shared IP addresses by sockets with MAC exemption
5045			 * privilege as being in all zones, as there's
5046			 * otherwise no way to identify the right receiver.
5047			 */
5048			if (!IPCL_BIND_ZONE_MATCH(connp1, connp))
5049				continue;
5050
5051			/*
5052			 * If UDP_EXCLBIND is set for either the bound or
5053			 * binding endpoint, the semantics of bind
5054			 * is changed according to the following chart.
5055			 *
5056			 * spec = specified address (v4 or v6)
5057			 * unspec = unspecified address (v4 or v6)
5058			 * A = specified addresses are different for endpoints
5059			 *
5060			 * bound	bind to		allowed?
5061			 * -------------------------------------
5062			 * unspec	unspec		no
5063			 * unspec	spec		no
5064			 * spec		unspec		no
5065			 * spec		spec		yes if A
5066			 *
5067			 * For labeled systems, SO_MAC_EXEMPT behaves the same
5068			 * as UDP_EXCLBIND, except that zoneid is ignored.
5069			 */
5070			if (connp1->conn_exclbind || connp->conn_exclbind ||
5071			    IPCL_CONNS_MAC(udp1->udp_connp, connp)) {
5072				if (V6_OR_V4_INADDR_ANY(
5073				    connp1->conn_bound_addr_v6) ||
5074				    is_inaddr_any ||
5075				    IN6_ARE_ADDR_EQUAL(
5076				    &connp1->conn_bound_addr_v6,
5077				    &v6src)) {
5078					found_exclbind = B_TRUE;
5079					break;
5080				}
5081				continue;
5082			}
5083
5084			/*
5085			 * Check ipversion to allow IPv4 and IPv6 sockets to
5086			 * have disjoint port number spaces.
5087			 */
5088			if (connp->conn_ipversion != connp1->conn_ipversion) {
5089
5090				/*
5091				 * On the first time through the loop, if the
5092				 * the user intentionally specified a
5093				 * particular port number, then ignore any
5094				 * bindings of the other protocol that may
5095				 * conflict. This allows the user to bind IPv6
5096				 * alone and get both v4 and v6, or bind both
5097				 * both and get each seperately. On subsequent
5098				 * times through the loop, we're checking a
5099				 * port that we chose (not the user) and thus
5100				 * we do not allow casual duplicate bindings.
5101				 */
5102				if (count == 0 && requested_port != 0)
5103					continue;
5104			}
5105
5106			/*
5107			 * No difference depending on SO_REUSEADDR.
5108			 *
5109			 * If existing port is bound to a
5110			 * non-wildcard IP address and
5111			 * the requesting stream is bound to
5112			 * a distinct different IP addresses
5113			 * (non-wildcard, also), keep going.
5114			 */
5115			if (!is_inaddr_any &&
5116			    !V6_OR_V4_INADDR_ANY(connp1->conn_bound_addr_v6) &&
5117			    !IN6_ARE_ADDR_EQUAL(&connp1->conn_laddr_v6,
5118			    &v6src)) {
5119				continue;
5120			}
5121			break;
5122		}
5123
5124		if (!found_exclbind &&
5125		    (connp->conn_reuseaddr && requested_port != 0)) {
5126			break;
5127		}
5128
5129		if (udp1 == NULL) {
5130			/*
5131			 * No other stream has this IP address
5132			 * and port number. We can use it.
5133			 */
5134			break;
5135		}
5136		mutex_exit(&udpf->uf_lock);
5137		if (bind_to_req_port_only) {
5138			/*
5139			 * We get here only when requested port
5140			 * is bound (and only first  of the for()
5141			 * loop iteration).
5142			 *
5143			 * The semantics of this bind request
5144			 * require it to fail so we return from
5145			 * the routine (and exit the loop).
5146			 *
5147			 */
5148			mutex_exit(&connp->conn_lock);
5149			return (-TADDRBUSY);
5150		}
5151
5152		if (connp->conn_anon_priv_bind) {
5153			port = udp_get_next_priv_port(udp);
5154		} else {
5155			if ((count == 0) && (requested_port != 0)) {
5156				/*
5157				 * If the application wants us to find
5158				 * a port, get one to start with. Set
5159				 * requested_port to 0, so that we will
5160				 * update us->us_next_port_to_try below.
5161				 */
5162				port = udp_update_next_port(udp,
5163				    us->us_next_port_to_try, B_TRUE);
5164				requested_port = 0;
5165			} else {
5166				port = udp_update_next_port(udp, port + 1,
5167				    B_FALSE);
5168			}
5169		}
5170
5171		if (port == 0 || ++count >= loopmax) {
5172			/*
5173			 * We've tried every possible port number and
5174			 * there are none available, so send an error
5175			 * to the user.
5176			 */
5177			mutex_exit(&connp->conn_lock);
5178			return (-TNOADDR);
5179		}
5180	}
5181
5182	/*
5183	 * Copy the source address into our udp structure.  This address
5184	 * may still be zero; if so, ip_attr_connect will fill in the correct
5185	 * address when a packet is about to be sent.
5186	 * If we are binding to a broadcast or multicast address then
5187	 * we just set the conn_bound_addr since we don't want to use
5188	 * that as the source address when sending.
5189	 */
5190	connp->conn_bound_addr_v6 = v6src;
5191	connp->conn_laddr_v6 = v6src;
5192	if (scopeid != 0) {
5193		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
5194		connp->conn_ixa->ixa_scopeid = scopeid;
5195		connp->conn_incoming_ifindex = scopeid;
5196	} else {
5197		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
5198		connp->conn_incoming_ifindex = connp->conn_bound_if;
5199	}
5200
5201	switch (laddr_type) {
5202	case IPVL_UNICAST_UP:
5203	case IPVL_UNICAST_DOWN:
5204		connp->conn_saddr_v6 = v6src;
5205		connp->conn_mcbc_bind = B_FALSE;
5206		break;
5207	case IPVL_MCAST:
5208	case IPVL_BCAST:
5209		/* ip_set_destination will pick a source address later */
5210		connp->conn_saddr_v6 = ipv6_all_zeros;
5211		connp->conn_mcbc_bind = B_TRUE;
5212		break;
5213	}
5214
5215	/* Any errors after this point should use late_error */
5216	connp->conn_lport = lport;
5217
5218	/*
5219	 * Now reset the next anonymous port if the application requested
5220	 * an anonymous port, or we handed out the next anonymous port.
5221	 */
5222	if ((requested_port == 0) && (!connp->conn_anon_priv_bind)) {
5223		us->us_next_port_to_try = port + 1;
5224	}
5225
5226	/* Initialize the T_BIND_ACK. */
5227	if (connp->conn_family == AF_INET) {
5228		sin->sin_port = connp->conn_lport;
5229	} else {
5230		sin6->sin6_port = connp->conn_lport;
5231	}
5232	udp->udp_state = TS_IDLE;
5233	udp_bind_hash_insert(udpf, udp);
5234	mutex_exit(&udpf->uf_lock);
5235	mutex_exit(&connp->conn_lock);
5236
5237	if (cl_inet_bind) {
5238		/*
5239		 * Running in cluster mode - register bind information
5240		 */
5241		if (connp->conn_ipversion == IPV4_VERSION) {
5242			(*cl_inet_bind)(connp->conn_netstack->netstack_stackid,
5243			    IPPROTO_UDP, AF_INET, (uint8_t *)&v4src,
5244			    (in_port_t)connp->conn_lport, NULL);
5245		} else {
5246			(*cl_inet_bind)(connp->conn_netstack->netstack_stackid,
5247			    IPPROTO_UDP, AF_INET6, (uint8_t *)&v6src,
5248			    (in_port_t)connp->conn_lport, NULL);
5249		}
5250	}
5251
5252	mutex_enter(&connp->conn_lock);
5253	connp->conn_anon_port = (is_system_labeled() && requested_port == 0);
5254	if (is_system_labeled() && (!connp->conn_anon_port ||
5255	    connp->conn_anon_mlp)) {
5256		uint16_t mlpport;
5257		zone_t *zone;
5258
5259		zone = crgetzone(cr);
5260		connp->conn_mlp_type =
5261		    connp->conn_recv_ancillary.crb_recvucred ? mlptBoth :
5262		    mlptSingle;
5263		addrtype = tsol_mlp_addr_type(
5264		    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
5265		    IPV6_VERSION, &v6src, us->us_netstack->netstack_ip);
5266		if (addrtype == mlptSingle) {
5267			error = -TNOADDR;
5268			mutex_exit(&connp->conn_lock);
5269			goto late_error;
5270		}
5271		mlpport = connp->conn_anon_port ? PMAPPORT : port;
5272		mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport,
5273		    addrtype);
5274
5275		/*
5276		 * It is a coding error to attempt to bind an MLP port
5277		 * without first setting SOL_SOCKET/SCM_UCRED.
5278		 */
5279		if (mlptype != mlptSingle &&
5280		    connp->conn_mlp_type == mlptSingle) {
5281			error = EINVAL;
5282			mutex_exit(&connp->conn_lock);
5283			goto late_error;
5284		}
5285
5286		/*
5287		 * It is an access violation to attempt to bind an MLP port
5288		 * without NET_BINDMLP privilege.
5289		 */
5290		if (mlptype != mlptSingle &&
5291		    secpolicy_net_bindmlp(cr) != 0) {
5292			if (connp->conn_debug) {
5293				(void) strlog(UDP_MOD_ID, 0, 1,
5294				    SL_ERROR|SL_TRACE,
5295				    "udp_bind: no priv for multilevel port %d",
5296				    mlpport);
5297			}
5298			error = -TACCES;
5299			mutex_exit(&connp->conn_lock);
5300			goto late_error;
5301		}
5302
5303		/*
5304		 * If we're specifically binding a shared IP address and the
5305		 * port is MLP on shared addresses, then check to see if this
5306		 * zone actually owns the MLP.  Reject if not.
5307		 */
5308		if (mlptype == mlptShared && addrtype == mlptShared) {
5309			/*
5310			 * No need to handle exclusive-stack zones since
5311			 * ALL_ZONES only applies to the shared stack.
5312			 */
5313			zoneid_t mlpzone;
5314
5315			mlpzone = tsol_mlp_findzone(IPPROTO_UDP,
5316			    htons(mlpport));
5317			if (connp->conn_zoneid != mlpzone) {
5318				if (connp->conn_debug) {
5319					(void) strlog(UDP_MOD_ID, 0, 1,
5320					    SL_ERROR|SL_TRACE,
5321					    "udp_bind: attempt to bind port "
5322					    "%d on shared addr in zone %d "
5323					    "(should be %d)",
5324					    mlpport, connp->conn_zoneid,
5325					    mlpzone);
5326				}
5327				error = -TACCES;
5328				mutex_exit(&connp->conn_lock);
5329				goto late_error;
5330			}
5331		}
5332		if (connp->conn_anon_port) {
5333			error = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
5334			    port, B_TRUE);
5335			if (error != 0) {
5336				if (connp->conn_debug) {
5337					(void) strlog(UDP_MOD_ID, 0, 1,
5338					    SL_ERROR|SL_TRACE,
5339					    "udp_bind: cannot establish anon "
5340					    "MLP for port %d", port);
5341				}
5342				error = -TACCES;
5343				mutex_exit(&connp->conn_lock);
5344				goto late_error;
5345			}
5346		}
5347		connp->conn_mlp_type = mlptype;
5348	}
5349
5350	/*
5351	 * We create an initial header template here to make a subsequent
5352	 * sendto have a starting point. Since conn_last_dst is zero the
5353	 * first sendto will always follow the 'dst changed' code path.
5354	 * Note that we defer massaging options and the related checksum
5355	 * adjustment until we have a destination address.
5356	 */
5357	error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
5358	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
5359	if (error != 0) {
5360		mutex_exit(&connp->conn_lock);
5361		goto late_error;
5362	}
5363	/* Just in case */
5364	connp->conn_faddr_v6 = ipv6_all_zeros;
5365	connp->conn_fport = 0;
5366	connp->conn_v6lastdst = ipv6_all_zeros;
5367	mutex_exit(&connp->conn_lock);
5368
5369	error = ip_laddr_fanout_insert(connp);
5370	if (error != 0)
5371		goto late_error;
5372
5373	/* Bind succeeded */
5374	return (0);
5375
5376late_error:
5377	/* We had already picked the port number, and then the bind failed */
5378	mutex_enter(&connp->conn_lock);
5379	udpf = &us->us_bind_fanout[
5380	    UDP_BIND_HASH(connp->conn_lport,
5381	    us->us_bind_fanout_size)];
5382	mutex_enter(&udpf->uf_lock);
5383	connp->conn_saddr_v6 = ipv6_all_zeros;
5384	connp->conn_bound_addr_v6 = ipv6_all_zeros;
5385	connp->conn_laddr_v6 = ipv6_all_zeros;
5386	if (scopeid != 0) {
5387		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
5388		connp->conn_incoming_ifindex = connp->conn_bound_if;
5389	}
5390	udp->udp_state = TS_UNBND;
5391	udp_bind_hash_remove(udp, B_TRUE);
5392	connp->conn_lport = 0;
5393	mutex_exit(&udpf->uf_lock);
5394	connp->conn_anon_port = B_FALSE;
5395	connp->conn_mlp_type = mlptSingle;
5396
5397	connp->conn_v6lastdst = ipv6_all_zeros;
5398
5399	/* Restore the header that was built above - different source address */
5400	(void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
5401	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
5402	mutex_exit(&connp->conn_lock);
5403	return (error);
5404}
5405
5406int
5407udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5408    socklen_t len, cred_t *cr)
5409{
5410	int		error;
5411	conn_t		*connp;
5412
5413	/* All Solaris components should pass a cred for this operation. */
5414	ASSERT(cr != NULL);
5415
5416	connp = (conn_t *)proto_handle;
5417
5418	if (sa == NULL)
5419		error = udp_do_unbind(connp);
5420	else
5421		error = udp_do_bind(connp, sa, len, cr, B_TRUE);
5422
5423	if (error < 0) {
5424		if (error == -TOUTSTATE)
5425			error = EINVAL;
5426		else
5427			error = proto_tlitosyserr(-error);
5428	}
5429
5430	return (error);
5431}
5432
5433static int
5434udp_implicit_bind(conn_t *connp, cred_t *cr)
5435{
5436	sin6_t sin6addr;
5437	sin_t *sin;
5438	sin6_t *sin6;
5439	socklen_t len;
5440	int error;
5441
5442	/* All Solaris components should pass a cred for this operation. */
5443	ASSERT(cr != NULL);
5444
5445	if (connp->conn_family == AF_INET) {
5446		len = sizeof (struct sockaddr_in);
5447		sin = (sin_t *)&sin6addr;
5448		*sin = sin_null;
5449		sin->sin_family = AF_INET;
5450		sin->sin_addr.s_addr = INADDR_ANY;
5451	} else {
5452		ASSERT(connp->conn_family == AF_INET6);
5453		len = sizeof (sin6_t);
5454		sin6 = (sin6_t *)&sin6addr;
5455		*sin6 = sin6_null;
5456		sin6->sin6_family = AF_INET6;
5457		V6_SET_ZERO(sin6->sin6_addr);
5458	}
5459
5460	error = udp_do_bind(connp, (struct sockaddr *)&sin6addr, len,
5461	    cr, B_FALSE);
5462	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5463}
5464
5465/*
5466 * This routine removes a port number association from a stream. It
5467 * is called by udp_unbind and udp_tpi_unbind.
5468 */
5469static int
5470udp_do_unbind(conn_t *connp)
5471{
5472	udp_t		*udp = connp->conn_udp;
5473	udp_fanout_t	*udpf;
5474	udp_stack_t	*us = udp->udp_us;
5475
5476	if (cl_inet_unbind != NULL) {
5477		/*
5478		 * Running in cluster mode - register unbind information
5479		 */
5480		if (connp->conn_ipversion == IPV4_VERSION) {
5481			(*cl_inet_unbind)(
5482			    connp->conn_netstack->netstack_stackid,
5483			    IPPROTO_UDP, AF_INET,
5484			    (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)),
5485			    (in_port_t)connp->conn_lport, NULL);
5486		} else {
5487			(*cl_inet_unbind)(
5488			    connp->conn_netstack->netstack_stackid,
5489			    IPPROTO_UDP, AF_INET6,
5490			    (uint8_t *)&(connp->conn_laddr_v6),
5491			    (in_port_t)connp->conn_lport, NULL);
5492		}
5493	}
5494
5495	mutex_enter(&connp->conn_lock);
5496	/* If a bind has not been done, we can't unbind. */
5497	if (udp->udp_state == TS_UNBND) {
5498		mutex_exit(&connp->conn_lock);
5499		return (-TOUTSTATE);
5500	}
5501	udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
5502	    us->us_bind_fanout_size)];
5503	mutex_enter(&udpf->uf_lock);
5504	udp_bind_hash_remove(udp, B_TRUE);
5505	connp->conn_saddr_v6 = ipv6_all_zeros;
5506	connp->conn_bound_addr_v6 = ipv6_all_zeros;
5507	connp->conn_laddr_v6 = ipv6_all_zeros;
5508	connp->conn_mcbc_bind = B_FALSE;
5509	connp->conn_lport = 0;
5510	/* In case we were also connected */
5511	connp->conn_faddr_v6 = ipv6_all_zeros;
5512	connp->conn_fport = 0;
5513	mutex_exit(&udpf->uf_lock);
5514
5515	connp->conn_v6lastdst = ipv6_all_zeros;
5516	udp->udp_state = TS_UNBND;
5517
5518	(void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
5519	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
5520	mutex_exit(&connp->conn_lock);
5521
5522	ip_unbind(connp);
5523
5524	return (0);
5525}
5526
5527/*
5528 * It associates a default destination address with the stream.
5529 */
5530static int
5531udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
5532    cred_t *cr, pid_t pid)
5533{
5534	sin6_t		*sin6;
5535	sin_t		*sin;
5536	in6_addr_t	v6dst;
5537	ipaddr_t	v4dst;
5538	uint16_t	dstport;
5539	uint32_t	flowinfo;
5540	udp_fanout_t	*udpf;
5541	udp_t		*udp, *udp1;
5542	ushort_t	ipversion;
5543	udp_stack_t	*us;
5544	int		error;
5545	conn_t		*connp1;
5546	ip_xmit_attr_t	*ixa;
5547	ip_xmit_attr_t	*oldixa;
5548	uint_t		scopeid = 0;
5549	uint_t		srcid = 0;
5550	in6_addr_t	v6src = connp->conn_saddr_v6;
5551	boolean_t	v4mapped;
5552
5553	udp = connp->conn_udp;
5554	us = udp->udp_us;
5555	sin = NULL;
5556	sin6 = NULL;
5557	v4dst = INADDR_ANY;
5558	flowinfo = 0;
5559
5560	/*
5561	 * Address has been verified by the caller
5562	 */
5563	switch (len) {
5564	default:
5565		/*
5566		 * Should never happen
5567		 */
5568		return (EINVAL);
5569
5570	case sizeof (sin_t):
5571		sin = (sin_t *)sa;
5572		v4dst = sin->sin_addr.s_addr;
5573		dstport = sin->sin_port;
5574		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
5575		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5576		ipversion = IPV4_VERSION;
5577		break;
5578
5579	case sizeof (sin6_t):
5580		sin6 = (sin6_t *)sa;
5581		v6dst = sin6->sin6_addr;
5582		dstport = sin6->sin6_port;
5583		srcid = sin6->__sin6_src_id;
5584		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
5585		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
5586			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
5587			    v4mapped, connp->conn_netstack)) {
5588				/* Mismatch v4mapped/v6 specified by srcid. */
5589				return (EADDRNOTAVAIL);
5590			}
5591		}
5592		if (v4mapped) {
5593			if (connp->conn_ipv6_v6only)
5594				return (EADDRNOTAVAIL);
5595
5596			/*
5597			 * Destination adress is mapped IPv6 address.
5598			 * Source bound address should be unspecified or
5599			 * IPv6 mapped address as well.
5600			 */
5601			if (!IN6_IS_ADDR_UNSPECIFIED(
5602			    &connp->conn_bound_addr_v6) &&
5603			    !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) {
5604				return (EADDRNOTAVAIL);
5605			}
5606			IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst);
5607			ipversion = IPV4_VERSION;
5608			flowinfo = 0;
5609		} else {
5610			ipversion = IPV6_VERSION;
5611			flowinfo = sin6->sin6_flowinfo;
5612			if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
5613				scopeid = sin6->sin6_scope_id;
5614		}
5615		break;
5616	}
5617
5618	if (dstport == 0)
5619		return (-TBADADDR);
5620
5621	/*
5622	 * If there is a different thread using conn_ixa then we get a new
5623	 * copy and cut the old one loose from conn_ixa. Otherwise we use
5624	 * conn_ixa and prevent any other thread from using/changing it.
5625	 * Once connect() is done other threads can use conn_ixa since the
5626	 * refcnt will be back at one.
5627	 * We defer updating conn_ixa until later to handle any concurrent
5628	 * conn_ixa_cleanup thread.
5629	 */
5630	ixa = conn_get_ixa(connp, B_FALSE);
5631	if (ixa == NULL)
5632		return (ENOMEM);
5633
5634	mutex_enter(&connp->conn_lock);
5635	/*
5636	 * This udp_t must have bound to a port already before doing a connect.
5637	 * Reject if a connect is in progress (we drop conn_lock during
5638	 * udp_do_connect).
5639	 */
5640	if (udp->udp_state == TS_UNBND || udp->udp_state == TS_WCON_CREQ) {
5641		mutex_exit(&connp->conn_lock);
5642		(void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
5643		    "udp_connect: bad state, %u", udp->udp_state);
5644		ixa_refrele(ixa);
5645		return (-TOUTSTATE);
5646	}
5647	ASSERT(connp->conn_lport != 0 && udp->udp_ptpbhn != NULL);
5648
5649	udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
5650	    us->us_bind_fanout_size)];
5651
5652	mutex_enter(&udpf->uf_lock);
5653	if (udp->udp_state == TS_DATA_XFER) {
5654		/* Already connected - clear out state */
5655		if (connp->conn_mcbc_bind)
5656			connp->conn_saddr_v6 = ipv6_all_zeros;
5657		else
5658			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
5659		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
5660		connp->conn_faddr_v6 = ipv6_all_zeros;
5661		connp->conn_fport = 0;
5662		udp->udp_state = TS_IDLE;
5663	}
5664
5665	connp->conn_fport = dstport;
5666	connp->conn_ipversion = ipversion;
5667	if (ipversion == IPV4_VERSION) {
5668		/*
5669		 * Interpret a zero destination to mean loopback.
5670		 * Update the T_CONN_REQ (sin/sin6) since it is used to
5671		 * generate the T_CONN_CON.
5672		 */
5673		if (v4dst == INADDR_ANY) {
5674			v4dst = htonl(INADDR_LOOPBACK);
5675			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
5676			if (connp->conn_family == AF_INET) {
5677				sin->sin_addr.s_addr = v4dst;
5678			} else {
5679				sin6->sin6_addr = v6dst;
5680			}
5681		}
5682		connp->conn_faddr_v6 = v6dst;
5683		connp->conn_flowinfo = 0;
5684	} else {
5685		ASSERT(connp->conn_ipversion == IPV6_VERSION);
5686		/*
5687		 * Interpret a zero destination to mean loopback.
5688		 * Update the T_CONN_REQ (sin/sin6) since it is used to
5689		 * generate the T_CONN_CON.
5690		 */
5691		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
5692			v6dst = ipv6_loopback;
5693			sin6->sin6_addr = v6dst;
5694		}
5695		connp->conn_faddr_v6 = v6dst;
5696		connp->conn_flowinfo = flowinfo;
5697	}
5698	mutex_exit(&udpf->uf_lock);
5699
5700	/*
5701	 * We update our cred/cpid based on the caller of connect
5702	 */
5703	if (connp->conn_cred != cr) {
5704		crhold(cr);
5705		crfree(connp->conn_cred);
5706		connp->conn_cred = cr;
5707	}
5708	connp->conn_cpid = pid;
5709	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
5710	ixa->ixa_cred = cr;
5711	ixa->ixa_cpid = pid;
5712	if (is_system_labeled()) {
5713		/* We need to restart with a label based on the cred */
5714		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
5715	}
5716
5717	if (scopeid != 0) {
5718		ixa->ixa_flags |= IXAF_SCOPEID_SET;
5719		ixa->ixa_scopeid = scopeid;
5720		connp->conn_incoming_ifindex = scopeid;
5721	} else {
5722		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
5723		connp->conn_incoming_ifindex = connp->conn_bound_if;
5724	}
5725	/*
5726	 * conn_connect will drop conn_lock and reacquire it.
5727	 * To prevent a send* from messing with this udp_t while the lock
5728	 * is dropped we set udp_state and clear conn_v6lastdst.
5729	 * That will make all send* fail with EISCONN.
5730	 */
5731	connp->conn_v6lastdst = ipv6_all_zeros;
5732	udp->udp_state = TS_WCON_CREQ;
5733
5734	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
5735	mutex_exit(&connp->conn_lock);
5736	if (error != 0)
5737		goto connect_failed;
5738
5739	/*
5740	 * The addresses have been verified. Time to insert in
5741	 * the correct fanout list.
5742	 */
5743	error = ipcl_conn_insert(connp);
5744	if (error != 0)
5745		goto connect_failed;
5746
5747	mutex_enter(&connp->conn_lock);
5748	error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
5749	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
5750	if (error != 0) {
5751		mutex_exit(&connp->conn_lock);
5752		goto connect_failed;
5753	}
5754
5755	udp->udp_state = TS_DATA_XFER;
5756	/* Record this as the "last" send even though we haven't sent any */
5757	connp->conn_v6lastdst = connp->conn_faddr_v6;
5758	connp->conn_lastipversion = connp->conn_ipversion;
5759	connp->conn_lastdstport = connp->conn_fport;
5760	connp->conn_lastflowinfo = connp->conn_flowinfo;
5761	connp->conn_lastscopeid = scopeid;
5762	connp->conn_lastsrcid = srcid;
5763	/* Also remember a source to use together with lastdst */
5764	connp->conn_v6lastsrc = v6src;
5765
5766	oldixa = conn_replace_ixa(connp, ixa);
5767	mutex_exit(&connp->conn_lock);
5768	ixa_refrele(oldixa);
5769
5770	/*
5771	 * We've picked a source address above. Now we can
5772	 * verify that the src/port/dst/port is unique for all
5773	 * connections in TS_DATA_XFER, skipping ourselves.
5774	 */
5775	mutex_enter(&udpf->uf_lock);
5776	for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) {
5777		if (udp1->udp_state != TS_DATA_XFER)
5778			continue;
5779
5780		if (udp1 == udp)
5781			continue;
5782
5783		connp1 = udp1->udp_connp;
5784		if (connp->conn_lport != connp1->conn_lport ||
5785		    connp->conn_ipversion != connp1->conn_ipversion ||
5786		    dstport != connp1->conn_fport ||
5787		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
5788		    &connp1->conn_laddr_v6) ||
5789		    !IN6_ARE_ADDR_EQUAL(&v6dst, &connp1->conn_faddr_v6) ||
5790		    !(IPCL_ZONE_MATCH(connp, connp1->conn_zoneid) ||
5791		    IPCL_ZONE_MATCH(connp1, connp->conn_zoneid)))
5792			continue;
5793		mutex_exit(&udpf->uf_lock);
5794		error = -TBADADDR;
5795		goto connect_failed;
5796	}
5797	if (cl_inet_connect2 != NULL) {
5798		CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error);
5799		if (error != 0) {
5800			mutex_exit(&udpf->uf_lock);
5801			error = -TBADADDR;
5802			goto connect_failed;
5803		}
5804	}
5805	mutex_exit(&udpf->uf_lock);
5806
5807	ixa_refrele(ixa);
5808	return (0);
5809
5810connect_failed:
5811	if (ixa != NULL)
5812		ixa_refrele(ixa);
5813	mutex_enter(&connp->conn_lock);
5814	mutex_enter(&udpf->uf_lock);
5815	udp->udp_state = TS_IDLE;
5816	connp->conn_faddr_v6 = ipv6_all_zeros;
5817	connp->conn_fport = 0;
5818	/* In case the source address was set above */
5819	if (connp->conn_mcbc_bind)
5820		connp->conn_saddr_v6 = ipv6_all_zeros;
5821	else
5822		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
5823	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
5824	mutex_exit(&udpf->uf_lock);
5825
5826	connp->conn_v6lastdst = ipv6_all_zeros;
5827	connp->conn_flowinfo = 0;
5828
5829	(void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
5830	    &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
5831	mutex_exit(&connp->conn_lock);
5832	return (error);
5833}
5834
5835static int
5836udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5837    socklen_t len, sock_connid_t *id, cred_t *cr)
5838{
5839	conn_t	*connp = (conn_t *)proto_handle;
5840	udp_t	*udp = connp->conn_udp;
5841	int	error;
5842	boolean_t did_bind = B_FALSE;
5843	pid_t	pid = curproc->p_pid;
5844
5845	/* All Solaris components should pass a cred for this operation. */
5846	ASSERT(cr != NULL);
5847
5848	if (sa == NULL) {
5849		/*
5850		 * Disconnect
5851		 * Make sure we are connected
5852		 */
5853		if (udp->udp_state != TS_DATA_XFER)
5854			return (EINVAL);
5855
5856		error = udp_disconnect(connp);
5857		return (error);
5858	}
5859
5860	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5861	if (error != 0)
5862		goto done;
5863
5864	/* do an implicit bind if necessary */
5865	if (udp->udp_state == TS_UNBND) {
5866		error = udp_implicit_bind(connp, cr);
5867		/*
5868		 * We could be racing with an actual bind, in which case
5869		 * we would see EPROTO. We cross our fingers and try
5870		 * to connect.
5871		 */
5872		if (!(error == 0 || error == EPROTO))
5873			goto done;
5874		did_bind = B_TRUE;
5875	}
5876	/*
5877	 * set SO_DGRAM_ERRIND
5878	 */
5879	connp->conn_dgram_errind = B_TRUE;
5880
5881	error = udp_do_connect(connp, sa, len, cr, pid);
5882
5883	if (error != 0 && did_bind) {
5884		int unbind_err;
5885
5886		unbind_err = udp_do_unbind(connp);
5887		ASSERT(unbind_err == 0);
5888	}
5889
5890	if (error == 0) {
5891		*id = 0;
5892		(*connp->conn_upcalls->su_connected)
5893		    (connp->conn_upper_handle, 0, NULL, -1);
5894	} else if (error < 0) {
5895		error = proto_tlitosyserr(-error);
5896	}
5897
5898done:
5899	if (error != 0 && udp->udp_state == TS_DATA_XFER) {
5900		/*
5901		 * No need to hold locks to set state
5902		 * after connect failure socket state is undefined
5903		 * We set the state only to imitate old sockfs behavior
5904		 */
5905		udp->udp_state = TS_IDLE;
5906	}
5907	return (error);
5908}
5909
5910int
5911udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5912    cred_t *cr)
5913{
5914	sin6_t		*sin6;
5915	sin_t		*sin = NULL;
5916	uint_t		srcid;
5917	conn_t		*connp = (conn_t *)proto_handle;
5918	udp_t		*udp = connp->conn_udp;
5919	int		error = 0;
5920	udp_stack_t	*us = udp->udp_us;
5921	ushort_t	ipversion;
5922	pid_t		pid = curproc->p_pid;
5923	ip_xmit_attr_t	*ixa;
5924
5925	ASSERT(DB_TYPE(mp) == M_DATA);
5926
5927	/* All Solaris components should pass a cred for this operation. */
5928	ASSERT(cr != NULL);
5929
5930	/* do an implicit bind if necessary */
5931	if (udp->udp_state == TS_UNBND) {
5932		error = udp_implicit_bind(connp, cr);
5933		/*
5934		 * We could be racing with an actual bind, in which case
5935		 * we would see EPROTO. We cross our fingers and try
5936		 * to connect.
5937		 */
5938		if (!(error == 0 || error == EPROTO)) {
5939			freemsg(mp);
5940			return (error);
5941		}
5942	}
5943
5944	/* Connected? */
5945	if (msg->msg_name == NULL) {
5946		if (udp->udp_state != TS_DATA_XFER) {
5947			UDPS_BUMP_MIB(us, udpOutErrors);
5948			return (EDESTADDRREQ);
5949		}
5950		if (msg->msg_controllen != 0) {
5951			error = udp_output_ancillary(connp, NULL, NULL, mp,
5952			    NULL, msg, cr, pid);
5953		} else {
5954			error = udp_output_connected(connp, mp, cr, pid);
5955		}
5956		if (us->us_sendto_ignerr)
5957			return (0);
5958		else
5959			return (error);
5960	}
5961	if (udp->udp_state == TS_DATA_XFER) {
5962		UDPS_BUMP_MIB(us, udpOutErrors);
5963		return (EISCONN);
5964	}
5965	error = proto_verify_ip_addr(connp->conn_family,
5966	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5967	if (error != 0) {
5968		UDPS_BUMP_MIB(us, udpOutErrors);
5969		return (error);
5970	}
5971	switch (connp->conn_family) {
5972	case AF_INET6:
5973		sin6 = (sin6_t *)msg->msg_name;
5974
5975		srcid = sin6->__sin6_src_id;
5976
5977		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5978			/*
5979			 * Destination is a non-IPv4-compatible IPv6 address.
5980			 * Send out an IPv6 format packet.
5981			 */
5982
5983			/*
5984			 * If the local address is a mapped address return
5985			 * an error.
5986			 * It would be possible to send an IPv6 packet but the
5987			 * response would never make it back to the application
5988			 * since it is bound to a mapped address.
5989			 */
5990			if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5991				UDPS_BUMP_MIB(us, udpOutErrors);
5992				return (EADDRNOTAVAIL);
5993			}
5994			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5995				sin6->sin6_addr = ipv6_loopback;
5996			ipversion = IPV6_VERSION;
5997		} else {
5998			if (connp->conn_ipv6_v6only) {
5999				UDPS_BUMP_MIB(us, udpOutErrors);
6000				return (EADDRNOTAVAIL);
6001			}
6002
6003			/*
6004			 * If the local address is not zero or a mapped address
6005			 * return an error.  It would be possible to send an
6006			 * IPv4 packet but the response would never make it
6007			 * back to the application since it is bound to a
6008			 * non-mapped address.
6009			 */
6010			if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) &&
6011			    !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
6012				UDPS_BUMP_MIB(us, udpOutErrors);
6013				return (EADDRNOTAVAIL);
6014			}
6015
6016			if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) {
6017				V4_PART_OF_V6(sin6->sin6_addr) =
6018				    htonl(INADDR_LOOPBACK);
6019			}
6020			ipversion = IPV4_VERSION;
6021		}
6022
6023		/*
6024		 * We have to allocate an ip_xmit_attr_t before we grab
6025		 * conn_lock and we need to hold conn_lock once we've check
6026		 * conn_same_as_last_v6 to handle concurrent send* calls on a
6027		 * socket.
6028		 */
6029		if (msg->msg_controllen == 0) {
6030			ixa = conn_get_ixa(connp, B_FALSE);
6031			if (ixa == NULL) {
6032				UDPS_BUMP_MIB(us, udpOutErrors);
6033				return (ENOMEM);
6034			}
6035		} else {
6036			ixa = NULL;
6037		}
6038