1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2022 Garrett D'Amore
27 */
28
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/debug.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/file.h>
45 #include <sys/open.h>
46 #include <sys/user.h>
47 #include <sys/termios.h>
48 #include <sys/stream.h>
49 #include <sys/strsubr.h>
50 #include <sys/strsun.h>
51 #include <sys/suntpi.h>
52 #include <sys/ddi.h>
53 #include <sys/esunddi.h>
54 #include <sys/flock.h>
55 #include <sys/modctl.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathname.h>
59
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/strsun.h>
66
67 #include <sys/tiuser.h>
68 #define _SUN_TPI_VERSION 2
69 #include <sys/tihdr.h>
70 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
71
72 #include <c2/audit.h>
73
74 #include <inet/common.h>
75 #include <inet/ip.h>
76 #include <inet/ip6.h>
77 #include <inet/tcp.h>
78 #include <inet/udp_impl.h>
79
80 #include <sys/zone.h>
81
82 #include <fs/sockfs/sockcommon.h>
83 #include <fs/sockfs/socktpi.h>
84 #include <fs/sockfs/socktpi_impl.h>
85
/*
 * Possible failures when memory can't be allocated. The documented behavior:
 *
 *              5.5:                    4.X:            XNET:
 * accept:      ENOMEM/ENOSR/EINTR      - (EINTR)       ENOMEM/ENOBUFS/ENOSR/
 *                                                      EINTR
 *      (4.X does not document EINTR but returns it)
 * bind:        ENOSR                   -               ENOBUFS/ENOSR
 * connect:     EINTR                   EINTR           ENOBUFS/ENOSR/EINTR
 * getpeername: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
 * getsockname: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
 *      (4.X getpeername and getsockname do not fail in practice)
 * getsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 * listen:      -                       -               ENOBUFS
 * recv:        ENOMEM/ENOSR/EINTR      EINTR           ENOBUFS/ENOMEM/ENOSR/
 *                                                      EINTR
 * send:        ENOMEM/ENOSR/EINTR      ENOBUFS/EINTR   ENOBUFS/ENOMEM/ENOSR/
 *                                                      EINTR
 * setsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 * shutdown:    ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 * socket:      ENOMEM/ENOSR            ENOBUFS         ENOBUFS/ENOMEM/ENOSR
 * socketpair:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 *
 * Resolution. When allocation fails:
 *      recv: return EINTR
 *      send: return EINTR
 *      connect, accept: EINTR
 *      bind, listen, shutdown (unbind, unix_close, disconnect): sleep
 *      socket, socketpair: ENOBUFS
 *      getpeername, getsockname: sleep
 *      getsockopt, setsockopt: sleep
 */
118
#ifdef SOCK_TEST
/*
 * Test-only knobs that make sockfs do something other than the standard
 * TPI for the AF_INET transports. They are real variables only when
 * SOCK_TEST is defined; otherwise they are compile-time constants (below).
 *
 * solisten_tpi_tcp:
 *	TCP can handle an O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by an
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected-to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing an O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional test hooks defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else	/* SOCK_TEST */

/* Production builds: the knobs collapse to their default values. */
#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#endif	/* SOCK_TEST */
166
167 extern uint32_t ucredsize;
168
169 /*
170 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
171 * applications working. Turn on this flag to disable these checks.
172 */
173 int xnet_skip_checks = 0;
174 int xnet_check_print = 0;
175 int xnet_truncate_print = 0;
176
177 static void sotpi_destroy(struct sonode *);
178 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
179 int, int *, cred_t *cr);
180
181 static boolean_t sotpi_info_create(struct sonode *, int);
182 static void sotpi_info_init(struct sonode *);
183 static void sotpi_info_fini(struct sonode *);
184 static void sotpi_info_destroy(struct sonode *);
185
186 /*
187 * Do direct function call to the transport layer below; this would
188 * also allow the transport to utilize read-side synchronous stream
189 * interface if necessary. This is a /etc/system tunable that must
190 * not be modified on a running system. By default this is enabled
191 * for performance reasons and may be disabled for debugging purposes.
192 */
193 boolean_t socktpi_direct = B_TRUE;
194
195 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
196
197 extern void sigintr(k_sigset_t *, int);
198 extern void sigunintr(k_sigset_t *);
199
200 static int sotpi_unbind(struct sonode *, int);
201
202 /* TPI sockfs sonode operations */
203 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
204 int);
205 static int sotpi_accept(struct sonode *, int, struct cred *,
206 struct sonode **);
207 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
208 int, struct cred *);
209 static int sotpi_listen(struct sonode *, int, struct cred *);
210 static int sotpi_connect(struct sonode *, struct sockaddr *,
211 socklen_t, int, int, struct cred *);
212 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
213 struct uio *, struct cred *);
214 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
215 struct uio *, struct cred *);
216 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
217 struct cred *, mblk_t **);
218 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
219 struct uio *, void *, t_uscalar_t, int);
220 static int sodgram_direct(struct sonode *, struct sockaddr *,
221 socklen_t, struct uio *, int);
222 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
223 socklen_t *, boolean_t, struct cred *);
224 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
225 socklen_t *, struct cred *);
226 static int sotpi_shutdown(struct sonode *, int, struct cred *);
227 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
228 socklen_t *, int, struct cred *);
229 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
230 socklen_t, struct cred *);
231 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
232 int32_t *);
233 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
234 struct cred *, int32_t *);
235 static int sotpi_poll(struct sonode *, short, int, short *,
236 struct pollhead **);
237 static int sotpi_close(struct sonode *, int, struct cred *);
238
239 static int i_sotpi_info_constructor(sotpi_info_t *);
240 static void i_sotpi_info_destructor(sotpi_info_t *);
241
242 sonodeops_t sotpi_sonodeops = {
243 sotpi_init, /* sop_init */
244 sotpi_accept, /* sop_accept */
245 sotpi_bind, /* sop_bind */
246 sotpi_listen, /* sop_listen */
247 sotpi_connect, /* sop_connect */
248 sotpi_recvmsg, /* sop_recvmsg */
249 sotpi_sendmsg, /* sop_sendmsg */
250 sotpi_sendmblk, /* sop_sendmblk */
251 sotpi_getpeername, /* sop_getpeername */
252 sotpi_getsockname, /* sop_getsockname */
253 sotpi_shutdown, /* sop_shutdown */
254 sotpi_getsockopt, /* sop_getsockopt */
255 sotpi_setsockopt, /* sop_setsockopt */
256 sotpi_ioctl, /* sop_ioctl */
257 sotpi_poll, /* sop_poll */
258 sotpi_close, /* sop_close */
259 };
260
/*
 * Return a TPI socket vnode.
 *
 * Note that sockets assume that the driver will clone (either itself
 * or by using the clone driver), i.e. a socket() call will always
 * result in a new vnode being created.
 */

/*
 * Common create code for socket and accept. If tso is set, the values
 * from that node are used instead of issuing a T_INFO_REQ.
 */
273
274 /* ARGSUSED */
275 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)276 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
277 int version, int sflags, int *errorp, cred_t *cr)
278 {
279 struct sonode *so;
280 kmem_cache_t *cp;
281
282 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
283
284 if (family == AF_NCA) {
285 /*
286 * The request is for an NCA socket so for NL7C use the
287 * INET domain instead and mark NL7C_AF_NCA below.
288 */
289 family = AF_INET;
290 /*
291 * NL7C is not supported in the non-global zone,
292 * we enforce this restriction here.
293 */
294 if (getzoneid() != GLOBAL_ZONEID) {
295 *errorp = ENOTSUP;
296 return (NULL);
297 }
298 }
299
300 /*
301 * to be compatible with old tpi socket implementation ignore
302 * sleep flag (sflags) passed in
303 */
304 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
305 so = kmem_cache_alloc(cp, KM_SLEEP);
306 if (so == NULL) {
307 *errorp = ENOMEM;
308 return (NULL);
309 }
310
311 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
312 sotpi_info_init(so);
313
314 if (version == SOV_DEFAULT)
315 version = so_default_version;
316
317 so->so_version = (short)version;
318 *errorp = 0;
319
320 return (so);
321 }
322
323 static void
sotpi_destroy(struct sonode * so)324 sotpi_destroy(struct sonode *so)
325 {
326 kmem_cache_t *cp;
327 struct sockparams *origsp;
328
329 /*
330 * If there is a new dealloc function (ie. smod_destroy_func),
331 * then it should check the correctness of the ops.
332 */
333
334 ASSERT(so->so_ops == &sotpi_sonodeops);
335
336 origsp = SOTOTPI(so)->sti_orig_sp;
337
338 sotpi_info_fini(so);
339
340 if (so->so_state & SS_FALLBACK_COMP) {
341 /*
342 * A fallback happend, which means that a sotpi_info_t struct
343 * was allocated (as opposed to being allocated from the TPI
344 * sonode cache. Therefore we explicitly free the struct
345 * here.
346 */
347 sotpi_info_destroy(so);
348 ASSERT(origsp != NULL);
349
350 origsp->sp_smod_info->smod_sock_destroy_func(so);
351 SOCKPARAMS_DEC_REF(origsp);
352 } else {
353 sonode_fini(so);
354 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
355 socktpi_cache;
356 kmem_cache_free(cp, so);
357 }
358 }
359
360 /* ARGSUSED1 */
361 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)362 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
363 {
364 major_t maj;
365 dev_t newdev;
366 struct vnode *vp;
367 int error = 0;
368 struct stdata *stp;
369
370 sotpi_info_t *sti = SOTOTPI(so);
371
372 dprint(1, ("sotpi_init()\n"));
373
374 /*
375 * over write the sleep flag passed in but that is ok
376 * as tpi socket does not honor sleep flag.
377 */
378 flags |= FREAD|FWRITE;
379
380 /*
381 * Record in so_flag that it is a clone.
382 */
383 if (getmajor(sti->sti_dev) == clone_major)
384 so->so_flag |= SOCLONE;
385
386 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
387 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
388 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
389 so->so_protocol == IPPROTO_IP)) {
390 /* Tell tcp or udp that it's talking to sockets */
391 flags |= SO_SOCKSTR;
392
393 /*
394 * Here we indicate to socktpi_open() our attempt to
395 * make direct calls between sockfs and transport.
396 * The final decision is left to socktpi_open().
397 */
398 sti->sti_direct = 1;
399
400 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
401 if (so->so_type == SOCK_STREAM && tso != NULL) {
402 if (SOTOTPI(tso)->sti_direct) {
403 /*
404 * Inherit sti_direct from listener and pass
405 * SO_ACCEPTOR open flag to tcp, indicating
406 * that this is an accept fast-path instance.
407 */
408 flags |= SO_ACCEPTOR;
409 } else {
410 /*
411 * sti_direct is not set on listener, meaning
412 * that the listener has been converted from
413 * a socket to a stream. Ensure that the
414 * acceptor inherits these settings.
415 */
416 sti->sti_direct = 0;
417 flags &= ~SO_SOCKSTR;
418 }
419 }
420 }
421
422 /*
423 * Tell local transport that it is talking to sockets.
424 */
425 if (so->so_family == AF_UNIX) {
426 flags |= SO_SOCKSTR;
427 }
428
429 vp = SOTOV(so);
430 newdev = vp->v_rdev;
431 maj = getmajor(newdev);
432 ASSERT(STREAMSTAB(maj));
433
434 error = stropen(vp, &newdev, flags, cr);
435
436 stp = vp->v_stream;
437 if (error == 0) {
438 if (so->so_flag & SOCLONE)
439 ASSERT(newdev != vp->v_rdev);
440 mutex_enter(&so->so_lock);
441 sti->sti_dev = newdev;
442 vp->v_rdev = newdev;
443 mutex_exit(&so->so_lock);
444
445 if (stp->sd_flag & STRISTTY) {
446 /*
447 * this is a post SVR4 tty driver - a socket can not
448 * be a controlling terminal. Fail the open.
449 */
450 (void) sotpi_close(so, flags, cr);
451 return (ENOTTY); /* XXX */
452 }
453
454 ASSERT(stp->sd_wrq != NULL);
455 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
456
457 /*
458 * If caller is interested in doing direct function call
459 * interface to/from transport module, probe the module
460 * directly beneath the streamhead to see if it qualifies.
461 *
462 * We turn off the direct interface when qualifications fail.
463 * In the acceptor case, we simply turn off the sti_direct
464 * flag on the socket. We do the fallback after the accept
465 * has completed, before the new socket is returned to the
466 * application.
467 */
468 if (sti->sti_direct) {
469 queue_t *tq = stp->sd_wrq->q_next;
470
471 /*
472 * sti_direct is currently supported and tested
473 * only for tcp/udp; this is the main reason to
474 * have the following assertions.
475 */
476 ASSERT(so->so_family == AF_INET ||
477 so->so_family == AF_INET6);
478 ASSERT(so->so_protocol == IPPROTO_UDP ||
479 so->so_protocol == IPPROTO_TCP ||
480 so->so_protocol == IPPROTO_IP);
481 ASSERT(so->so_type == SOCK_DGRAM ||
482 so->so_type == SOCK_STREAM);
483
484 /*
485 * Abort direct call interface if the module directly
486 * underneath the stream head is not defined with the
487 * _D_DIRECT flag. This could happen in the tcp or
488 * udp case, when some other module is autopushed
489 * above it, or for some reasons the expected module
490 * isn't purely D_MP (which is the main requirement).
491 */
492 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
493 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
494 int rval;
495
496 /* Continue on without direct calls */
497 sti->sti_direct = 0;
498
499 /*
500 * Cannot issue ioctl on fallback socket since
501 * there is no conn associated with the queue.
502 * The fallback downcall will notify the proto
503 * of the change.
504 */
505 if (!(flags & SO_ACCEPTOR) &&
506 !(flags & SO_FALLBACK)) {
507 if ((error = strioctl(vp,
508 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
509 cr, &rval)) != 0) {
510 (void) sotpi_close(so, flags,
511 cr);
512 return (error);
513 }
514 }
515 }
516 }
517
518 if (flags & SO_FALLBACK) {
519 /*
520 * The stream created does not have a conn.
521 * do stream set up after conn has been assigned
522 */
523 return (error);
524 }
525 error = so_strinit(so, tso);
526 if (error != 0) {
527 (void) sotpi_close(so, flags, cr);
528 return (error);
529 }
530
531 /* Enable sendfile() on AF_UNIX streams */
532 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
533 mutex_enter(&so->so_lock);
534 so->so_mode |= SM_SENDFILESUPP;
535 mutex_exit(&so->so_lock);
536 }
537
538 /* Wildcard */
539 if (so->so_protocol != so->so_sockparams->sp_protocol) {
540 int protocol = so->so_protocol;
541 /*
542 * Issue SO_PROTOTYPE setsockopt.
543 */
544 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
545 &protocol, (t_uscalar_t)sizeof (protocol), cr);
546 if (error != 0) {
547 (void) sotpi_close(so, flags, cr);
548 /*
549 * Setsockopt often fails with ENOPROTOOPT but
550 * socket() should fail with
551 * EPROTONOSUPPORT/EPROTOTYPE.
552 */
553 return (EPROTONOSUPPORT);
554 }
555 }
556
557 } else {
558 /*
559 * While the same socket can not be reopened (unlike specfs)
560 * the stream head sets STREOPENFAIL when the autopush fails.
561 */
562 if ((stp != NULL) &&
563 (stp->sd_flag & STREOPENFAIL)) {
564 /*
565 * Open failed part way through.
566 */
567 mutex_enter(&stp->sd_lock);
568 stp->sd_flag &= ~STREOPENFAIL;
569 mutex_exit(&stp->sd_lock);
570 (void) sotpi_close(so, flags, cr);
571 return (error);
572 /*NOTREACHED*/
573 }
574 ASSERT(stp == NULL);
575 }
576 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
577 "sockfs open:maj %d vp %p so %p error %d",
578 maj, vp, so, error);
579 return (error);
580 }
581
582 /*
583 * Bind the socket to an unspecified address in sockfs only.
584 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
585 * required in all cases.
586 */
587 static void
so_automatic_bind(struct sonode * so)588 so_automatic_bind(struct sonode *so)
589 {
590 sotpi_info_t *sti = SOTOTPI(so);
591 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
592
593 ASSERT(MUTEX_HELD(&so->so_lock));
594 ASSERT(!(so->so_state & SS_ISBOUND));
595 ASSERT(sti->sti_unbind_mp);
596
597 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
598 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
599 sti->sti_laddr_sa->sa_family = so->so_family;
600 so->so_state |= SS_ISBOUND;
601 }
602
603
/*
 * Bind the socket.
 *
 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
 * are passed in we allow rebinding. Note that for backwards compatibility
 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
 * Thus the rebinding code is currently not executed.
 *
 * The constraints for rebinding are:
 * - it is a SOCK_DGRAM, or
 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 *   and no listen() has been done.
 * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
 * this language is not present in the POSIX socket draft. Thus maybe the
 * rebinding logic should be deleted from the source.
 *
 * A null "name" can be used to unbind the socket if:
 * - it is a SOCK_DGRAM, or
 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
 *   and no listen() has been done.
 */
626 /* ARGSUSED */
627 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)628 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
629 socklen_t namelen, int backlog, int flags, struct cred *cr)
630 {
631 struct T_bind_req bind_req;
632 struct T_bind_ack *bind_ack;
633 int error = 0;
634 mblk_t *mp;
635 void *addr;
636 t_uscalar_t addrlen;
637 int unbind_on_err = 1;
638 boolean_t clear_acceptconn_on_err = B_FALSE;
639 boolean_t restore_backlog_on_err = B_FALSE;
640 int save_so_backlog;
641 t_scalar_t PRIM_type = O_T_BIND_REQ;
642 boolean_t tcp_udp_xport;
643 sotpi_info_t *sti = SOTOTPI(so);
644
645 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
646 (void *)so, (void *)name, namelen, backlog, flags,
647 pr_state(so->so_state, so->so_mode)));
648
649 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
650
651 if (!(flags & _SOBIND_LOCK_HELD)) {
652 mutex_enter(&so->so_lock);
653 so_lock_single(so); /* Set SOLOCKED */
654 } else {
655 ASSERT(MUTEX_HELD(&so->so_lock));
656 ASSERT(so->so_flag & SOLOCKED);
657 }
658
659 /*
660 * Make sure that there is a preallocated unbind_req message
661 * before binding. This message allocated when the socket is
662 * created but it might be have been consumed.
663 */
664 if (sti->sti_unbind_mp == NULL) {
665 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
666 /* NOTE: holding so_lock while sleeping */
667 sti->sti_unbind_mp =
668 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
669 cr);
670 }
671
672 if (flags & _SOBIND_REBIND) {
673 /*
674 * Called from solisten after doing an sotpi_unbind() or
675 * potentially without the unbind (latter for AF_INET{,6}).
676 */
677 ASSERT(name == NULL && namelen == 0);
678
679 if (so->so_family == AF_UNIX) {
680 ASSERT(sti->sti_ux_bound_vp);
681 addr = &sti->sti_ux_laddr;
682 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
683 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
684 "addr 0x%p, vp %p\n",
685 addrlen,
686 (void *)((struct so_ux_addr *)addr)->soua_vp,
687 (void *)sti->sti_ux_bound_vp));
688 } else {
689 addr = sti->sti_laddr_sa;
690 addrlen = (t_uscalar_t)sti->sti_laddr_len;
691 }
692 } else if (flags & _SOBIND_UNSPEC) {
693 ASSERT(name == NULL && namelen == 0);
694
695 /*
696 * The caller checked SS_ISBOUND but not necessarily
697 * under so_lock
698 */
699 if (so->so_state & SS_ISBOUND) {
700 /* No error */
701 goto done;
702 }
703
704 /* Set an initial local address */
705 switch (so->so_family) {
706 case AF_UNIX:
707 /*
708 * Use an address with same size as struct sockaddr
709 * just like BSD.
710 */
711 sti->sti_laddr_len =
712 (socklen_t)sizeof (struct sockaddr);
713 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
714 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
715 sti->sti_laddr_sa->sa_family = so->so_family;
716
717 /*
718 * Pass down an address with the implicit bind
719 * magic number and the rest all zeros.
720 * The transport will return a unique address.
721 */
722 sti->sti_ux_laddr.soua_vp = NULL;
723 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
724 addr = &sti->sti_ux_laddr;
725 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
726 break;
727
728 case AF_INET:
729 case AF_INET6:
730 /*
731 * An unspecified bind in TPI has a NULL address.
732 * Set the address in sockfs to have the sa_family.
733 */
734 sti->sti_laddr_len = (so->so_family == AF_INET) ?
735 (socklen_t)sizeof (sin_t) :
736 (socklen_t)sizeof (sin6_t);
737 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
738 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
739 sti->sti_laddr_sa->sa_family = so->so_family;
740 addr = NULL;
741 addrlen = 0;
742 break;
743
744 default:
745 /*
746 * An unspecified bind in TPI has a NULL address.
747 * Set the address in sockfs to be zero length.
748 *
749 * Can not assume there is a sa_family for all
750 * protocol families. For example, AF_X25 does not
751 * have a family field.
752 */
753 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
754 sti->sti_laddr_len = 0; /* XXX correct? */
755 addr = NULL;
756 addrlen = 0;
757 break;
758 }
759
760 } else {
761 if (so->so_state & SS_ISBOUND) {
762 /*
763 * If it is ok to rebind the socket, first unbind
764 * with the transport. A rebind to the NULL address
765 * is interpreted as an unbind.
766 * Note that a bind to NULL in BSD does unbind the
767 * socket but it fails with EINVAL.
768 * Note that regular sockets set SOV_SOCKBSD i.e.
769 * _SOBIND_SOCKBSD gets set here hence no type of
770 * socket does currently allow rebinding.
771 *
772 * If the name is NULL just do an unbind.
773 */
774 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
775 name != NULL) {
776 error = EINVAL;
777 unbind_on_err = 0;
778 eprintsoline(so, error);
779 goto done;
780 }
781 if ((so->so_mode & SM_CONNREQUIRED) &&
782 (so->so_state & SS_CANTREBIND)) {
783 error = EINVAL;
784 unbind_on_err = 0;
785 eprintsoline(so, error);
786 goto done;
787 }
788 error = sotpi_unbind(so, 0);
789 if (error) {
790 eprintsoline(so, error);
791 goto done;
792 }
793 ASSERT(!(so->so_state & SS_ISBOUND));
794 if (name == NULL) {
795 so->so_state &=
796 ~(SS_ISCONNECTED|SS_ISCONNECTING);
797 goto done;
798 }
799 }
800
801 /* X/Open requires this check */
802 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
803 if (xnet_check_print) {
804 printf("sockfs: X/Open bind state check "
805 "caused EINVAL\n");
806 }
807 error = EINVAL;
808 goto done;
809 }
810
811 switch (so->so_family) {
812 case AF_UNIX:
813 /*
814 * All AF_UNIX addresses are nul terminated
815 * when copied (copyin_name) in so the minimum
816 * length is 3 bytes.
817 */
818 if (name == NULL ||
819 (ssize_t)namelen <= sizeof (short) + 1) {
820 error = EISDIR;
821 eprintsoline(so, error);
822 goto done;
823 }
824 /*
825 * Verify so_family matches the bound family.
826 * BSD does not check this for AF_UNIX resulting
827 * in funny mknods.
828 */
829 if (name->sa_family != so->so_family) {
830 error = EAFNOSUPPORT;
831 goto done;
832 }
833 break;
834 case AF_INET:
835 if (name == NULL) {
836 error = EINVAL;
837 eprintsoline(so, error);
838 goto done;
839 }
840 if ((size_t)namelen != sizeof (sin_t)) {
841 error = name->sa_family != so->so_family ?
842 EAFNOSUPPORT : EINVAL;
843 eprintsoline(so, error);
844 goto done;
845 }
846 if ((flags & _SOBIND_XPG4_2) &&
847 (name->sa_family != so->so_family)) {
848 /*
849 * This check has to be made for X/Open
850 * sockets however application failures have
851 * been observed when it is applied to
852 * all sockets.
853 */
854 error = EAFNOSUPPORT;
855 eprintsoline(so, error);
856 goto done;
857 }
858 /*
859 * Force a zero sa_family to match so_family.
860 *
861 * Some programs like inetd(8) don't set the
862 * family field. Other programs leave
863 * sin_family set to garbage - SunOS 4.X does
864 * not check the family field on a bind.
865 * We use the family field that
866 * was passed in to the socket() call.
867 */
868 name->sa_family = so->so_family;
869 break;
870
871 case AF_INET6: {
872 #ifdef DEBUG
873 sin6_t *sin6 = (sin6_t *)name;
874 #endif /* DEBUG */
875
876 if (name == NULL) {
877 error = EINVAL;
878 eprintsoline(so, error);
879 goto done;
880 }
881 if ((size_t)namelen != sizeof (sin6_t)) {
882 error = name->sa_family != so->so_family ?
883 EAFNOSUPPORT : EINVAL;
884 eprintsoline(so, error);
885 goto done;
886 }
887 if (name->sa_family != so->so_family) {
888 /*
889 * With IPv6 we require the family to match
890 * unlike in IPv4.
891 */
892 error = EAFNOSUPPORT;
893 eprintsoline(so, error);
894 goto done;
895 }
896 #ifdef DEBUG
897 /*
898 * Verify that apps don't forget to clear
899 * sin6_scope_id etc
900 */
901 if (sin6->sin6_scope_id != 0 &&
902 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
903 zcmn_err(getzoneid(), CE_WARN,
904 "bind with uninitialized sin6_scope_id "
905 "(%d) on socket. Pid = %d\n",
906 (int)sin6->sin6_scope_id,
907 (int)curproc->p_pid);
908 }
909 if (sin6->__sin6_src_id != 0) {
910 zcmn_err(getzoneid(), CE_WARN,
911 "bind with uninitialized __sin6_src_id "
912 "(%d) on socket. Pid = %d\n",
913 (int)sin6->__sin6_src_id,
914 (int)curproc->p_pid);
915 }
916 #endif /* DEBUG */
917 break;
918 }
919 default:
920 /*
921 * Don't do any length or sa_family check to allow
922 * non-sockaddr style addresses.
923 */
924 if (name == NULL) {
925 error = EINVAL;
926 eprintsoline(so, error);
927 goto done;
928 }
929 break;
930 }
931
932 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
933 error = ENAMETOOLONG;
934 eprintsoline(so, error);
935 goto done;
936 }
937 /*
938 * Save local address.
939 */
940 sti->sti_laddr_len = (socklen_t)namelen;
941 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
942 bcopy(name, sti->sti_laddr_sa, namelen);
943
944 addr = sti->sti_laddr_sa;
945 addrlen = (t_uscalar_t)sti->sti_laddr_len;
946 switch (so->so_family) {
947 case AF_INET6:
948 case AF_INET:
949 break;
950 case AF_UNIX: {
951 struct sockaddr_un *soun =
952 (struct sockaddr_un *)sti->sti_laddr_sa;
953 struct vnode *vp, *rvp;
954 struct vattr vattr;
955
956 ASSERT(sti->sti_ux_bound_vp == NULL);
957 /*
958 * Create vnode for the specified path name.
959 * Keep vnode held with a reference in sti_ux_bound_vp.
960 * Use the vnode pointer as the address used in the
961 * bind with the transport.
962 *
963 * Use the same mode as in BSD. In particular this does
964 * not observe the umask.
965 */
966 /* MAXPATHLEN + soun_family + nul termination */
967 if (sti->sti_laddr_len >
968 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
969 error = ENAMETOOLONG;
970 eprintsoline(so, error);
971 goto done;
972 }
973 vattr.va_type = VSOCK;
974 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
975 vattr.va_mask = AT_TYPE|AT_MODE;
976 /* NOTE: holding so_lock */
977 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
978 EXCL, 0, &vp, CRMKNOD, 0, 0);
979 if (error) {
980 if (error == EEXIST)
981 error = EADDRINUSE;
982 eprintsoline(so, error);
983 goto done;
984 }
985 /*
986 * Establish pointer from the underlying filesystem
987 * vnode to the socket node.
988 * sti_ux_bound_vp and v_stream->sd_vnode form the
989 * cross-linkage between the underlying filesystem
990 * node and the socket node.
991 */
992
993 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
994 VN_HOLD(rvp);
995 VN_RELE(vp);
996 vp = rvp;
997 }
998
999 ASSERT(SOTOV(so)->v_stream);
1000 mutex_enter(&vp->v_lock);
1001 vp->v_stream = SOTOV(so)->v_stream;
1002 sti->sti_ux_bound_vp = vp;
1003 mutex_exit(&vp->v_lock);
1004
1005 /*
1006 * Use the vnode pointer value as a unique address
1007 * (together with the magic number to avoid conflicts
1008 * with implicit binds) in the transport provider.
1009 */
1010 sti->sti_ux_laddr.soua_vp =
1011 (void *)sti->sti_ux_bound_vp;
1012 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1013 addr = &sti->sti_ux_laddr;
1014 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1015 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1016 addrlen,
1017 (void *)((struct so_ux_addr *)addr)->soua_vp));
1018 break;
1019 }
1020 } /* end switch (so->so_family) */
1021 }
1022
1023 /*
1024 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1025 * the transport can start passing up T_CONN_IND messages
1026 * as soon as it receives the bind req and strsock_proto()
1027 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1028 */
1029 if (flags & _SOBIND_LISTEN) {
1030 if ((so->so_state & SS_ACCEPTCONN) == 0)
1031 clear_acceptconn_on_err = B_TRUE;
1032 save_so_backlog = so->so_backlog;
1033 restore_backlog_on_err = B_TRUE;
1034 so->so_state |= SS_ACCEPTCONN;
1035 so->so_backlog = backlog;
1036 }
1037
1038 /*
1039 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1040 * for other transports we will send in a O_T_BIND_REQ.
1041 */
1042 if (tcp_udp_xport &&
1043 (so->so_family == AF_INET || so->so_family == AF_INET6))
1044 PRIM_type = T_BIND_REQ;
1045
1046 bind_req.PRIM_type = PRIM_type;
1047 bind_req.ADDR_length = addrlen;
1048 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1049 bind_req.CONIND_number = backlog;
1050 /* NOTE: holding so_lock while sleeping */
1051 mp = soallocproto2(&bind_req, sizeof (bind_req),
1052 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1053 sti->sti_laddr_valid = 0;
1054
1055 /* Done using sti_laddr_sa - can drop the lock */
1056 mutex_exit(&so->so_lock);
1057
1058 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1059 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1060 if (error) {
1061 eprintsoline(so, error);
1062 mutex_enter(&so->so_lock);
1063 goto done;
1064 }
1065
1066 mutex_enter(&so->so_lock);
1067 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1068 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1069 if (error) {
1070 eprintsoline(so, error);
1071 goto done;
1072 }
1073 ASSERT(mp);
1074 /*
1075 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1076 * strsock_proto while the lock was dropped above, the bind
1077 * is allowed to complete.
1078 */
1079
1080 /* Mark as bound. This will be undone if we detect errors below. */
1081 if (flags & _SOBIND_NOXLATE) {
1082 ASSERT(so->so_family == AF_UNIX);
1083 sti->sti_faddr_noxlate = 1;
1084 }
1085 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1086 so->so_state |= SS_ISBOUND;
1087 ASSERT(sti->sti_unbind_mp);
1088
1089 /* note that we've already set SS_ACCEPTCONN above */
1090
1091 /*
1092 * Recompute addrlen - an unspecified bind sent down an
1093 * address of length zero but we expect the appropriate length
1094 * in return.
1095 */
1096 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1097 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1098
1099 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1100 /*
1101 * The alignment restriction is really too strict but
1102 * we want enough alignment to inspect the fields of
1103 * a sockaddr_in.
1104 */
1105 addr = sogetoff(mp, bind_ack->ADDR_offset,
1106 bind_ack->ADDR_length,
1107 __TPI_ALIGN_SIZE);
1108 if (addr == NULL) {
1109 freemsg(mp);
1110 error = EPROTO;
1111 eprintsoline(so, error);
1112 goto done;
1113 }
1114 if (!(flags & _SOBIND_UNSPEC)) {
1115 /*
1116 * Verify that the transport didn't return something we
1117 * did not want e.g. an address other than what we asked for.
1118 *
1119 * NOTE: These checks would go away if/when we switch to
1120 * using the new TPI (in which the transport would fail
1121 * the request instead of assigning a different address).
1122 *
1123 * NOTE2: For protocols that we don't know (i.e. any
1124 * other than AF_INET6, AF_INET and AF_UNIX), we
1125 * cannot know if the transport should be expected to
1126 * return the same address as that requested.
1127 *
1128 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1129 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1130 *
1131 * For example, in the case of netatalk it may be
1132 * inappropriate for the transport to return the
1133 * requested address (as it may have allocated a local
1134 * port number in behaviour similar to that of an
1135 * AF_INET bind request with a port number of zero).
1136 *
1137 * Given the definition of O_T_BIND_REQ, where the
1138 * transport may bind to an address other than the
1139 * requested address, it's not possible to determine
1140 * whether a returned address that differs from the
1141 * requested address is a reason to fail (because the
1142 * requested address was not available) or succeed
1143 * (because the transport allocated an appropriate
1144 * address and/or port).
1145 *
1146 * sockfs currently requires that the transport return
1147 * the requested address in the T_BIND_ACK, unless
1148 * there is code here to allow for any discrepancy.
1149 * Such code exists for AF_INET and AF_INET6.
1150 *
1151 * Netatalk chooses to return the requested address
1152 * rather than the (correct) allocated address. This
1153 * means that netatalk violates the TPI specification
1154 * (and would not function correctly if used from a
1155 * TLI application), but it does mean that it works
1156 * with sockfs.
1157 *
1158 * As noted above, using the newer XTI bind primitive
1159 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1160 * allow sockfs to be more sure about whether or not
1161 * the bind request had succeeded (as transports are
1162 * not permitted to bind to a different address than
1163 * that requested - they must return failure).
1164 * Unfortunately, support for T_BIND_REQ may not be
1165 * present in all transport implementations (netatalk,
1166 * for example, doesn't have it), making the
1167 * transition difficult.
1168 */
1169 if (bind_ack->ADDR_length != addrlen) {
1170 /* Assumes that the requested address was in use */
1171 freemsg(mp);
1172 error = EADDRINUSE;
1173 eprintsoline(so, error);
1174 goto done;
1175 }
1176
1177 switch (so->so_family) {
1178 case AF_INET6:
1179 case AF_INET: {
1180 sin_t *rname, *aname;
1181
1182 rname = (sin_t *)addr;
1183 aname = (sin_t *)sti->sti_laddr_sa;
1184
1185 /*
1186 * Take advantage of the alignment
1187 * of sin_port and sin6_port which fall
1188 * in the same place in their data structures.
1189 * Just use sin_port for either address family.
1190 *
1191 * This may become a problem if (heaven forbid)
1192 * there's a separate ipv6port_reserved... :-P
1193 *
1194 * Binding to port 0 has the semantics of letting
1195 * the transport bind to any port.
1196 *
1197 * If the transport is TCP or UDP since we had sent
1198 * a T_BIND_REQ we would not get a port other than
1199 * what we asked for.
1200 */
1201 if (tcp_udp_xport) {
1202 /*
1203 * Pick up the new port number if we bound to
1204 * port 0.
1205 */
1206 if (aname->sin_port == 0)
1207 aname->sin_port = rname->sin_port;
1208 sti->sti_laddr_valid = 1;
1209 break;
1210 }
1211 if (aname->sin_port != 0 &&
1212 aname->sin_port != rname->sin_port) {
1213 freemsg(mp);
1214 error = EADDRINUSE;
1215 eprintsoline(so, error);
1216 goto done;
1217 }
1218 /*
1219 * Pick up the new port number if we bound to port 0.
1220 */
1221 aname->sin_port = rname->sin_port;
1222
1223 /*
1224 * Unfortunately, addresses aren't _quite_ the same.
1225 */
1226 if (so->so_family == AF_INET) {
1227 if (aname->sin_addr.s_addr !=
1228 rname->sin_addr.s_addr) {
1229 freemsg(mp);
1230 error = EADDRNOTAVAIL;
1231 eprintsoline(so, error);
1232 goto done;
1233 }
1234 } else {
1235 sin6_t *rname6 = (sin6_t *)rname;
1236 sin6_t *aname6 = (sin6_t *)aname;
1237
1238 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1239 &rname6->sin6_addr)) {
1240 freemsg(mp);
1241 error = EADDRNOTAVAIL;
1242 eprintsoline(so, error);
1243 goto done;
1244 }
1245 }
1246 break;
1247 }
1248 case AF_UNIX:
1249 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1250 freemsg(mp);
1251 error = EADDRINUSE;
1252 eprintsoline(so, error);
1253 eprintso(so,
1254 ("addrlen %d, addr 0x%x, vp %p\n",
1255 addrlen, *((int *)addr),
1256 (void *)sti->sti_ux_bound_vp));
1257 goto done;
1258 }
1259 sti->sti_laddr_valid = 1;
1260 break;
1261 default:
1262 /*
1263 * NOTE: This assumes that addresses can be
1264 * byte-compared for equivalence.
1265 */
1266 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1267 freemsg(mp);
1268 error = EADDRINUSE;
1269 eprintsoline(so, error);
1270 goto done;
1271 }
1272 /*
1273 * Don't mark sti_laddr_valid, as we cannot be
1274 * sure that the returned address is the real
1275 * bound address when talking to an unknown
1276 * transport.
1277 */
1278 break;
1279 }
1280 } else {
1281 /*
1282 * Save the returned address for getsockname.
1283 * Needed for unspecific bind unless transport supports
1284 * the TI_GETMYNAME ioctl.
1285 * Do this for AF_INET{,6} even though they do, as
1286 * caching info here is much better performance than
1287 * a TPI/STREAMS trip to the transport for getsockname.
1288 * Any which can't for some reason _must_ _not_ set
1289 * sti_laddr_valid here for the caching version of
1290 * getsockname to not break;
1291 */
1292 switch (so->so_family) {
1293 case AF_UNIX:
1294 /*
1295 * Record the address bound with the transport
1296 * for use by socketpair.
1297 */
1298 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1299 sti->sti_laddr_valid = 1;
1300 break;
1301 case AF_INET:
1302 case AF_INET6:
1303 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1304 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1305 sti->sti_laddr_valid = 1;
1306 break;
1307 default:
1308 /*
1309 * Don't mark sti_laddr_valid, as we cannot be
1310 * sure that the returned address is the real
1311 * bound address when talking to an unknown
1312 * transport.
1313 */
1314 break;
1315 }
1316 }
1317
1318 freemsg(mp);
1319
1320 done:
1321 if (error) {
1322 /* reset state & backlog to values held on entry */
1323 if (clear_acceptconn_on_err == B_TRUE)
1324 so->so_state &= ~SS_ACCEPTCONN;
1325 if (restore_backlog_on_err == B_TRUE)
1326 so->so_backlog = save_so_backlog;
1327
1328 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1329 int err;
1330
1331 err = sotpi_unbind(so, 0);
1332 /* LINTED - statement has no consequent: if */
1333 if (err) {
1334 eprintsoline(so, error);
1335 } else {
1336 ASSERT(!(so->so_state & SS_ISBOUND));
1337 }
1338 }
1339 }
1340 if (!(flags & _SOBIND_LOCK_HELD)) {
1341 so_unlock_single(so, SOLOCKED);
1342 mutex_exit(&so->so_lock);
1343 } else {
1344 ASSERT(MUTEX_HELD(&so->so_lock));
1345 ASSERT(so->so_flag & SOLOCKED);
1346 }
1347 return (error);
1348 }
1349
1350 /* bind the socket */
1351 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1352 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1353 int flags, struct cred *cr)
1354 {
1355 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1356 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1357
1358 flags &= ~_SOBIND_SOCKETPAIR;
1359 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1360 }
1361
1362 /*
1363 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1364 * address, or when listen needs to unbind and bind.
1365 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1366 * so that a sobind can pick them up.
1367 */
1368 static int
sotpi_unbind(struct sonode * so,int flags)1369 sotpi_unbind(struct sonode *so, int flags)
1370 {
1371 struct T_unbind_req unbind_req;
1372 int error = 0;
1373 mblk_t *mp;
1374 sotpi_info_t *sti = SOTOTPI(so);
1375
1376 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1377 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1378
1379 ASSERT(MUTEX_HELD(&so->so_lock));
1380 ASSERT(so->so_flag & SOLOCKED);
1381
1382 if (!(so->so_state & SS_ISBOUND)) {
1383 error = EINVAL;
1384 eprintsoline(so, error);
1385 goto done;
1386 }
1387
1388 mutex_exit(&so->so_lock);
1389
1390 /*
1391 * Flush the read and write side (except stream head read queue)
1392 * and send down T_UNBIND_REQ.
1393 */
1394 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1395
1396 unbind_req.PRIM_type = T_UNBIND_REQ;
1397 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1398 0, _ALLOC_SLEEP, CRED());
1399 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1400 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1401 mutex_enter(&so->so_lock);
1402 if (error) {
1403 eprintsoline(so, error);
1404 goto done;
1405 }
1406
1407 error = sowaitokack(so, T_UNBIND_REQ);
1408 if (error) {
1409 eprintsoline(so, error);
1410 goto done;
1411 }
1412
1413 /*
1414 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1415 * strsock_proto while the lock was dropped above, the unbind
1416 * is allowed to complete.
1417 */
1418 if (!(flags & _SOUNBIND_REBIND)) {
1419 /*
1420 * Clear out bound address.
1421 */
1422 vnode_t *vp;
1423
1424 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1425 sti->sti_ux_bound_vp = NULL;
1426 vn_rele_stream(vp);
1427 }
1428 /* Clear out address */
1429 sti->sti_laddr_len = 0;
1430 }
1431 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1432 sti->sti_laddr_valid = 0;
1433
1434 done:
1435
1436 /* If the caller held the lock don't release it here */
1437 ASSERT(MUTEX_HELD(&so->so_lock));
1438 ASSERT(so->so_flag & SOLOCKED);
1439
1440 return (error);
1441 }
1442
1443 /*
1444 * listen on the socket.
1445 * For TPI conforming transports this has to first unbind with the transport
1446 * and then bind again using the new backlog.
1447 */
1448 /* ARGSUSED */
1449 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1450 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1451 {
1452 int error = 0;
1453 sotpi_info_t *sti = SOTOTPI(so);
1454
1455 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1456 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1457
1458 if (sti->sti_serv_type == T_CLTS)
1459 return (EOPNOTSUPP);
1460
1461 /*
1462 * If the socket is ready to accept connections already, then
1463 * return without doing anything. This avoids a problem where
1464 * a second listen() call fails if a connection is pending and
1465 * leaves the socket unbound. Only when we are not unbinding
1466 * with the transport can we safely increase the backlog.
1467 */
1468 if (so->so_state & SS_ACCEPTCONN &&
1469 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1470 /*CONSTCOND*/
1471 !solisten_tpi_tcp))
1472 return (0);
1473
1474 if (so->so_state & SS_ISCONNECTED)
1475 return (EINVAL);
1476
1477 mutex_enter(&so->so_lock);
1478 so_lock_single(so); /* Set SOLOCKED */
1479
1480 /*
1481 * If the listen doesn't change the backlog we do nothing.
1482 * This avoids an EPROTO error from the transport.
1483 */
1484 if ((so->so_state & SS_ACCEPTCONN) &&
1485 so->so_backlog == backlog)
1486 goto done;
1487
1488 if (!(so->so_state & SS_ISBOUND)) {
1489 /*
1490 * Must have been explicitly bound in the UNIX domain.
1491 */
1492 if (so->so_family == AF_UNIX) {
1493 error = EINVAL;
1494 goto done;
1495 }
1496 error = sotpi_bindlisten(so, NULL, 0, backlog,
1497 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1498 } else if (backlog > 0) {
1499 /*
1500 * AF_INET{,6} hack to avoid losing the port.
1501 * Assumes that all AF_INET{,6} transports can handle a
1502 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1503 * has already bound thus it is possible to avoid the unbind.
1504 */
1505 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1506 /*CONSTCOND*/
1507 !solisten_tpi_tcp)) {
1508 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1509 if (error)
1510 goto done;
1511 }
1512 error = sotpi_bindlisten(so, NULL, 0, backlog,
1513 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1514 } else {
1515 so->so_state |= SS_ACCEPTCONN;
1516 so->so_backlog = backlog;
1517 }
1518 if (error)
1519 goto done;
1520 ASSERT(so->so_state & SS_ACCEPTCONN);
1521 done:
1522 so_unlock_single(so, SOLOCKED);
1523 mutex_exit(&so->so_lock);
1524 return (error);
1525 }
1526
1527 /*
1528 * Disconnect either a specified seqno or all (-1).
1529 * The former is used on listening sockets only.
1530 *
1531 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1532 * the current use of sodisconnect(seqno == -1) is only for shutdown
1533 * so there is no point (and potentially incorrect) to unbind.
1534 */
1535 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1536 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1537 {
1538 struct T_discon_req discon_req;
1539 int error = 0;
1540 mblk_t *mp;
1541
1542 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1543 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1544
1545 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1546 mutex_enter(&so->so_lock);
1547 so_lock_single(so); /* Set SOLOCKED */
1548 } else {
1549 ASSERT(MUTEX_HELD(&so->so_lock));
1550 ASSERT(so->so_flag & SOLOCKED);
1551 }
1552
1553 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1554 error = EINVAL;
1555 eprintsoline(so, error);
1556 goto done;
1557 }
1558
1559 mutex_exit(&so->so_lock);
1560 /*
1561 * Flush the write side (unless this is a listener)
1562 * and then send down a T_DISCON_REQ.
1563 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1564 * and other messages.)
1565 */
1566 if (!(so->so_state & SS_ACCEPTCONN))
1567 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1568
1569 discon_req.PRIM_type = T_DISCON_REQ;
1570 discon_req.SEQ_number = seqno;
1571 mp = soallocproto1(&discon_req, sizeof (discon_req),
1572 0, _ALLOC_SLEEP, CRED());
1573 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1574 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1575 mutex_enter(&so->so_lock);
1576 if (error) {
1577 eprintsoline(so, error);
1578 goto done;
1579 }
1580
1581 error = sowaitokack(so, T_DISCON_REQ);
1582 if (error) {
1583 eprintsoline(so, error);
1584 goto done;
1585 }
1586 /*
1587 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1588 * strsock_proto while the lock was dropped above, the disconnect
1589 * is allowed to complete. However, it is not possible to
1590 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1591 */
1592 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1593 SOTOTPI(so)->sti_laddr_valid = 0;
1594 SOTOTPI(so)->sti_faddr_valid = 0;
1595 done:
1596 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1597 so_unlock_single(so, SOLOCKED);
1598 mutex_exit(&so->so_lock);
1599 } else {
1600 /* If the caller held the lock don't release it here */
1601 ASSERT(MUTEX_HELD(&so->so_lock));
1602 ASSERT(so->so_flag & SOLOCKED);
1603 }
1604 return (error);
1605 }
1606
1607 /* ARGSUSED */
1608 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1609 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1610 struct sonode **nsop)
1611 {
1612 struct T_conn_ind *conn_ind;
1613 struct T_conn_res *conn_res;
1614 int error = 0;
1615 mblk_t *mp, *ack_mp;
1616 struct sonode *nso;
1617 vnode_t *nvp;
1618 void *src;
1619 t_uscalar_t srclen;
1620 void *opt;
1621 t_uscalar_t optlen;
1622 t_scalar_t PRIM_type;
1623 t_scalar_t SEQ_number;
1624 size_t sinlen;
1625 sotpi_info_t *sti = SOTOTPI(so);
1626 sotpi_info_t *nsti;
1627
1628 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1629 (void *)so, fflag, (void *)nsop,
1630 pr_state(so->so_state, so->so_mode)));
1631
1632 /*
1633 * Defer single-threading the accepting socket until
1634 * the T_CONN_IND has been received and parsed and the
1635 * new sonode has been opened.
1636 */
1637
1638 /* Check that we are not already connected */
1639 if ((so->so_state & SS_ACCEPTCONN) == 0)
1640 goto conn_bad;
1641 again:
1642 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1643 goto e_bad;
1644
1645 ASSERT(mp != NULL);
1646 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1647
1648 /*
1649 * Save SEQ_number for error paths.
1650 */
1651 SEQ_number = conn_ind->SEQ_number;
1652
1653 srclen = conn_ind->SRC_length;
1654 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1655 if (src == NULL) {
1656 error = EPROTO;
1657 freemsg(mp);
1658 eprintsoline(so, error);
1659 goto disconnect_unlocked;
1660 }
1661 optlen = conn_ind->OPT_length;
1662 switch (so->so_family) {
1663 case AF_INET:
1664 case AF_INET6:
1665 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1666 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1667 &opt, conn_ind->OPT_length);
1668 } else {
1669 /*
1670 * The transport (in this case TCP) hasn't sent up
1671 * a pointer to an instance for the accept fast-path.
1672 * Disable fast-path completely because the call to
1673 * sotpi_create() below would otherwise create an
1674 * incomplete TCP instance, which would lead to
1675 * problems when sockfs sends a normal T_CONN_RES
1676 * message down the new stream.
1677 */
1678 if (sti->sti_direct) {
1679 int rval;
1680 /*
1681 * For consistency we inform tcp to disable
1682 * direct interface on the listener, though
1683 * we can certainly live without doing this
1684 * because no data will ever travel upstream
1685 * on the listening socket.
1686 */
1687 sti->sti_direct = 0;
1688 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1689 0, 0, K_TO_K, cr, &rval);
1690 }
1691 opt = NULL;
1692 optlen = 0;
1693 }
1694 break;
1695 case AF_UNIX:
1696 default:
1697 if (optlen != 0) {
1698 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1699 __TPI_ALIGN_SIZE);
1700 if (opt == NULL) {
1701 error = EPROTO;
1702 freemsg(mp);
1703 eprintsoline(so, error);
1704 goto disconnect_unlocked;
1705 }
1706 }
1707 if (so->so_family == AF_UNIX) {
1708 if (!sti->sti_faddr_noxlate) {
1709 src = NULL;
1710 srclen = 0;
1711 }
1712 /* Extract src address from options */
1713 if (optlen != 0)
1714 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1715 }
1716 break;
1717 }
1718
1719 /*
1720 * Create the new socket.
1721 */
1722 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1723 if (nso == NULL) {
1724 ASSERT(error != 0);
1725 /*
1726 * Accept can not fail with ENOBUFS. sotpi_create
1727 * sleeps waiting for memory until a signal is caught
1728 * so return EINTR.
1729 */
1730 freemsg(mp);
1731 if (error == ENOBUFS)
1732 error = EINTR;
1733 goto e_disc_unl;
1734 }
1735 nvp = SOTOV(nso);
1736 nsti = SOTOTPI(nso);
1737
1738 #ifdef DEBUG
1739 /*
1740 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1741 * it's inherited early to allow debugging of the accept code itself.
1742 */
1743 nso->so_options |= so->so_options & SO_DEBUG;
1744 #endif /* DEBUG */
1745
1746 /*
1747 * Save the SRC address from the T_CONN_IND
1748 * for getpeername to work on AF_UNIX and on transports that do not
1749 * support TI_GETPEERNAME.
1750 *
1751 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1752 * copyin_name().
1753 */
1754 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1755 error = EINVAL;
1756 freemsg(mp);
1757 eprintsoline(so, error);
1758 goto disconnect_vp_unlocked;
1759 }
1760 nsti->sti_faddr_len = (socklen_t)srclen;
1761 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1762 bcopy(src, nsti->sti_faddr_sa, srclen);
1763 nsti->sti_faddr_valid = 1;
1764
1765 /*
1766 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1767 */
1768 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1769 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1770 cred_t *cr;
1771 pid_t cpid;
1772
1773 cr = msg_getcred(mp, &cpid);
1774 if (cr != NULL) {
1775 crhold(cr);
1776 nso->so_peercred = cr;
1777 nso->so_cpid = cpid;
1778 }
1779 freemsg(mp);
1780
1781 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1782 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1783 if (mp == NULL) {
1784 /*
1785 * Accept can not fail with ENOBUFS.
1786 * A signal was caught so return EINTR.
1787 */
1788 error = EINTR;
1789 eprintsoline(so, error);
1790 goto disconnect_vp_unlocked;
1791 }
1792 conn_res = (struct T_conn_res *)mp->b_rptr;
1793 } else {
1794 /*
1795 * For efficency reasons we use msg_extractcred; no crhold
1796 * needed since db_credp is cleared (i.e., we move the cred
1797 * from the message to so_peercred.
1798 */
1799 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1800
1801 mp->b_rptr = DB_BASE(mp);
1802 conn_res = (struct T_conn_res *)mp->b_rptr;
1803 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1804
1805 mblk_setcred(mp, cr, curproc->p_pid);
1806 }
1807
1808 /*
1809 * New socket must be bound at least in sockfs and, except for AF_INET,
1810 * (or AF_INET6) it also has to be bound in the transport provider.
1811 * We set the local address in the sonode from the T_OK_ACK of the
1812 * T_CONN_RES. For this reason the address we bind to here isn't
1813 * important.
1814 */
1815 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1816 /*CONSTCOND*/
1817 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1818 /*
1819 * Optimization for AF_INET{,6} transports
1820 * that can handle a T_CONN_RES without being bound.
1821 */
1822 mutex_enter(&nso->so_lock);
1823 so_automatic_bind(nso);
1824 mutex_exit(&nso->so_lock);
1825 } else {
1826 /* Perform NULL bind with the transport provider. */
1827 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1828 cr)) != 0) {
1829 ASSERT(error != ENOBUFS);
1830 freemsg(mp);
1831 eprintsoline(nso, error);
1832 goto disconnect_vp_unlocked;
1833 }
1834 }
1835
1836 /*
1837 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1838 * so that any data arriving on the new socket will cause the
1839 * appropriate signals to be delivered for the new socket.
1840 *
1841 * No other thread (except strsock_proto and strsock_misc)
1842 * can access the new socket thus we relax the locking.
1843 */
1844 nso->so_pgrp = so->so_pgrp;
1845 nso->so_state |= so->so_state & SS_ASYNC;
1846 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1847
1848 if (nso->so_pgrp != 0) {
1849 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1850 eprintsoline(nso, error);
1851 error = 0;
1852 nso->so_pgrp = 0;
1853 }
1854 }
1855
1856 /*
1857 * Make note of the socket level options. TCP and IP level options
1858 * are already inherited. We could do all this after accept is
1859 * successful but doing it here simplifies code and no harm done
1860 * for error case.
1861 */
1862 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1863 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1864 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1865 nso->so_sndbuf = so->so_sndbuf;
1866 nso->so_rcvbuf = so->so_rcvbuf;
1867 if (nso->so_options & SO_LINGER)
1868 nso->so_linger = so->so_linger;
1869
1870 /*
1871 * Note that the following sti_direct code path should be
1872 * removed once we are confident that the direct sockets
1873 * do not result in any degradation.
1874 */
1875 if (sti->sti_direct) {
1876
1877 ASSERT(opt != NULL);
1878
1879 conn_res->OPT_length = optlen;
1880 conn_res->OPT_offset = MBLKL(mp);
1881 bcopy(&opt, mp->b_wptr, optlen);
1882 mp->b_wptr += optlen;
1883 conn_res->PRIM_type = T_CONN_RES;
1884 conn_res->ACCEPTOR_id = 0;
1885 PRIM_type = T_CONN_RES;
1886
1887 /* Send down the T_CONN_RES on acceptor STREAM */
1888 error = kstrputmsg(SOTOV(nso), mp, NULL,
1889 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1890 if (error) {
1891 mutex_enter(&so->so_lock);
1892 so_lock_single(so);
1893 eprintsoline(so, error);
1894 goto disconnect_vp;
1895 }
1896 mutex_enter(&nso->so_lock);
1897 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1898 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1899 if (error) {
1900 mutex_exit(&nso->so_lock);
1901 mutex_enter(&so->so_lock);
1902 so_lock_single(so);
1903 eprintsoline(so, error);
1904 goto disconnect_vp;
1905 }
1906 if (nso->so_family == AF_INET) {
1907 sin_t *sin;
1908
1909 sin = (sin_t *)(ack_mp->b_rptr +
1910 sizeof (struct T_ok_ack));
1911 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1912 nsti->sti_laddr_len = sizeof (sin_t);
1913 } else {
1914 sin6_t *sin6;
1915
1916 sin6 = (sin6_t *)(ack_mp->b_rptr +
1917 sizeof (struct T_ok_ack));
1918 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1919 nsti->sti_laddr_len = sizeof (sin6_t);
1920 }
1921 freemsg(ack_mp);
1922
1923 nso->so_state |= SS_ISCONNECTED;
1924 nso->so_proto_handle = (sock_lower_handle_t)opt;
1925 nsti->sti_laddr_valid = 1;
1926
1927 mutex_exit(&nso->so_lock);
1928
1929 /*
1930 * It's possible, through the use of autopush for example,
1931 * that the acceptor stream may not support sti_direct
1932 * semantics. If the new socket does not support sti_direct
1933 * we issue a _SIOCSOCKFALLBACK to inform the transport
1934 * as we would in the I_PUSH case.
1935 */
1936 if (nsti->sti_direct == 0) {
1937 int rval;
1938
1939 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1940 0, 0, K_TO_K, cr, &rval)) != 0) {
1941 mutex_enter(&so->so_lock);
1942 so_lock_single(so);
1943 eprintsoline(so, error);
1944 goto disconnect_vp;
1945 }
1946 }
1947
1948 /*
1949 * Pass out new socket.
1950 */
1951 if (nsop != NULL)
1952 *nsop = nso;
1953
1954 return (0);
1955 }
1956
1957 /*
1958 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1959 * which don't support the FireEngine accept fast-path. It is also
1960 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1961 * again. Neither sockfs nor TCP attempt to find out if some other
1962 * random module has been inserted in between (in which case we
1963 * should follow TLI accept behaviour). We blindly assume the worst
1964 * case and revert back to old behaviour i.e. TCP will not send us
1965 * any option (eager) and the accept should happen on the listener
1966 * queue. Any queued T_conn_ind have already got their options removed
1967 * by so_sock2_stream() when "sockmod" was I_POP'd.
1968 */
1969 /*
1970 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1971 */
1972 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1973 #ifdef _ILP32
1974 queue_t *q;
1975
1976 /*
1977 * Find read queue in driver
1978 * Can safely do this since we "own" nso/nvp.
1979 */
1980 q = strvp2wq(nvp)->q_next;
1981 while (SAMESTR(q))
1982 q = q->q_next;
1983 q = RD(q);
1984 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1985 #else
1986 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1987 #endif /* _ILP32 */
1988 conn_res->PRIM_type = O_T_CONN_RES;
1989 PRIM_type = O_T_CONN_RES;
1990 } else {
1991 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1992 conn_res->PRIM_type = T_CONN_RES;
1993 PRIM_type = T_CONN_RES;
1994 }
1995 conn_res->SEQ_number = SEQ_number;
1996 conn_res->OPT_length = 0;
1997 conn_res->OPT_offset = 0;
1998
1999 mutex_enter(&so->so_lock);
2000 so_lock_single(so); /* Set SOLOCKED */
2001 mutex_exit(&so->so_lock);
2002
2003 error = kstrputmsg(SOTOV(so), mp, NULL,
2004 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2005 mutex_enter(&so->so_lock);
2006 if (error) {
2007 eprintsoline(so, error);
2008 goto disconnect_vp;
2009 }
2010 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2011 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2012 if (error) {
2013 eprintsoline(so, error);
2014 goto disconnect_vp;
2015 }
2016 mutex_exit(&so->so_lock);
2017 /*
2018 * If there is a sin/sin6 appended onto the T_OK_ACK use
2019 * that to set the local address. If this is not present
2020 * then we zero out the address and don't set the
2021 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2022 * the pathname from the listening socket.
2023 * In the case where this is TCP or an AF_UNIX socket the
2024 * client side may have queued data or a T_ORDREL in the
2025 * transport. Having now sent the T_CONN_RES we may receive
2026 * those queued messages at any time. Hold the acceptor
2027 * so_lock until its state and laddr are finalized.
2028 */
2029 mutex_enter(&nso->so_lock);
2030 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2031 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2032 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2033 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2034 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2035 nsti->sti_laddr_len = sinlen;
2036 nsti->sti_laddr_valid = 1;
2037 } else if (nso->so_family == AF_UNIX) {
2038 ASSERT(so->so_family == AF_UNIX);
2039 nsti->sti_laddr_len = sti->sti_laddr_len;
2040 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2041 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2042 nsti->sti_laddr_len);
2043 nsti->sti_laddr_valid = 1;
2044 } else {
2045 nsti->sti_laddr_len = sti->sti_laddr_len;
2046 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2047 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2048 nsti->sti_laddr_sa->sa_family = nso->so_family;
2049 }
2050 nso->so_state |= SS_ISCONNECTED;
2051 mutex_exit(&nso->so_lock);
2052
2053 freemsg(ack_mp);
2054
2055 mutex_enter(&so->so_lock);
2056 so_unlock_single(so, SOLOCKED);
2057 mutex_exit(&so->so_lock);
2058
2059 /*
2060 * Pass out new socket.
2061 */
2062 if (nsop != NULL)
2063 *nsop = nso;
2064
2065 return (0);
2066
2067
2068 eproto_disc_unl:
2069 error = EPROTO;
2070 e_disc_unl:
2071 eprintsoline(so, error);
2072 goto disconnect_unlocked;
2073
2074 pr_disc_vp_unl:
2075 eprintsoline(so, error);
2076 disconnect_vp_unlocked:
2077 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2078 VN_RELE(nvp);
2079 disconnect_unlocked:
2080 (void) sodisconnect(so, SEQ_number, 0);
2081 return (error);
2082
2083 pr_disc_vp:
2084 eprintsoline(so, error);
2085 disconnect_vp:
2086 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2087 so_unlock_single(so, SOLOCKED);
2088 mutex_exit(&so->so_lock);
2089 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2090 VN_RELE(nvp);
2091 return (error);
2092
2093 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2094 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2095 ? EOPNOTSUPP : EINVAL;
2096 e_bad:
2097 eprintsoline(so, error);
2098 return (error);
2099 }
2100
2101 /*
2102 * connect a socket.
2103 *
2104 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2105 * unconnect (by specifying a null address).
2106 */
2107 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2108 sotpi_connect(struct sonode *so,
2109 struct sockaddr *name,
2110 socklen_t namelen,
2111 int fflag,
2112 int flags,
2113 struct cred *cr)
2114 {
2115 struct T_conn_req conn_req;
2116 int error = 0;
2117 mblk_t *mp;
2118 void *src;
2119 socklen_t srclen;
2120 void *addr;
2121 socklen_t addrlen;
2122 boolean_t need_unlock;
2123 sotpi_info_t *sti = SOTOTPI(so);
2124
2125 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2126 (void *)so, (void *)name, namelen, fflag, flags,
2127 pr_state(so->so_state, so->so_mode)));
2128
2129 /*
2130 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2131 * avoid sleeping for memory with SOLOCKED held.
2132 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2133 * + sizeof (struct T_opthdr).
2134 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2135 * exceed sti_faddr_maxlen).
2136 */
2137 mp = soallocproto(sizeof (struct T_conn_req) +
2138 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2139 cr);
2140 if (mp == NULL) {
2141 /*
2142 * Connect can not fail with ENOBUFS. A signal was
2143 * caught so return EINTR.
2144 */
2145 error = EINTR;
2146 eprintsoline(so, error);
2147 return (error);
2148 }
2149
2150 mutex_enter(&so->so_lock);
2151 /*
2152 * Make sure there is a preallocated T_unbind_req message
2153 * before any binding. This message is allocated when the
2154 * socket is created. Since another thread can consume
2155 * so_unbind_mp by the time we return from so_lock_single(),
2156 * we should check the availability of so_unbind_mp after
2157 * we return from so_lock_single().
2158 */
2159
2160 so_lock_single(so); /* Set SOLOCKED */
2161 need_unlock = B_TRUE;
2162
2163 if (sti->sti_unbind_mp == NULL) {
2164 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2165 /* NOTE: holding so_lock while sleeping */
2166 sti->sti_unbind_mp =
2167 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2168 if (sti->sti_unbind_mp == NULL) {
2169 error = EINTR;
2170 goto done;
2171 }
2172 }
2173
2174 /*
2175 * Can't have done a listen before connecting.
2176 */
2177 if (so->so_state & SS_ACCEPTCONN) {
2178 error = EOPNOTSUPP;
2179 goto done;
2180 }
2181
2182 /*
2183 * Must be bound with the transport
2184 */
2185 if (!(so->so_state & SS_ISBOUND)) {
2186 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2187 /*CONSTCOND*/
2188 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2189 /*
2190 * Optimization for AF_INET{,6} transports
2191 * that can handle a T_CONN_REQ without being bound.
2192 */
2193 so_automatic_bind(so);
2194 } else {
2195 error = sotpi_bind(so, NULL, 0,
2196 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2197 if (error)
2198 goto done;
2199 }
2200 ASSERT(so->so_state & SS_ISBOUND);
2201 flags |= _SOCONNECT_DID_BIND;
2202 }
2203
2204 /*
2205 * Handle a connect to a name parameter of type AF_UNSPEC like a
2206 * connect to a null address. This is the portable method to
2207 * unconnect a socket.
2208 */
2209 if ((namelen >= sizeof (sa_family_t)) &&
2210 (name->sa_family == AF_UNSPEC)) {
2211 name = NULL;
2212 namelen = 0;
2213 }
2214
2215 /*
2216 * Check that we are not already connected.
2217 * A connection-oriented socket cannot be reconnected.
2218 * A connected connection-less socket can be
2219 * - connected to a different address by a subsequent connect
2220 * - "unconnected" by a connect to the NULL address
2221 */
2222 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2223 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2224 if (so->so_mode & SM_CONNREQUIRED) {
2225 /* Connection-oriented socket */
2226 error = so->so_state & SS_ISCONNECTED ?
2227 EISCONN : EALREADY;
2228 goto done;
2229 }
2230 /* Connection-less socket */
2231 if (name == NULL) {
2232 /*
2233 * Remove the connected state and clear SO_DGRAM_ERRIND
2234 * since it was set when the socket was connected.
2235 * If this is UDP also send down a T_DISCON_REQ.
2236 */
2237 int val;
2238
2239 if ((so->so_family == AF_INET ||
2240 so->so_family == AF_INET6) &&
2241 (so->so_type == SOCK_DGRAM ||
2242 so->so_type == SOCK_RAW) &&
2243 /*CONSTCOND*/
2244 !soconnect_tpi_udp) {
2245 /* XXX What about implicitly unbinding here? */
2246 error = sodisconnect(so, -1,
2247 _SODISCONNECT_LOCK_HELD);
2248 } else {
2249 so->so_state &=
2250 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2251 sti->sti_faddr_valid = 0;
2252 sti->sti_faddr_len = 0;
2253 }
2254
2255 /* Remove SOLOCKED since setsockopt will grab it */
2256 so_unlock_single(so, SOLOCKED);
2257 mutex_exit(&so->so_lock);
2258
2259 val = 0;
2260 (void) sotpi_setsockopt(so, SOL_SOCKET,
2261 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2262 cr);
2263
2264 mutex_enter(&so->so_lock);
2265 so_lock_single(so); /* Set SOLOCKED */
2266 goto done;
2267 }
2268 }
2269 ASSERT(so->so_state & SS_ISBOUND);
2270
2271 if (name == NULL || namelen == 0) {
2272 error = EINVAL;
2273 goto done;
2274 }
2275 /*
2276 * Mark the socket if sti_faddr_sa represents the transport level
2277 * address.
2278 */
2279 if (flags & _SOCONNECT_NOXLATE) {
2280 struct sockaddr_ux *soaddr_ux;
2281
2282 ASSERT(so->so_family == AF_UNIX);
2283 if (namelen != sizeof (struct sockaddr_ux)) {
2284 error = EINVAL;
2285 goto done;
2286 }
2287 soaddr_ux = (struct sockaddr_ux *)name;
2288 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2289 namelen = sizeof (soaddr_ux->sou_addr);
2290 sti->sti_faddr_noxlate = 1;
2291 }
2292
2293 /*
2294 * Length and family checks.
2295 */
2296 error = so_addr_verify(so, name, namelen);
2297 if (error)
2298 goto bad;
2299
2300 /*
2301 * Save foreign address. Needed for AF_UNIX as well as
2302 * transport providers that do not support TI_GETPEERNAME.
2303 * Also used for cached foreign address for TCP and UDP.
2304 */
2305 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2306 error = EINVAL;
2307 goto done;
2308 }
2309 sti->sti_faddr_len = (socklen_t)namelen;
2310 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2311 bcopy(name, sti->sti_faddr_sa, namelen);
2312 sti->sti_faddr_valid = 1;
2313
2314 if (so->so_family == AF_UNIX) {
2315 if (sti->sti_faddr_noxlate) {
2316 /*
2317 * sti_faddr is a transport-level address, so
2318 * don't pass it as an option. Do save it in
2319 * sti_ux_faddr, used for connected DG send.
2320 */
2321 src = NULL;
2322 srclen = 0;
2323 addr = sti->sti_faddr_sa;
2324 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2325 bcopy(addr, &sti->sti_ux_faddr,
2326 sizeof (sti->sti_ux_faddr));
2327 } else {
2328 /*
2329 * Pass the sockaddr_un source address as an option
2330 * and translate the remote address.
2331 * Holding so_lock thus sti_laddr_sa can not change.
2332 */
2333 src = sti->sti_laddr_sa;
2334 srclen = (t_uscalar_t)sti->sti_laddr_len;
2335 dprintso(so, 1,
2336 ("sotpi_connect UNIX: srclen %d, src %p\n",
2337 srclen, src));
2338 /*
2339 * Translate the destination address into our
2340 * internal form, and save it in sti_ux_faddr.
2341 * After this call, addr==&sti->sti_ux_taddr,
2342 * and we copy that to sti->sti_ux_faddr so
2343 * we save the connected peer address.
2344 */
2345 error = so_ux_addr_xlate(so,
2346 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2347 (flags & _SOCONNECT_XPG4_2),
2348 &addr, &addrlen);
2349 if (error)
2350 goto bad;
2351 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2352 sizeof (sti->sti_ux_faddr));
2353 }
2354 } else {
2355 addr = sti->sti_faddr_sa;
2356 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2357 src = NULL;
2358 srclen = 0;
2359 }
2360 /*
2361 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2362 * option which asks the transport provider to send T_UDERR_IND
2363 * messages. These T_UDERR_IND messages are used to return connected
2364 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2365 *
2366 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2367 * we send down a T_CONN_REQ. This is needed to let the
2368 * transport assign a local address that is consistent with
2369 * the remote address. Applications depend on a getsockname()
2370 * after a connect() to retrieve the "source" IP address for
2371 * the connected socket. Invalidate the cached local address
2372 * to force getsockname() to enquire of the transport.
2373 */
2374 if (!(so->so_mode & SM_CONNREQUIRED)) {
2375 /*
2376 * Datagram socket.
2377 */
2378 int32_t val;
2379
2380 so_unlock_single(so, SOLOCKED);
2381 mutex_exit(&so->so_lock);
2382
2383 val = 1;
2384 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2385 &val, (t_uscalar_t)sizeof (val), cr);
2386
2387 mutex_enter(&so->so_lock);
2388 so_lock_single(so); /* Set SOLOCKED */
2389 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2390 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2391 soconnect_tpi_udp) {
2392 soisconnected(so);
2393 goto done;
2394 }
2395 /*
2396 * Send down T_CONN_REQ etc.
2397 * Clear fflag to avoid returning EWOULDBLOCK.
2398 */
2399 fflag = 0;
2400 ASSERT(so->so_family != AF_UNIX);
2401 sti->sti_laddr_valid = 0;
2402 } else if (sti->sti_laddr_len != 0) {
2403 /*
2404 * If the local address or port was "any" then it may be
2405 * changed by the transport as a result of the
2406 * connect. Invalidate the cached version if we have one.
2407 */
2408 switch (so->so_family) {
2409 case AF_INET:
2410 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2411 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2412 INADDR_ANY ||
2413 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2414 sti->sti_laddr_valid = 0;
2415 break;
2416
2417 case AF_INET6:
2418 ASSERT(sti->sti_laddr_len ==
2419 (socklen_t)sizeof (sin6_t));
2420 if (IN6_IS_ADDR_UNSPECIFIED(
2421 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2422 IN6_IS_ADDR_V4MAPPED_ANY(
2423 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2424 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2425 sti->sti_laddr_valid = 0;
2426 break;
2427
2428 default:
2429 break;
2430 }
2431 }
2432
2433 /*
2434 * Check for failure of an earlier call
2435 */
2436 if (so->so_error != 0)
2437 goto so_bad;
2438
2439 /*
2440 * Send down T_CONN_REQ. Message was allocated above.
2441 */
2442 conn_req.PRIM_type = T_CONN_REQ;
2443 conn_req.DEST_length = addrlen;
2444 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2445 if (srclen == 0) {
2446 conn_req.OPT_length = 0;
2447 conn_req.OPT_offset = 0;
2448 soappendmsg(mp, &conn_req, sizeof (conn_req));
2449 soappendmsg(mp, addr, addrlen);
2450 } else {
2451 /*
2452 * There is a AF_UNIX sockaddr_un to include as a source
2453 * address option.
2454 */
2455 struct T_opthdr toh;
2456
2457 toh.level = SOL_SOCKET;
2458 toh.name = SO_SRCADDR;
2459 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2460 toh.status = 0;
2461 conn_req.OPT_length =
2462 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2463 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2464 _TPI_ALIGN_TOPT(addrlen));
2465
2466 soappendmsg(mp, &conn_req, sizeof (conn_req));
2467 soappendmsg(mp, addr, addrlen);
2468 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2469 soappendmsg(mp, &toh, sizeof (toh));
2470 soappendmsg(mp, src, srclen);
2471 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2472 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2473 }
2474 /*
2475 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2476 * in order to have the right state when the T_CONN_CON shows up.
2477 */
2478 soisconnecting(so);
2479 mutex_exit(&so->so_lock);
2480
2481 if (AU_AUDITING())
2482 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2483
2484 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2485 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2486 mp = NULL;
2487 mutex_enter(&so->so_lock);
2488 if (error != 0)
2489 goto bad;
2490
2491 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2492 goto bad;
2493
2494 /* Allow other threads to access the socket */
2495 so_unlock_single(so, SOLOCKED);
2496 need_unlock = B_FALSE;
2497
2498 /*
2499 * Wait until we get a T_CONN_CON or an error
2500 */
2501 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2502 so_lock_single(so); /* Set SOLOCKED */
2503 need_unlock = B_TRUE;
2504 }
2505
2506 done:
2507 freemsg(mp);
2508 switch (error) {
2509 case EINPROGRESS:
2510 case EALREADY:
2511 case EISCONN:
2512 case EINTR:
2513 /* Non-fatal errors */
2514 sti->sti_laddr_valid = 0;
2515 /* FALLTHRU */
2516 case 0:
2517 break;
2518 default:
2519 ASSERT(need_unlock);
2520 /*
2521 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2522 * and invalidate local-address cache
2523 */
2524 so->so_state &= ~SS_ISCONNECTING;
2525 sti->sti_laddr_valid = 0;
2526 /* A discon_ind might have already unbound us */
2527 if ((flags & _SOCONNECT_DID_BIND) &&
2528 (so->so_state & SS_ISBOUND)) {
2529 int err;
2530
2531 err = sotpi_unbind(so, 0);
2532 /* LINTED - statement has no conseq */
2533 if (err) {
2534 eprintsoline(so, err);
2535 }
2536 }
2537 break;
2538 }
2539 if (need_unlock)
2540 so_unlock_single(so, SOLOCKED);
2541 mutex_exit(&so->so_lock);
2542 return (error);
2543
2544 so_bad: error = sogeterr(so, B_TRUE);
2545 bad: eprintsoline(so, error);
2546 goto done;
2547 }
2548
2549 /* ARGSUSED */
2550 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2551 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2552 {
2553 struct T_ordrel_req ordrel_req;
2554 mblk_t *mp;
2555 uint_t old_state, state_change;
2556 int error = 0;
2557 sotpi_info_t *sti = SOTOTPI(so);
2558
2559 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2560 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2561
2562 mutex_enter(&so->so_lock);
2563 so_lock_single(so); /* Set SOLOCKED */
2564
2565 /*
2566 * SunOS 4.X has no check for datagram sockets.
2567 * 5.X checks that it is connected (ENOTCONN)
2568 * X/Open requires that we check the connected state.
2569 */
2570 if (!(so->so_state & SS_ISCONNECTED)) {
2571 if (!xnet_skip_checks) {
2572 error = ENOTCONN;
2573 if (xnet_check_print) {
2574 printf("sockfs: X/Open shutdown check "
2575 "caused ENOTCONN\n");
2576 }
2577 }
2578 goto done;
2579 }
2580 /*
2581 * Record the current state and then perform any state changes.
2582 * Then use the difference between the old and new states to
2583 * determine which messages need to be sent.
2584 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2585 * duplicate calls to shutdown().
2586 */
2587 old_state = so->so_state;
2588
2589 switch (how) {
2590 case 0:
2591 socantrcvmore(so);
2592 break;
2593 case 1:
2594 socantsendmore(so);
2595 break;
2596 case 2:
2597 socantsendmore(so);
2598 socantrcvmore(so);
2599 break;
2600 default:
2601 error = EINVAL;
2602 goto done;
2603 }
2604
2605 /*
2606 * Assumes that the SS_CANT* flags are never cleared in the above code.
2607 */
2608 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2609 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2610 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2611
2612 switch (state_change) {
2613 case 0:
2614 dprintso(so, 1,
2615 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2616 so->so_state));
2617 goto done;
2618
2619 case SS_CANTRCVMORE:
2620 mutex_exit(&so->so_lock);
2621 strseteof(SOTOV(so), 1);
2622 /*
2623 * strseteof takes care of read side wakeups,
2624 * pollwakeups, and signals.
2625 */
2626 /*
2627 * Get the read lock before flushing data to avoid problems
2628 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2629 */
2630 mutex_enter(&so->so_lock);
2631 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2632 mutex_exit(&so->so_lock);
2633
2634 /* Flush read side queue */
2635 strflushrq(SOTOV(so), FLUSHALL);
2636
2637 mutex_enter(&so->so_lock);
2638 so_unlock_read(so); /* Clear SOREADLOCKED */
2639 break;
2640
2641 case SS_CANTSENDMORE:
2642 mutex_exit(&so->so_lock);
2643 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2644 mutex_enter(&so->so_lock);
2645 break;
2646
2647 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2648 mutex_exit(&so->so_lock);
2649 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2650 strseteof(SOTOV(so), 1);
2651 /*
2652 * strseteof takes care of read side wakeups,
2653 * pollwakeups, and signals.
2654 */
2655 /*
2656 * Get the read lock before flushing data to avoid problems
2657 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2658 */
2659 mutex_enter(&so->so_lock);
2660 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2661 mutex_exit(&so->so_lock);
2662
2663 /* Flush read side queue */
2664 strflushrq(SOTOV(so), FLUSHALL);
2665
2666 mutex_enter(&so->so_lock);
2667 so_unlock_read(so); /* Clear SOREADLOCKED */
2668 break;
2669 }
2670
2671 ASSERT(MUTEX_HELD(&so->so_lock));
2672
2673 /*
2674 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2675 * was set due to this call and the new state has both of them set:
2676 * Send the AF_UNIX close indication
2677 * For T_COTS send a discon_ind
2678 *
2679 * If cantsend was set due to this call:
2680 * For T_COTSORD send an ordrel_ind
2681 *
2682 * Note that for T_CLTS there is no message sent here.
2683 */
2684 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2685 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2686 /*
2687 * For SunOS 4.X compatibility we tell the other end
2688 * that we are unable to receive at this point.
2689 */
2690 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2691 so_unix_close(so);
2692
2693 if (sti->sti_serv_type == T_COTS)
2694 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2695 }
2696 if ((state_change & SS_CANTSENDMORE) &&
2697 (sti->sti_serv_type == T_COTS_ORD)) {
2698 /* Send an orderly release */
2699 ordrel_req.PRIM_type = T_ORDREL_REQ;
2700
2701 mutex_exit(&so->so_lock);
2702 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2703 0, _ALLOC_SLEEP, cr);
2704 /*
2705 * Send down the T_ORDREL_REQ even if there is flow control.
2706 * This prevents shutdown from blocking.
2707 * Note that there is no T_OK_ACK for ordrel_req.
2708 */
2709 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2710 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2711 mutex_enter(&so->so_lock);
2712 if (error) {
2713 eprintsoline(so, error);
2714 goto done;
2715 }
2716 }
2717
2718 done:
2719 so_unlock_single(so, SOLOCKED);
2720 mutex_exit(&so->so_lock);
2721 return (error);
2722 }
2723
2724 /*
2725 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2726 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2727 * that we have closed.
2728 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2729 * T_UNITDATA_REQ containing the same option.
2730 *
2731 * For SOCK_DGRAM half-connections (somebody connected to this end
2732 * but this end is not connect) we don't know where to send any
2733 * SO_UNIX_CLOSE.
2734 *
2735 * We have to ignore stream head errors just in case there has been
2736 * a shutdown(output).
2737 * Ignore any flow control to try to get the message more quickly to the peer.
2738 * While locally ignoring flow control solves the problem when there
2739 * is only the loopback transport on the stream it would not provide
2740 * the correct AF_UNIX socket semantics when one or more modules have
2741 * been pushed.
2742 */
2743 void
so_unix_close(struct sonode * so)2744 so_unix_close(struct sonode *so)
2745 {
2746 struct T_opthdr toh;
2747 mblk_t *mp;
2748 sotpi_info_t *sti = SOTOTPI(so);
2749
2750 ASSERT(MUTEX_HELD(&so->so_lock));
2751
2752 ASSERT(so->so_family == AF_UNIX);
2753
2754 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2755 (SS_ISCONNECTED|SS_ISBOUND))
2756 return;
2757
2758 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2759 (void *)so, pr_state(so->so_state, so->so_mode)));
2760
2761 toh.level = SOL_SOCKET;
2762 toh.name = SO_UNIX_CLOSE;
2763
2764 /* zero length + header */
2765 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2766 toh.status = 0;
2767
2768 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2769 struct T_optdata_req tdr;
2770
2771 tdr.PRIM_type = T_OPTDATA_REQ;
2772 tdr.DATA_flag = 0;
2773
2774 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2775 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2776
2777 /* NOTE: holding so_lock while sleeping */
2778 mp = soallocproto2(&tdr, sizeof (tdr),
2779 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2780 } else {
2781 struct T_unitdata_req tudr;
2782 void *addr;
2783 socklen_t addrlen;
2784 void *src;
2785 socklen_t srclen;
2786 struct T_opthdr toh2;
2787 t_scalar_t size;
2788
2789 /*
2790 * We know this is an AF_UNIX connected DGRAM socket.
2791 * We therefore already have the destination address
2792 * in the internal form needed for this send. This is
2793 * similar to the sosend_dgram call later in this file
2794 * when there's no user-specified destination address.
2795 */
2796 if (sti->sti_faddr_noxlate) {
2797 /*
2798 * Already have a transport internal address. Do not
2799 * pass any (transport internal) source address.
2800 */
2801 addr = sti->sti_faddr_sa;
2802 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2803 src = NULL;
2804 srclen = 0;
2805 } else {
2806 /*
2807 * Pass the sockaddr_un source address as an option
2808 * and translate the remote address.
2809 * Holding so_lock thus sti_laddr_sa can not change.
2810 */
2811 src = sti->sti_laddr_sa;
2812 srclen = (socklen_t)sti->sti_laddr_len;
2813 dprintso(so, 1,
2814 ("so_ux_close: srclen %d, src %p\n",
2815 srclen, src));
2816 /*
2817 * Use the destination address saved in connect.
2818 */
2819 addr = &sti->sti_ux_faddr;
2820 addrlen = sizeof (sti->sti_ux_faddr);
2821 }
2822 tudr.PRIM_type = T_UNITDATA_REQ;
2823 tudr.DEST_length = addrlen;
2824 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2825 if (srclen == 0) {
2826 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2827 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2828 _TPI_ALIGN_TOPT(addrlen));
2829
2830 size = tudr.OPT_offset + tudr.OPT_length;
2831 /* NOTE: holding so_lock while sleeping */
2832 mp = soallocproto2(&tudr, sizeof (tudr),
2833 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2834 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2835 soappendmsg(mp, &toh, sizeof (toh));
2836 } else {
2837 /*
2838 * There is a AF_UNIX sockaddr_un to include as a
2839 * source address option.
2840 */
2841 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2842 _TPI_ALIGN_TOPT(srclen));
2843 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2844 _TPI_ALIGN_TOPT(addrlen));
2845
2846 toh2.level = SOL_SOCKET;
2847 toh2.name = SO_SRCADDR;
2848 toh2.len = (t_uscalar_t)(srclen +
2849 sizeof (struct T_opthdr));
2850 toh2.status = 0;
2851
2852 size = tudr.OPT_offset + tudr.OPT_length;
2853
2854 /* NOTE: holding so_lock while sleeping */
2855 mp = soallocproto2(&tudr, sizeof (tudr),
2856 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2857 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2858 soappendmsg(mp, &toh, sizeof (toh));
2859 soappendmsg(mp, &toh2, sizeof (toh2));
2860 soappendmsg(mp, src, srclen);
2861 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2862 }
2863 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2864 }
2865 mutex_exit(&so->so_lock);
2866 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2867 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2868 mutex_enter(&so->so_lock);
2869 }
2870
2871 /*
2872 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2873 * In addition, the caller typically verifies that there is some
2874 * potential state to clear by checking
2875 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2876 * before calling this routine.
2877 * Note that such a check can be made without holding so_lock since
2878 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2879 * decrements sti_oobsigcnt.
2880 *
2881 * When data is read *after* the point that all pending
2882 * oob data has been consumed the oob indication is cleared.
2883 *
2884 * This logic keeps select/poll returning POLLRDBAND and
2885 * SIOCATMARK returning true until we have read past
2886 * the mark.
2887 */
2888 static void
sorecv_update_oobstate(struct sonode * so)2889 sorecv_update_oobstate(struct sonode *so)
2890 {
2891 sotpi_info_t *sti = SOTOTPI(so);
2892
2893 mutex_enter(&so->so_lock);
2894 ASSERT(so_verify_oobstate(so));
2895 dprintso(so, 1,
2896 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2897 sti->sti_oobsigcnt,
2898 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2899 if (sti->sti_oobsigcnt == 0) {
2900 /* No more pending oob indications */
2901 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2902 freemsg(so->so_oobmsg);
2903 so->so_oobmsg = NULL;
2904 }
2905 ASSERT(so_verify_oobstate(so));
2906 mutex_exit(&so->so_lock);
2907 }
2908
2909 /*
2910 * Receive the next message on the queue.
2911 * If msg_controllen is non-zero when called the caller is interested in
2912 * any received control info (options).
2913 * If msg_namelen is non-zero when called the caller is interested in
2914 * any received source address.
2915 * The routine returns with msg_control and msg_name pointing to
2916 * kmem_alloc'ed memory which the caller has to free.
2917 */
2918 /* ARGSUSED */
2919 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)2920 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2921 struct cred *cr)
2922 {
2923 union T_primitives *tpr;
2924 mblk_t *mp;
2925 uchar_t pri;
2926 int pflag, opflag;
2927 void *control;
2928 t_uscalar_t controllen;
2929 t_uscalar_t namelen;
2930 int so_state = so->so_state; /* Snapshot */
2931 ssize_t saved_resid;
2932 rval_t rval;
2933 int flags;
2934 clock_t timout;
2935 int error = 0;
2936 sotpi_info_t *sti = SOTOTPI(so);
2937
2938 flags = msg->msg_flags;
2939 msg->msg_flags = 0;
2940
2941 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2942 (void *)so, (void *)msg, flags,
2943 pr_state(so->so_state, so->so_mode), so->so_error));
2944
2945 if (so->so_version == SOV_STREAM) {
2946 so_update_attrs(so, SOACC);
2947 /* The imaginary "sockmod" has been popped - act as a stream */
2948 return (strread(SOTOV(so), uiop, cr));
2949 }
2950
2951 /*
2952 * If we are not connected because we have never been connected
2953 * we return ENOTCONN. If we have been connected (but are no longer
2954 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2955 * the EOF.
2956 *
2957 * An alternative would be to post an ENOTCONN error in stream head
2958 * (read+write) and clear it when we're connected. However, that error
2959 * would cause incorrect poll/select behavior!
2960 */
2961 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2962 (so->so_mode & SM_CONNREQUIRED)) {
2963 return (ENOTCONN);
2964 }
2965
2966 /*
2967 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2968 * after checking that the read queue is empty) and returns zero.
2969 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2970 * is zero.
2971 */
2972
2973 if (flags & MSG_OOB) {
2974 /* Check that the transport supports OOB */
2975 if (!(so->so_mode & SM_EXDATA))
2976 return (EOPNOTSUPP);
2977 so_update_attrs(so, SOACC);
2978 return (sorecvoob(so, msg, uiop, flags,
2979 (so->so_options & SO_OOBINLINE)));
2980 }
2981
2982 so_update_attrs(so, SOACC);
2983
2984 /*
2985 * Set msg_controllen and msg_namelen to zero here to make it
2986 * simpler in the cases that no control or name is returned.
2987 */
2988 controllen = msg->msg_controllen;
2989 namelen = msg->msg_namelen;
2990 msg->msg_controllen = 0;
2991 msg->msg_namelen = 0;
2992
2993 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2994 namelen, controllen));
2995
2996 mutex_enter(&so->so_lock);
2997 /*
2998 * Only one reader is allowed at any given time. This is needed
2999 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3000 *
3001 * This is slightly different that BSD behavior in that it fails with
3002 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3003 * is single-threaded using sblock(), which is dropped while waiting
3004 * for data to appear. The difference shows up e.g. if one
3005 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3006 * does use nonblocking io and different threads are reading each
3007 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3008 * in this case as long as the read queue doesn't get empty.
3009 * In this implementation the thread using nonblocking io can
3010 * get an EWOULDBLOCK error due to the blocking thread executing
3011 * e.g. in the uiomove in kstrgetmsg.
3012 * This difference is not believed to be significant.
3013 */
3014 /* Set SOREADLOCKED */
3015 error = so_lock_read_intr(so,
3016 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3017 mutex_exit(&so->so_lock);
3018 if (error)
3019 return (error);
3020
3021 /*
3022 * Tell kstrgetmsg to not inspect the stream head errors until all
3023 * queued data has been consumed.
3024 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3025 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3026 *
3027 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3028 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3029 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3030 */
3031 pflag = MSG_ANY | MSG_DELAYERROR;
3032 if (flags & MSG_PEEK) {
3033 pflag |= MSG_IPEEK;
3034 flags &= ~MSG_WAITALL;
3035 }
3036 if (so->so_mode & SM_ATOMIC)
3037 pflag |= MSG_DISCARDTAIL;
3038
3039 if (flags & MSG_DONTWAIT)
3040 timout = 0;
3041 else if (so->so_rcvtimeo != 0)
3042 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3043 else
3044 timout = -1;
3045 opflag = pflag;
3046 retry:
3047 saved_resid = uiop->uio_resid;
3048 pri = 0;
3049 mp = NULL;
3050 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3051 timout, &rval);
3052 if (error != 0) {
3053 /* kstrgetmsg returns ETIME when timeout expires */
3054 if (error == ETIME)
3055 error = EWOULDBLOCK;
3056 goto out;
3057 }
3058 /*
3059 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3060 * For non-datagrams MOREDATA is used to set MSG_EOR.
3061 */
3062 ASSERT(!(rval.r_val1 & MORECTL));
3063 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3064 msg->msg_flags |= MSG_TRUNC;
3065
3066 if (mp == NULL) {
3067 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3068 /*
3069 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3070 * The draft Posix socket spec states that the mark should
3071 * not be cleared when peeking. We follow the latter.
3072 */
3073 if ((so->so_state &
3074 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3075 (uiop->uio_resid != saved_resid) &&
3076 !(flags & MSG_PEEK)) {
3077 sorecv_update_oobstate(so);
3078 }
3079
3080 mutex_enter(&so->so_lock);
3081 /* Set MSG_EOR based on MOREDATA */
3082 if (!(rval.r_val1 & MOREDATA)) {
3083 if (so->so_state & SS_SAVEDEOR) {
3084 msg->msg_flags |= MSG_EOR;
3085 so->so_state &= ~SS_SAVEDEOR;
3086 }
3087 }
3088 /*
3089 * If some data was received (i.e. not EOF) and the
3090 * read/recv* has not been satisfied wait for some more.
3091 */
3092 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3093 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3094 mutex_exit(&so->so_lock);
3095 pflag = opflag | MSG_NOMARK;
3096 goto retry;
3097 }
3098 goto out_locked;
3099 }
3100
3101 /* strsock_proto has already verified length and alignment */
3102 tpr = (union T_primitives *)mp->b_rptr;
3103 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3104
3105 switch (tpr->type) {
3106 case T_DATA_IND: {
3107 if ((so->so_state &
3108 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3109 (uiop->uio_resid != saved_resid) &&
3110 !(flags & MSG_PEEK)) {
3111 sorecv_update_oobstate(so);
3112 }
3113
3114 /*
3115 * Set msg_flags to MSG_EOR based on
3116 * MORE_flag and MOREDATA.
3117 */
3118 mutex_enter(&so->so_lock);
3119 so->so_state &= ~SS_SAVEDEOR;
3120 if (!(tpr->data_ind.MORE_flag & 1)) {
3121 if (!(rval.r_val1 & MOREDATA))
3122 msg->msg_flags |= MSG_EOR;
3123 else
3124 so->so_state |= SS_SAVEDEOR;
3125 }
3126 freemsg(mp);
3127 /*
3128 * If some data was received (i.e. not EOF) and the
3129 * read/recv* has not been satisfied wait for some more.
3130 */
3131 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3132 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3133 mutex_exit(&so->so_lock);
3134 pflag = opflag | MSG_NOMARK;
3135 goto retry;
3136 }
3137 goto out_locked;
3138 }
3139 case T_UNITDATA_IND: {
3140 void *addr;
3141 t_uscalar_t addrlen;
3142 void *abuf;
3143 t_uscalar_t optlen;
3144 void *opt;
3145
3146 if ((so->so_state &
3147 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3148 (uiop->uio_resid != saved_resid) &&
3149 !(flags & MSG_PEEK)) {
3150 sorecv_update_oobstate(so);
3151 }
3152
3153 if (namelen != 0) {
3154 /* Caller wants source address */
3155 addrlen = tpr->unitdata_ind.SRC_length;
3156 addr = sogetoff(mp,
3157 tpr->unitdata_ind.SRC_offset,
3158 addrlen, 1);
3159 if (addr == NULL) {
3160 freemsg(mp);
3161 error = EPROTO;
3162 eprintsoline(so, error);
3163 goto out;
3164 }
3165 if (so->so_family == AF_UNIX) {
3166 /*
3167 * Can not use the transport level address.
3168 * If there is a SO_SRCADDR option carrying
3169 * the socket level address it will be
3170 * extracted below.
3171 */
3172 addr = NULL;
3173 addrlen = 0;
3174 }
3175 }
3176 optlen = tpr->unitdata_ind.OPT_length;
3177 if (optlen != 0) {
3178 t_uscalar_t ncontrollen;
3179
3180 /*
3181 * Extract any source address option.
3182 * Determine how large cmsg buffer is needed.
3183 */
3184 opt = sogetoff(mp,
3185 tpr->unitdata_ind.OPT_offset,
3186 optlen, __TPI_ALIGN_SIZE);
3187
3188 if (opt == NULL) {
3189 freemsg(mp);
3190 error = EPROTO;
3191 eprintsoline(so, error);
3192 goto out;
3193 }
3194 if (so->so_family == AF_UNIX)
3195 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3196 ncontrollen = so_cmsglen(mp, opt, optlen,
3197 !(flags & MSG_XPG4_2));
3198 if (controllen != 0)
3199 controllen = ncontrollen;
3200 else if (ncontrollen != 0)
3201 msg->msg_flags |= MSG_CTRUNC;
3202 } else {
3203 controllen = 0;
3204 }
3205
3206 if (namelen != 0) {
3207 /*
3208 * Return address to caller.
3209 * Caller handles truncation if length
3210 * exceeds msg_namelen.
3211 * NOTE: AF_UNIX NUL termination is ensured by
3212 * the sender's copyin_name().
3213 */
3214 abuf = kmem_alloc(addrlen, KM_SLEEP);
3215
3216 bcopy(addr, abuf, addrlen);
3217 msg->msg_name = abuf;
3218 msg->msg_namelen = addrlen;
3219 }
3220
3221 if (controllen != 0) {
3222 /*
3223 * Return control msg to caller.
3224 * Caller handles truncation if length
3225 * exceeds msg_controllen.
3226 */
3227 control = kmem_zalloc(controllen, KM_SLEEP);
3228
3229 error = so_opt2cmsg(mp, opt, optlen,
3230 !(flags & MSG_XPG4_2),
3231 control, controllen);
3232 if (error) {
3233 freemsg(mp);
3234 if (msg->msg_namelen != 0)
3235 kmem_free(msg->msg_name,
3236 msg->msg_namelen);
3237 kmem_free(control, controllen);
3238 eprintsoline(so, error);
3239 goto out;
3240 }
3241 msg->msg_control = control;
3242 msg->msg_controllen = controllen;
3243 }
3244
3245 freemsg(mp);
3246 goto out;
3247 }
3248 case T_OPTDATA_IND: {
3249 struct T_optdata_req *tdr;
3250 void *opt;
3251 t_uscalar_t optlen;
3252
3253 if ((so->so_state &
3254 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3255 (uiop->uio_resid != saved_resid) &&
3256 !(flags & MSG_PEEK)) {
3257 sorecv_update_oobstate(so);
3258 }
3259
3260 tdr = (struct T_optdata_req *)mp->b_rptr;
3261 optlen = tdr->OPT_length;
3262 if (optlen != 0) {
3263 t_uscalar_t ncontrollen;
3264 /*
3265 * Determine how large cmsg buffer is needed.
3266 */
3267 opt = sogetoff(mp,
3268 tpr->optdata_ind.OPT_offset,
3269 optlen, __TPI_ALIGN_SIZE);
3270
3271 if (opt == NULL) {
3272 freemsg(mp);
3273 error = EPROTO;
3274 eprintsoline(so, error);
3275 goto out;
3276 }
3277
3278 ncontrollen = so_cmsglen(mp, opt, optlen,
3279 !(flags & MSG_XPG4_2));
3280 if (controllen != 0)
3281 controllen = ncontrollen;
3282 else if (ncontrollen != 0)
3283 msg->msg_flags |= MSG_CTRUNC;
3284 } else {
3285 controllen = 0;
3286 }
3287
3288 if (controllen != 0) {
3289 /*
3290 * Return control msg to caller.
3291 * Caller handles truncation if length
3292 * exceeds msg_controllen.
3293 */
3294 control = kmem_zalloc(controllen, KM_SLEEP);
3295
3296 error = so_opt2cmsg(mp, opt, optlen,
3297 !(flags & MSG_XPG4_2),
3298 control, controllen);
3299 if (error) {
3300 freemsg(mp);
3301 kmem_free(control, controllen);
3302 eprintsoline(so, error);
3303 goto out;
3304 }
3305 msg->msg_control = control;
3306 msg->msg_controllen = controllen;
3307 }
3308
3309 /*
3310 * Set msg_flags to MSG_EOR based on
3311 * DATA_flag and MOREDATA.
3312 */
3313 mutex_enter(&so->so_lock);
3314 so->so_state &= ~SS_SAVEDEOR;
3315 if (!(tpr->data_ind.MORE_flag & 1)) {
3316 if (!(rval.r_val1 & MOREDATA))
3317 msg->msg_flags |= MSG_EOR;
3318 else
3319 so->so_state |= SS_SAVEDEOR;
3320 }
3321 freemsg(mp);
3322 /*
3323 * If some data was received (i.e. not EOF) and the
3324 * read/recv* has not been satisfied wait for some more.
3325 * Not possible to wait if control info was received.
3326 */
3327 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3328 controllen == 0 &&
3329 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3330 mutex_exit(&so->so_lock);
3331 pflag = opflag | MSG_NOMARK;
3332 goto retry;
3333 }
3334 goto out_locked;
3335 }
3336 case T_EXDATA_IND: {
3337 dprintso(so, 1,
3338 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3339 "state %s\n",
3340 sti->sti_oobsigcnt, sti->sti_oobcnt,
3341 saved_resid - uiop->uio_resid,
3342 pr_state(so->so_state, so->so_mode)));
3343 /*
3344 * kstrgetmsg handles MSGMARK so there is nothing to
3345 * inspect in the T_EXDATA_IND.
3346 * strsock_proto makes the stream head queue the T_EXDATA_IND
3347 * as a separate message with no M_DATA component. Furthermore,
3348 * the stream head does not consolidate M_DATA messages onto
3349 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3350 * remains a message by itself. This is needed since MSGMARK
3351 * marks both the whole message as well as the last byte
3352 * of the message.
3353 */
3354 freemsg(mp);
3355 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3356 if (flags & MSG_PEEK) {
3357 /*
3358 * Even though we are peeking we consume the
3359 * T_EXDATA_IND thereby moving the mark information
3360 * to SS_RCVATMARK. Then the oob code below will
3361 * retry the peeking kstrgetmsg.
3362 * Note that the stream head read queue is
3363 * never flushed without holding SOREADLOCKED
3364 * thus the T_EXDATA_IND can not disappear
3365 * underneath us.
3366 */
3367 dprintso(so, 1,
3368 ("sotpi_recvmsg: consume EXDATA_IND "
3369 "counts %d/%d state %s\n",
3370 sti->sti_oobsigcnt,
3371 sti->sti_oobcnt,
3372 pr_state(so->so_state, so->so_mode)));
3373
3374 pflag = MSG_ANY | MSG_DELAYERROR;
3375 if (so->so_mode & SM_ATOMIC)
3376 pflag |= MSG_DISCARDTAIL;
3377
3378 pri = 0;
3379 mp = NULL;
3380
3381 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3382 &pri, &pflag, (clock_t)-1, &rval);
3383 ASSERT(uiop->uio_resid == saved_resid);
3384
3385 if (error) {
3386 #ifdef SOCK_DEBUG
3387 if (error != EWOULDBLOCK && error != EINTR) {
3388 eprintsoline(so, error);
3389 }
3390 #endif /* SOCK_DEBUG */
3391 goto out;
3392 }
3393 ASSERT(mp);
3394 tpr = (union T_primitives *)mp->b_rptr;
3395 ASSERT(tpr->type == T_EXDATA_IND);
3396 freemsg(mp);
3397 } /* end "if (flags & MSG_PEEK)" */
3398
3399 /*
3400 * Decrement the number of queued and pending oob.
3401 *
3402 * SS_RCVATMARK is cleared when we read past a mark.
3403 * SS_HAVEOOBDATA is cleared when we've read past the
3404 * last mark.
3405 * SS_OOBPEND is cleared if we've read past the last
3406 * mark and no (new) SIGURG has been posted.
3407 */
3408 mutex_enter(&so->so_lock);
3409 ASSERT(so_verify_oobstate(so));
3410 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3411 ASSERT(sti->sti_oobsigcnt > 0);
3412 sti->sti_oobsigcnt--;
3413 ASSERT(sti->sti_oobcnt > 0);
3414 sti->sti_oobcnt--;
3415 /*
3416 * Since the T_EXDATA_IND has been removed from the stream
3417 * head, but we have not read data past the mark,
3418 * sockfs needs to track that the socket is still at the mark.
3419 *
3420 * Since no data was received call kstrgetmsg again to wait
3421 * for data.
3422 */
3423 so->so_state |= SS_RCVATMARK;
3424 mutex_exit(&so->so_lock);
3425 dprintso(so, 1,
3426 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3427 sti->sti_oobsigcnt, sti->sti_oobcnt,
3428 pr_state(so->so_state, so->so_mode)));
3429 pflag = opflag;
3430 goto retry;
3431 }
3432 default:
3433 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3434 (void *)so, tpr->type, (void *)mp);
3435 ASSERT(0);
3436 freemsg(mp);
3437 error = EPROTO;
3438 eprintsoline(so, error);
3439 goto out;
3440 }
3441 /* NOTREACHED */
3442 out:
3443 mutex_enter(&so->so_lock);
3444 out_locked:
3445 so_unlock_read(so); /* Clear SOREADLOCKED */
3446 mutex_exit(&so->so_lock);
3447 return (error);
3448 }
3449
3450 /*
3451 * Sending data with options on a datagram socket.
3452 * Assumes caller has verified that SS_ISBOUND etc. are set.
3453 *
3454 * For AF_UNIX the destination address may be already in
3455 * internal form, as indicated by sti->sti_faddr_noxlate
3456 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3457 * translate the destination address to internal form.
3458 *
3459 * The source address is passed as an option. If passing
3460 * file descriptors, those are passed as file pointers in
3461 * another option.
3462 */
3463 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3464 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3465 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3466 {
3467 struct T_unitdata_req tudr;
3468 mblk_t *mp;
3469 int error;
3470 void *addr;
3471 socklen_t addrlen;
3472 void *src;
3473 socklen_t srclen;
3474 ssize_t len;
3475 int size;
3476 struct T_opthdr toh;
3477 struct fdbuf *fdbuf;
3478 t_uscalar_t optlen;
3479 void *fds;
3480 int fdlen;
3481 sotpi_info_t *sti = SOTOTPI(so);
3482
3483 ASSERT(name && namelen);
3484 ASSERT(control && controllen);
3485
3486 len = uiop->uio_resid;
3487 if (len > (ssize_t)sti->sti_tidu_size) {
3488 return (EMSGSIZE);
3489 }
3490
3491 if (sti->sti_faddr_noxlate == 0 &&
3492 (flags & MSG_SENDTO_NOXLATE) == 0) {
3493 /*
3494 * Length and family checks.
3495 * Don't verify internal form.
3496 */
3497 error = so_addr_verify(so, name, namelen);
3498 if (error) {
3499 eprintsoline(so, error);
3500 return (error);
3501 }
3502 }
3503
3504 if (so->so_family == AF_UNIX) {
3505 if (sti->sti_faddr_noxlate) {
3506 /*
3507 * Already have a transport internal address. Do not
3508 * pass any (transport internal) source address.
3509 */
3510 addr = name;
3511 addrlen = namelen;
3512 src = NULL;
3513 srclen = 0;
3514 } else if (flags & MSG_SENDTO_NOXLATE) {
3515 /*
3516 * Have an internal form dest. address.
3517 * Pass the source address as usual.
3518 */
3519 addr = name;
3520 addrlen = namelen;
3521 src = sti->sti_laddr_sa;
3522 srclen = (socklen_t)sti->sti_laddr_len;
3523 } else {
3524 /*
3525 * Pass the sockaddr_un source address as an option
3526 * and translate the remote address.
3527 *
3528 * Note that this code does not prevent sti_laddr_sa
3529 * from changing while it is being used. Thus
3530 * if an unbind+bind occurs concurrently with this
3531 * send the peer might see a partially new and a
3532 * partially old "from" address.
3533 */
3534 src = sti->sti_laddr_sa;
3535 srclen = (socklen_t)sti->sti_laddr_len;
3536 dprintso(so, 1,
3537 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3538 srclen, src));
3539 /*
3540 * The sendmsg caller specified a destination
3541 * address, which we must translate into our
3542 * internal form. addr = &sti->sti_ux_taddr
3543 */
3544 error = so_ux_addr_xlate(so, name, namelen,
3545 (flags & MSG_XPG4_2),
3546 &addr, &addrlen);
3547 if (error) {
3548 eprintsoline(so, error);
3549 return (error);
3550 }
3551 }
3552 } else {
3553 addr = name;
3554 addrlen = namelen;
3555 src = NULL;
3556 srclen = 0;
3557 }
3558 optlen = so_optlen(control, controllen,
3559 !(flags & MSG_XPG4_2));
3560 tudr.PRIM_type = T_UNITDATA_REQ;
3561 tudr.DEST_length = addrlen;
3562 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3563 if (srclen != 0)
3564 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3565 _TPI_ALIGN_TOPT(srclen));
3566 else
3567 tudr.OPT_length = optlen;
3568 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3569 _TPI_ALIGN_TOPT(addrlen));
3570
3571 size = tudr.OPT_offset + tudr.OPT_length;
3572
3573 /*
3574 * File descriptors only when SM_FDPASSING set.
3575 */
3576 error = so_getfdopt(control, controllen,
3577 !(flags & MSG_XPG4_2), &fds, &fdlen);
3578 if (error)
3579 return (error);
3580 if (fdlen != -1) {
3581 if (!(so->so_mode & SM_FDPASSING))
3582 return (EOPNOTSUPP);
3583
3584 error = fdbuf_create(fds, fdlen, &fdbuf);
3585 if (error)
3586 return (error);
3587
3588 /*
3589 * Pre-allocate enough additional space for lower level modules
3590 * to append an option (e.g. see tl_unitdata). The following
3591 * is enough extra space for the largest option we might append.
3592 */
3593 size += sizeof (struct T_opthdr) + ucredsize;
3594 mp = fdbuf_allocmsg(size, fdbuf);
3595 } else {
3596 mp = soallocproto(size, _ALLOC_INTR, CRED());
3597 if (mp == NULL) {
3598 /*
3599 * Caught a signal waiting for memory.
3600 * Let send* return EINTR.
3601 */
3602 return (EINTR);
3603 }
3604 }
3605 soappendmsg(mp, &tudr, sizeof (tudr));
3606 soappendmsg(mp, addr, addrlen);
3607 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3608
3609 if (fdlen != -1) {
3610 ASSERT(fdbuf != NULL);
3611 toh.level = SOL_SOCKET;
3612 toh.name = SO_FILEP;
3613 toh.len = fdbuf->fd_size +
3614 (t_uscalar_t)sizeof (struct T_opthdr);
3615 toh.status = 0;
3616 soappendmsg(mp, &toh, sizeof (toh));
3617 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3618 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3619 }
3620 if (srclen != 0) {
3621 /*
3622 * There is a AF_UNIX sockaddr_un to include as a source
3623 * address option.
3624 */
3625 toh.level = SOL_SOCKET;
3626 toh.name = SO_SRCADDR;
3627 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3628 toh.status = 0;
3629 soappendmsg(mp, &toh, sizeof (toh));
3630 soappendmsg(mp, src, srclen);
3631 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3632 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3633 }
3634 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3635 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3636 /*
3637 * Normally at most 3 bytes left in the message, but we might have
3638 * allowed for extra space if we're passing fd's through.
3639 */
3640 ASSERT(MBLKL(mp) <= (ssize_t)size);
3641
3642 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3643 if (AU_AUDITING())
3644 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3645
3646 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3647 #ifdef SOCK_DEBUG
3648 if (error) {
3649 eprintsoline(so, error);
3650 }
3651 #endif /* SOCK_DEBUG */
3652 return (error);
3653 }
3654
3655 /*
3656 * Sending data with options on a connected stream socket.
3657 * Assumes caller has verified that SS_ISCONNECTED is set.
3658 */
3659 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3660 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3661 t_uscalar_t controllen, int flags)
3662 {
3663 struct T_optdata_req tdr;
3664 mblk_t *mp;
3665 int error;
3666 ssize_t iosize;
3667 int size;
3668 struct fdbuf *fdbuf;
3669 t_uscalar_t optlen;
3670 void *fds;
3671 int fdlen;
3672 struct T_opthdr toh;
3673 sotpi_info_t *sti = SOTOTPI(so);
3674
3675 dprintso(so, 1,
3676 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3677
3678 /*
3679 * Has to be bound and connected. However, since no locks are
3680 * held the state could have changed after sotpi_sendmsg checked it
3681 * thus it is not possible to ASSERT on the state.
3682 */
3683
3684 /* Options on connection-oriented only when SM_OPTDATA set. */
3685 if (!(so->so_mode & SM_OPTDATA))
3686 return (EOPNOTSUPP);
3687
3688 do {
3689 /*
3690 * Set the MORE flag if uio_resid does not fit in this
3691 * message or if the caller passed in "more".
3692 * Error for transports with zero tidu_size.
3693 */
3694 tdr.PRIM_type = T_OPTDATA_REQ;
3695 iosize = sti->sti_tidu_size;
3696 if (iosize <= 0)
3697 return (EMSGSIZE);
3698 if (uiop->uio_resid > iosize) {
3699 tdr.DATA_flag = 1;
3700 } else {
3701 if (more)
3702 tdr.DATA_flag = 1;
3703 else
3704 tdr.DATA_flag = 0;
3705 iosize = uiop->uio_resid;
3706 }
3707 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3708 tdr.DATA_flag, iosize));
3709
3710 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3711 tdr.OPT_length = optlen;
3712 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3713
3714 size = (int)sizeof (tdr) + optlen;
3715 /*
3716 * File descriptors only when SM_FDPASSING set.
3717 */
3718 error = so_getfdopt(control, controllen,
3719 !(flags & MSG_XPG4_2), &fds, &fdlen);
3720 if (error)
3721 return (error);
3722 if (fdlen != -1) {
3723 if (!(so->so_mode & SM_FDPASSING))
3724 return (EOPNOTSUPP);
3725
3726 error = fdbuf_create(fds, fdlen, &fdbuf);
3727 if (error)
3728 return (error);
3729
3730 /*
3731 * Pre-allocate enough additional space for lower level
3732 * modules to append an option (e.g. see tl_unitdata).
3733 * The following is enough extra space for the largest
3734 * option we might append.
3735 */
3736 size += sizeof (struct T_opthdr) + ucredsize;
3737 mp = fdbuf_allocmsg(size, fdbuf);
3738 } else {
3739 mp = soallocproto(size, _ALLOC_INTR, CRED());
3740 if (mp == NULL) {
3741 /*
3742 * Caught a signal waiting for memory.
3743 * Let send* return EINTR.
3744 */
3745 return (EINTR);
3746 }
3747 }
3748 soappendmsg(mp, &tdr, sizeof (tdr));
3749
3750 if (fdlen != -1) {
3751 ASSERT(fdbuf != NULL);
3752 toh.level = SOL_SOCKET;
3753 toh.name = SO_FILEP;
3754 toh.len = fdbuf->fd_size +
3755 (t_uscalar_t)sizeof (struct T_opthdr);
3756 toh.status = 0;
3757 soappendmsg(mp, &toh, sizeof (toh));
3758 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3759 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3760 }
3761 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3762 /*
3763 * Normally at most 3 bytes left in the message, but we might
3764 * have allowed for extra space if we're passing fd's through.
3765 */
3766 ASSERT(MBLKL(mp) <= (ssize_t)size);
3767
3768 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3769
3770 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3771 0, MSG_BAND, 0);
3772 if (error) {
3773 eprintsoline(so, error);
3774 return (error);
3775 }
3776 control = NULL;
3777 if (uiop->uio_resid > 0) {
3778 /*
3779 * Recheck for fatal errors. Fail write even though
3780 * some data have been written. This is consistent
3781 * with strwrite semantics and BSD sockets semantics.
3782 */
3783 if (so->so_state & SS_CANTSENDMORE) {
3784 eprintsoline(so, error);
3785 return (EPIPE);
3786 }
3787 if (so->so_error != 0) {
3788 mutex_enter(&so->so_lock);
3789 error = sogeterr(so, B_TRUE);
3790 mutex_exit(&so->so_lock);
3791 if (error != 0) {
3792 eprintsoline(so, error);
3793 return (error);
3794 }
3795 }
3796 }
3797 } while (uiop->uio_resid > 0);
3798 return (0);
3799 }
3800
3801 /*
3802 * Sending data on a datagram socket.
3803 * Assumes caller has verified that SS_ISBOUND etc. are set.
3804 *
3805 * For AF_UNIX the destination address may be already in
3806 * internal form, as indicated by sti->sti_faddr_noxlate
3807 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3808 * translate the destination address to internal form.
3809 *
3810 * The source address is passed as an option.
3811 */
3812 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3813 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3814 struct uio *uiop, int flags)
3815 {
3816 struct T_unitdata_req tudr;
3817 mblk_t *mp;
3818 int error;
3819 void *addr;
3820 socklen_t addrlen;
3821 void *src;
3822 socklen_t srclen;
3823 ssize_t len;
3824 sotpi_info_t *sti = SOTOTPI(so);
3825
3826 ASSERT(name != NULL && namelen != 0);
3827
3828 len = uiop->uio_resid;
3829 if (len > sti->sti_tidu_size) {
3830 error = EMSGSIZE;
3831 goto done;
3832 }
3833
3834 if (sti->sti_faddr_noxlate == 0 &&
3835 (flags & MSG_SENDTO_NOXLATE) == 0) {
3836 /*
3837 * Length and family checks.
3838 * Don't verify internal form.
3839 */
3840 error = so_addr_verify(so, name, namelen);
3841 if (error != 0)
3842 goto done;
3843 }
3844
3845 if (sti->sti_direct) /* Never on AF_UNIX */
3846 return (sodgram_direct(so, name, namelen, uiop, flags));
3847
3848 if (so->so_family == AF_UNIX) {
3849 if (sti->sti_faddr_noxlate) {
3850 /*
3851 * Already have a transport internal address. Do not
3852 * pass any (transport internal) source address.
3853 */
3854 addr = name;
3855 addrlen = namelen;
3856 src = NULL;
3857 srclen = 0;
3858 } else if (flags & MSG_SENDTO_NOXLATE) {
3859 /*
3860 * Have an internal form dest. address.
3861 * Pass the source address as usual.
3862 */
3863 addr = name;
3864 addrlen = namelen;
3865 src = sti->sti_laddr_sa;
3866 srclen = (socklen_t)sti->sti_laddr_len;
3867 } else {
3868 /*
3869 * Pass the sockaddr_un source address as an option
3870 * and translate the remote address.
3871 *
3872 * Note that this code does not prevent sti_laddr_sa
3873 * from changing while it is being used. Thus
3874 * if an unbind+bind occurs concurrently with this
3875 * send the peer might see a partially new and a
3876 * partially old "from" address.
3877 */
3878 src = sti->sti_laddr_sa;
3879 srclen = (socklen_t)sti->sti_laddr_len;
3880 dprintso(so, 1,
3881 ("sosend_dgram UNIX: srclen %d, src %p\n",
3882 srclen, src));
3883 /*
3884 * The sendmsg caller specified a destination
3885 * address, which we must translate into our
3886 * internal form. addr = &sti->sti_ux_taddr
3887 */
3888 error = so_ux_addr_xlate(so, name, namelen,
3889 (flags & MSG_XPG4_2),
3890 &addr, &addrlen);
3891 if (error) {
3892 eprintsoline(so, error);
3893 goto done;
3894 }
3895 }
3896 } else {
3897 addr = name;
3898 addrlen = namelen;
3899 src = NULL;
3900 srclen = 0;
3901 }
3902 tudr.PRIM_type = T_UNITDATA_REQ;
3903 tudr.DEST_length = addrlen;
3904 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3905 if (srclen == 0) {
3906 tudr.OPT_length = 0;
3907 tudr.OPT_offset = 0;
3908
3909 mp = soallocproto2(&tudr, sizeof (tudr),
3910 addr, addrlen, 0, _ALLOC_INTR, CRED());
3911 if (mp == NULL) {
3912 /*
3913 * Caught a signal waiting for memory.
3914 * Let send* return EINTR.
3915 */
3916 error = EINTR;
3917 goto done;
3918 }
3919 } else {
3920 /*
3921 * There is a AF_UNIX sockaddr_un to include as a source
3922 * address option.
3923 */
3924 struct T_opthdr toh;
3925 ssize_t size;
3926
3927 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3928 _TPI_ALIGN_TOPT(srclen));
3929 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3930 _TPI_ALIGN_TOPT(addrlen));
3931
3932 toh.level = SOL_SOCKET;
3933 toh.name = SO_SRCADDR;
3934 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3935 toh.status = 0;
3936
3937 size = tudr.OPT_offset + tudr.OPT_length;
3938 mp = soallocproto2(&tudr, sizeof (tudr),
3939 addr, addrlen, size, _ALLOC_INTR, CRED());
3940 if (mp == NULL) {
3941 /*
3942 * Caught a signal waiting for memory.
3943 * Let send* return EINTR.
3944 */
3945 error = EINTR;
3946 goto done;
3947 }
3948 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3949 soappendmsg(mp, &toh, sizeof (toh));
3950 soappendmsg(mp, src, srclen);
3951 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3952 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3953 }
3954
3955 if (AU_AUDITING())
3956 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3957
3958 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3959 done:
3960 #ifdef SOCK_DEBUG
3961 if (error) {
3962 eprintsoline(so, error);
3963 }
3964 #endif /* SOCK_DEBUG */
3965 return (error);
3966 }
3967
3968 /*
3969 * Sending data on a connected stream socket.
3970 * Assumes caller has verified that SS_ISCONNECTED is set.
3971 */
3972 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)3973 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3974 int sflag)
3975 {
3976 struct T_data_req tdr;
3977 mblk_t *mp;
3978 int error;
3979 ssize_t iosize;
3980 sotpi_info_t *sti = SOTOTPI(so);
3981
3982 dprintso(so, 1,
3983 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3984 (void *)so, uiop->uio_resid, prim, sflag));
3985
3986 /*
3987 * Has to be bound and connected. However, since no locks are
3988 * held the state could have changed after sotpi_sendmsg checked it
3989 * thus it is not possible to ASSERT on the state.
3990 */
3991
3992 do {
3993 /*
3994 * Set the MORE flag if uio_resid does not fit in this
3995 * message or if the caller passed in "more".
3996 * Error for transports with zero tidu_size.
3997 */
3998 tdr.PRIM_type = prim;
3999 iosize = sti->sti_tidu_size;
4000 if (iosize <= 0)
4001 return (EMSGSIZE);
4002 if (uiop->uio_resid > iosize) {
4003 tdr.MORE_flag = 1;
4004 } else {
4005 if (more)
4006 tdr.MORE_flag = 1;
4007 else
4008 tdr.MORE_flag = 0;
4009 iosize = uiop->uio_resid;
4010 }
4011 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4012 prim, tdr.MORE_flag, iosize));
4013 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4014 if (mp == NULL) {
4015 /*
4016 * Caught a signal waiting for memory.
4017 * Let send* return EINTR.
4018 */
4019 return (EINTR);
4020 }
4021
4022 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4023 0, sflag | MSG_BAND, 0);
4024 if (error) {
4025 eprintsoline(so, error);
4026 return (error);
4027 }
4028 if (uiop->uio_resid > 0) {
4029 /*
4030 * Recheck for fatal errors. Fail write even though
4031 * some data have been written. This is consistent
4032 * with strwrite semantics and BSD sockets semantics.
4033 */
4034 if (so->so_state & SS_CANTSENDMORE) {
4035 eprintsoline(so, error);
4036 return (EPIPE);
4037 }
4038 if (so->so_error != 0) {
4039 mutex_enter(&so->so_lock);
4040 error = sogeterr(so, B_TRUE);
4041 mutex_exit(&so->so_lock);
4042 if (error != 0) {
4043 eprintsoline(so, error);
4044 return (error);
4045 }
4046 }
4047 }
4048 } while (uiop->uio_resid > 0);
4049 return (0);
4050 }
4051
4052 /*
4053 * Check the state for errors and call the appropriate send function.
4054 *
4055 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4056 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4057 * after sending the message.
4058 *
4059 * The caller may optionally specify a destination address, for either
4060 * stream or datagram sockets. This table summarizes the cases:
4061 *
4062 * Socket type Dest. given Connected Result
4063 * ----------- ----------- --------- --------------
4064 * Stream * Yes send to conn. addr.
4065 * Stream * No error ENOTCONN
4066 * Dgram yes * send to given addr.
4067 * Dgram no yes send to conn. addr.
4068 * Dgram no no error EDESTADDRREQ
4069 *
4070 * There are subtleties around the destination address when using
4071 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4072 * destination address, it's in (struct sockaddr_un) form and we
4073 * need to translate it to our internal form (struct so_ux_addr).
4074 *
4075 * When the sendmsg call does not specify a destination address
4076 * we're using the peer address saved during sotpi_connect, and
4077 * that address is already in internal form. In this case, the
4078 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4079 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4080 * those functions should skip translation to internal form.
4081 * Avoiding that translation is not only more efficient, but it's
4082 * also necessary when a process does a connect on an AF_UNIX
4083 * datagram socket and then drops privileges. After the process
 * has dropped privileges, it may no longer be able to look up
 * the external name in the filesystem, but it should still be
4086 * able to send messages on the connected socket by leaving the
4087 * destination name unspecified.
4088 *
4089 * Yet more subtleties arise with sockets connected by socketpair(),
4090 * which puts internal form addresses in the fields where normally
4091 * the external form is found, and sets sti_faddr_noxlate=1, which
4092 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4093 * to skip translation of destination addresses to internal form.
4094 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4095 * different behaviour almost everywhere AF_UNIX addresses appear.
4096 */
4097 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4098 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4099 struct cred *cr)
4100 {
4101 int so_state;
4102 int so_mode;
4103 int error;
4104 struct sockaddr *name;
4105 t_uscalar_t namelen;
4106 int dontroute;
4107 int flags;
4108 sotpi_info_t *sti = SOTOTPI(so);
4109
4110 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4111 (void *)so, (void *)msg, msg->msg_flags,
4112 pr_state(so->so_state, so->so_mode), so->so_error));
4113
4114 if (so->so_version == SOV_STREAM) {
4115 /* The imaginary "sockmod" has been popped - act as a stream */
4116 so_update_attrs(so, SOMOD);
4117 return (strwrite(SOTOV(so), uiop, cr));
4118 }
4119
4120 mutex_enter(&so->so_lock);
4121 so_state = so->so_state;
4122
4123 if (so_state & SS_CANTSENDMORE) {
4124 mutex_exit(&so->so_lock);
4125 return (EPIPE);
4126 }
4127
4128 if (so->so_error != 0) {
4129 error = sogeterr(so, B_TRUE);
4130 if (error != 0) {
4131 mutex_exit(&so->so_lock);
4132 return (error);
4133 }
4134 }
4135
4136 name = (struct sockaddr *)msg->msg_name;
4137 namelen = msg->msg_namelen;
4138 flags = msg->msg_flags;
4139
4140 /*
4141 * Historically, this function does not validate the flags
4142 * passed in, and any errant bits are ignored. However,
	 * we would not want any such errant flag bits accidentally
4144 * being treated as one of the internal-only flags, so
4145 * clear the internal-only flag bits.
4146 */
4147 flags &= ~MSG_SENDTO_NOXLATE;
4148
4149 so_mode = so->so_mode;
4150
4151 if (name == NULL) {
4152 if (!(so_state & SS_ISCONNECTED)) {
4153 mutex_exit(&so->so_lock);
4154 if (so_mode & SM_CONNREQUIRED)
4155 return (ENOTCONN);
4156 else
4157 return (EDESTADDRREQ);
4158 }
4159 /*
4160 * This is a connected socket.
4161 */
4162 if (so_mode & SM_CONNREQUIRED) {
4163 /*
4164 * This is a connected STREAM socket,
4165 * destination not specified.
4166 */
4167 name = NULL;
4168 namelen = 0;
4169 } else {
4170 /*
4171 * Datagram send on connected socket with
4172 * the destination name not specified.
4173 * Use the peer address from connect.
4174 */
4175 if (so->so_family == AF_UNIX) {
4176 /*
4177 * Use the (internal form) address saved
4178 * in sotpi_connect. See above.
4179 */
4180 name = (void *)&sti->sti_ux_faddr;
4181 namelen = sizeof (sti->sti_ux_faddr);
4182 flags |= MSG_SENDTO_NOXLATE;
4183 } else {
4184 ASSERT(sti->sti_faddr_sa);
4185 name = sti->sti_faddr_sa;
4186 namelen = (t_uscalar_t)sti->sti_faddr_len;
4187 }
4188 }
4189 } else {
4190 /*
4191 * Sendmsg specifies a destination name
4192 */
4193 if (!(so_state & SS_ISCONNECTED) &&
4194 (so_mode & SM_CONNREQUIRED)) {
4195 /* i.e. TCP not connected */
4196 mutex_exit(&so->so_lock);
4197 return (ENOTCONN);
4198 }
4199 /*
4200 * Ignore the address on connection-oriented sockets.
4201 * Just like BSD this code does not generate an error for
4202 * TCP (a CONNREQUIRED socket) when sending to an address
4203 * passed in with sendto/sendmsg. Instead the data is
4204 * delivered on the connection as if no address had been
4205 * supplied.
4206 */
4207 if ((so_state & SS_ISCONNECTED) &&
4208 !(so_mode & SM_CONNREQUIRED)) {
4209 mutex_exit(&so->so_lock);
4210 return (EISCONN);
4211 }
4212 if (!(so_state & SS_ISBOUND)) {
4213 so_lock_single(so); /* Set SOLOCKED */
4214 error = sotpi_bind(so, NULL, 0,
4215 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4216 so_unlock_single(so, SOLOCKED);
4217 if (error) {
4218 mutex_exit(&so->so_lock);
4219 eprintsoline(so, error);
4220 return (error);
4221 }
4222 }
4223 /*
4224 * Handle delayed datagram errors. These are only queued
4225 * when the application sets SO_DGRAM_ERRIND.
4226 * Return the error if we are sending to the address
4227 * that was returned in the last T_UDERROR_IND.
4228 * If sending to some other address discard the delayed
4229 * error indication.
4230 */
4231 if (sti->sti_delayed_error) {
4232 struct T_uderror_ind *tudi;
4233 void *addr;
4234 t_uscalar_t addrlen;
4235 boolean_t match = B_FALSE;
4236
4237 ASSERT(sti->sti_eaddr_mp);
4238 error = sti->sti_delayed_error;
4239 sti->sti_delayed_error = 0;
4240 tudi =
4241 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4242 addrlen = tudi->DEST_length;
4243 addr = sogetoff(sti->sti_eaddr_mp,
4244 tudi->DEST_offset, addrlen, 1);
4245 ASSERT(addr); /* Checked by strsock_proto */
4246 switch (so->so_family) {
4247 case AF_INET: {
4248 /* Compare just IP address and port */
4249 sin_t *sin1 = (sin_t *)name;
4250 sin_t *sin2 = (sin_t *)addr;
4251
4252 if (addrlen == sizeof (sin_t) &&
4253 namelen == addrlen &&
4254 sin1->sin_port == sin2->sin_port &&
4255 sin1->sin_addr.s_addr ==
4256 sin2->sin_addr.s_addr)
4257 match = B_TRUE;
4258 break;
4259 }
4260 case AF_INET6: {
4261 /* Compare just IP address and port. Not flow */
4262 sin6_t *sin1 = (sin6_t *)name;
4263 sin6_t *sin2 = (sin6_t *)addr;
4264
4265 if (addrlen == sizeof (sin6_t) &&
4266 namelen == addrlen &&
4267 sin1->sin6_port == sin2->sin6_port &&
4268 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4269 &sin2->sin6_addr))
4270 match = B_TRUE;
4271 break;
4272 }
4273 case AF_UNIX:
4274 default:
4275 if (namelen == addrlen &&
4276 bcmp(name, addr, namelen) == 0)
4277 match = B_TRUE;
4278 }
4279 if (match) {
4280 freemsg(sti->sti_eaddr_mp);
4281 sti->sti_eaddr_mp = NULL;
4282 mutex_exit(&so->so_lock);
4283 #ifdef DEBUG
4284 dprintso(so, 0,
4285 ("sockfs delayed error %d for %s\n",
4286 error,
4287 pr_addr(so->so_family, name, namelen)));
4288 #endif /* DEBUG */
4289 return (error);
4290 }
4291 freemsg(sti->sti_eaddr_mp);
4292 sti->sti_eaddr_mp = NULL;
4293 }
4294 }
4295 mutex_exit(&so->so_lock);
4296
4297 dontroute = 0;
4298 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4299 uint32_t val;
4300
4301 val = 1;
4302 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4303 &val, (t_uscalar_t)sizeof (val), cr);
4304 if (error)
4305 return (error);
4306 dontroute = 1;
4307 }
4308
4309 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4310 error = EOPNOTSUPP;
4311 goto done;
4312 }
4313 if (msg->msg_controllen != 0) {
4314 if (!(so_mode & SM_CONNREQUIRED)) {
4315 so_update_attrs(so, SOMOD);
4316 error = sosend_dgramcmsg(so, name, namelen, uiop,
4317 msg->msg_control, msg->msg_controllen, flags);
4318 } else {
4319 if (flags & MSG_OOB) {
4320 /* Can't generate T_EXDATA_REQ with options */
4321 error = EOPNOTSUPP;
4322 goto done;
4323 }
4324 so_update_attrs(so, SOMOD);
4325 error = sosend_svccmsg(so, uiop,
4326 !(flags & MSG_EOR),
4327 msg->msg_control, msg->msg_controllen,
4328 flags);
4329 }
4330 goto done;
4331 }
4332
4333 so_update_attrs(so, SOMOD);
4334 if (!(so_mode & SM_CONNREQUIRED)) {
4335 /*
4336 * If there is no SO_DONTROUTE to turn off return immediately
4337 * from send_dgram. This can allow tail-call optimizations.
4338 */
4339 if (!dontroute) {
4340 return (sosend_dgram(so, name, namelen, uiop, flags));
4341 }
4342 error = sosend_dgram(so, name, namelen, uiop, flags);
4343 } else {
4344 t_scalar_t prim;
4345 int sflag;
4346
4347 /* Ignore msg_name in the connected state */
4348 if (flags & MSG_OOB) {
4349 prim = T_EXDATA_REQ;
4350 /*
4351 * Send down T_EXDATA_REQ even if there is flow
4352 * control for data.
4353 */
4354 sflag = MSG_IGNFLOW;
4355 } else {
4356 if (so_mode & SM_BYTESTREAM) {
4357 /* Byte stream transport - use write */
4358 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4359
4360 /* Send M_DATA messages */
4361 /*
4362 * If there is no SO_DONTROUTE to turn off,
4363 * sti_direct is on, and there is no flow
4364 * control, we can take the fast path.
4365 */
4366 if (!dontroute && sti->sti_direct != 0 &&
4367 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4368 return (sostream_direct(so, uiop,
4369 NULL, cr));
4370 }
4371 error = strwrite(SOTOV(so), uiop, cr);
4372 goto done;
4373 }
4374 prim = T_DATA_REQ;
4375 sflag = 0;
4376 }
4377 /*
4378 * If there is no SO_DONTROUTE to turn off return immediately
4379 * from sosend_svc. This can allow tail-call optimizations.
4380 */
4381 if (!dontroute)
4382 return (sosend_svc(so, uiop, prim,
4383 !(flags & MSG_EOR), sflag));
4384 error = sosend_svc(so, uiop, prim,
4385 !(flags & MSG_EOR), sflag);
4386 }
4387 ASSERT(dontroute);
4388 done:
4389 if (dontroute) {
4390 uint32_t val;
4391
4392 val = 0;
4393 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4394 &val, (t_uscalar_t)sizeof (val), cr);
4395 }
4396 return (error);
4397 }
4398
4399 /*
4400 * kstrwritemp() has very similar semantics as that of strwrite().
4401 * The main difference is it obtains mblks from the caller and also
4402 * does not do any copy as done in strwrite() from user buffers to
4403 * kernel buffers.
4404 *
4405 * Currently, this routine is used by sendfile to send data allocated
4406 * within the kernel without any copying. This interface does not use the
4407 * synchronous stream interface as synch. stream interface implies
4408 * copying.
4409 */
4410 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4411 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4412 {
4413 struct stdata *stp;
4414 struct queue *wqp;
4415 mblk_t *newmp;
4416 char waitflag;
4417 int tempmode;
4418 int error = 0;
4419 int done = 0;
4420 struct sonode *so;
4421 boolean_t direct;
4422
4423 ASSERT(vp->v_stream);
4424 stp = vp->v_stream;
4425
4426 so = VTOSO(vp);
4427 direct = _SOTOTPI(so)->sti_direct;
4428
4429 /*
4430 * This is the sockfs direct fast path. canputnext() need
4431 * not be accurate so we don't grab the sd_lock here. If
4432 * we get flow-controlled, we grab sd_lock just before the
4433 * do..while loop below to emulate what strwrite() does.
4434 */
4435 wqp = stp->sd_wrq;
4436 if (canputnext(wqp) && direct &&
4437 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4438 return (sostream_direct(so, NULL, mp, CRED()));
4439 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4440 /* Fast check of flags before acquiring the lock */
4441 mutex_enter(&stp->sd_lock);
4442 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4443 mutex_exit(&stp->sd_lock);
4444 if (error != 0) {
4445 if (!(stp->sd_flag & STPLEX) &&
4446 (stp->sd_wput_opt & SW_SIGPIPE)) {
4447 error = EPIPE;
4448 }
4449 return (error);
4450 }
4451 }
4452
4453 waitflag = WRITEWAIT;
4454 if (stp->sd_flag & OLDNDELAY)
4455 tempmode = fmode & ~FNDELAY;
4456 else
4457 tempmode = fmode;
4458
4459 mutex_enter(&stp->sd_lock);
4460 do {
4461 if (canputnext(wqp)) {
4462 mutex_exit(&stp->sd_lock);
4463 if (stp->sd_wputdatafunc != NULL) {
4464 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4465 NULL, NULL, NULL);
4466 if (newmp == NULL) {
4467 /* The caller will free mp */
4468 return (ECOMM);
4469 }
4470 mp = newmp;
4471 }
4472 putnext(wqp, mp);
4473 return (0);
4474 }
4475 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4476 &done);
4477 } while (error == 0 && !done);
4478
4479 mutex_exit(&stp->sd_lock);
4480 /*
4481 * EAGAIN tells the application to try again. ENOMEM
4482 * is returned only if the memory allocation size
4483 * exceeds the physical limits of the system. ENOMEM
4484 * can't be true here.
4485 */
4486 if (error == ENOMEM)
4487 error = EAGAIN;
4488 return (error);
4489 }
4490
4491 /* ARGSUSED */
4492 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4493 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4494 struct cred *cr, mblk_t **mpp)
4495 {
4496 int error;
4497
4498 switch (so->so_family) {
4499 case AF_INET:
4500 case AF_INET6:
4501 case AF_UNIX:
4502 break;
4503 default:
4504 return (EAFNOSUPPORT);
4505
4506 }
4507
4508 if (so->so_state & SS_CANTSENDMORE)
4509 return (EPIPE);
4510
4511 if (so->so_type != SOCK_STREAM)
4512 return (EOPNOTSUPP);
4513
4514 if ((so->so_state & SS_ISCONNECTED) == 0)
4515 return (ENOTCONN);
4516
4517 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4518 if (error == 0)
4519 *mpp = NULL;
4520 return (error);
4521 }
4522
4523 /*
4524 * Sending data on a datagram socket.
4525 * Assumes caller has verified that SS_ISBOUND etc. are set.
4526 */
4527 /* ARGSUSED */
4528 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4529 sodgram_direct(struct sonode *so, struct sockaddr *name,
4530 socklen_t namelen, struct uio *uiop, int flags)
4531 {
4532 struct T_unitdata_req tudr;
4533 mblk_t *mp = NULL;
4534 int error = 0;
4535 void *addr;
4536 socklen_t addrlen;
4537 ssize_t len;
4538 struct stdata *stp = SOTOV(so)->v_stream;
4539 int so_state;
4540 queue_t *udp_wq;
4541 boolean_t connected;
4542 mblk_t *mpdata = NULL;
4543 sotpi_info_t *sti = SOTOTPI(so);
4544 uint32_t auditing = AU_AUDITING();
4545
4546 ASSERT(name != NULL && namelen != 0);
4547 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4548 ASSERT(!(so->so_mode & SM_EXDATA));
4549 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4550 ASSERT(SOTOV(so)->v_type == VSOCK);
4551
4552 /* Caller checked for proper length */
4553 len = uiop->uio_resid;
4554 ASSERT(len <= sti->sti_tidu_size);
4555
4556 /* Length and family checks have been done by caller */
4557 ASSERT(name->sa_family == so->so_family);
4558 ASSERT(so->so_family == AF_INET ||
4559 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4560 ASSERT(so->so_family == AF_INET6 ||
4561 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4562
4563 addr = name;
4564 addrlen = namelen;
4565
4566 if (stp->sd_sidp != NULL &&
4567 (error = straccess(stp, JCWRITE)) != 0)
4568 goto done;
4569
4570 so_state = so->so_state;
4571
4572 connected = so_state & SS_ISCONNECTED;
4573 if (!connected) {
4574 tudr.PRIM_type = T_UNITDATA_REQ;
4575 tudr.DEST_length = addrlen;
4576 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4577 tudr.OPT_length = 0;
4578 tudr.OPT_offset = 0;
4579
4580 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4581 _ALLOC_INTR, CRED());
4582 if (mp == NULL) {
4583 /*
4584 * Caught a signal waiting for memory.
4585 * Let send* return EINTR.
4586 */
4587 error = EINTR;
4588 goto done;
4589 }
4590 }
4591
4592 /*
4593 * For UDP we don't break up the copyin into smaller pieces
4594 * as in the TCP case. That means if ENOMEM is returned by
4595 * mcopyinuio() then the uio vector has not been modified at
4596 * all and we fallback to either strwrite() or kstrputmsg()
4597 * below. Note also that we never generate priority messages
4598 * from here.
4599 */
4600 udp_wq = stp->sd_wrq->q_next;
4601 if (canput(udp_wq) &&
4602 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4603 ASSERT(DB_TYPE(mpdata) == M_DATA);
4604 ASSERT(uiop->uio_resid == 0);
4605 if (!connected)
4606 linkb(mp, mpdata);
4607 else
4608 mp = mpdata;
4609 if (auditing)
4610 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4611
4612 /* Always returns 0... */
4613 return (udp_wput(udp_wq, mp));
4614 }
4615
4616 ASSERT(mpdata == NULL);
4617 if (error != 0 && error != ENOMEM) {
4618 freemsg(mp);
4619 return (error);
4620 }
4621
4622 /*
4623 * For connected, let strwrite() handle the blocking case.
4624 * Otherwise we fall thru and use kstrputmsg().
4625 */
4626 if (connected)
4627 return (strwrite(SOTOV(so), uiop, CRED()));
4628
4629 if (auditing)
4630 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4631
4632 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4633 done:
4634 #ifdef SOCK_DEBUG
4635 if (error != 0) {
4636 eprintsoline(so, error);
4637 }
4638 #endif /* SOCK_DEBUG */
4639 return (error);
4640 }
4641
4642 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4643 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4644 {
4645 struct stdata *stp = SOTOV(so)->v_stream;
4646 ssize_t iosize, rmax, maxblk;
4647 queue_t *tcp_wq = stp->sd_wrq->q_next;
4648 mblk_t *newmp;
4649 int error = 0, wflag = 0;
4650
4651 ASSERT(so->so_mode & SM_BYTESTREAM);
4652 ASSERT(SOTOV(so)->v_type == VSOCK);
4653
4654 if (stp->sd_sidp != NULL &&
4655 (error = straccess(stp, JCWRITE)) != 0)
4656 return (error);
4657
4658 if (uiop == NULL) {
4659 /*
4660 * kstrwritemp() should have checked sd_flag and
4661 * flow-control before coming here. If we end up
4662 * here it means that we can simply pass down the
4663 * data to tcp.
4664 */
4665 ASSERT(mp != NULL);
4666 if (stp->sd_wputdatafunc != NULL) {
4667 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4668 NULL, NULL, NULL);
4669 if (newmp == NULL) {
4670 /* The caller will free mp */
4671 return (ECOMM);
4672 }
4673 mp = newmp;
4674 }
4675 /* Always returns 0... */
4676 return (tcp_wput(tcp_wq, mp));
4677 }
4678
4679 /* Fallback to strwrite() to do proper error handling */
4680 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4681 return (strwrite(SOTOV(so), uiop, cr));
4682
4683 rmax = stp->sd_qn_maxpsz;
4684 ASSERT(rmax >= 0 || rmax == INFPSZ);
4685 if (rmax == 0 || uiop->uio_resid <= 0)
4686 return (0);
4687
4688 if (rmax == INFPSZ)
4689 rmax = uiop->uio_resid;
4690
4691 maxblk = stp->sd_maxblk;
4692
4693 for (;;) {
4694 iosize = MIN(uiop->uio_resid, rmax);
4695
4696 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4697 if (mp == NULL) {
4698 /*
4699 * Fallback to strwrite() for ENOMEM; if this
4700 * is our first time in this routine and the uio
4701 * vector has not been modified, we will end up
4702 * calling strwrite() without any flag set.
4703 */
4704 if (error == ENOMEM)
4705 goto slow_send;
4706 else
4707 return (error);
4708 }
4709 ASSERT(uiop->uio_resid >= 0);
4710 /*
4711 * If mp is non-NULL and ENOMEM is set, it means that
4712 * mcopyinuio() was able to break down some of the user
4713 * data into one or more mblks. Send the partial data
4714 * to tcp and let the rest be handled in strwrite().
4715 */
4716 ASSERT(error == 0 || error == ENOMEM);
4717 if (stp->sd_wputdatafunc != NULL) {
4718 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4719 NULL, NULL, NULL);
4720 if (newmp == NULL) {
4721 /* The caller will free mp */
4722 return (ECOMM);
4723 }
4724 mp = newmp;
4725 }
4726 (void) tcp_wput(tcp_wq, mp); /* Always returns 0 anyway. */
4727
4728 wflag |= NOINTR;
4729
4730 if (uiop->uio_resid == 0) { /* No more data; we're done */
4731 ASSERT(error == 0);
4732 break;
4733 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4734 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4735 slow_send:
4736 /*
4737 * We were able to send down partial data using
4738 * the direct call interface, but are now relying
4739 * on strwrite() to handle the non-fastpath cases.
4740 * If the socket is blocking we will sleep in
4741 * strwaitq() until write is permitted, otherwise,
4742 * we will need to return the amount of bytes
4743 * written so far back to the app. This is the
4744 * reason why we pass NOINTR flag to strwrite()
4745 * for non-blocking socket, because we don't want
4746 * to return EAGAIN when portion of the user data
4747 * has actually been sent down.
4748 */
4749 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4750 }
4751 }
4752 return (0);
4753 }
4754
4755 /*
4756 * Update sti_faddr by asking the transport (unless AF_UNIX).
4757 */
4758 /* ARGSUSED */
4759 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4760 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4761 boolean_t accept, struct cred *cr)
4762 {
4763 struct strbuf strbuf;
4764 int error = 0, res;
4765 void *addr;
4766 t_uscalar_t addrlen;
4767 k_sigset_t smask;
4768 sotpi_info_t *sti = SOTOTPI(so);
4769
4770 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4771 (void *)so, pr_state(so->so_state, so->so_mode)));
4772
4773 ASSERT(*namelen > 0);
4774 mutex_enter(&so->so_lock);
4775 so_lock_single(so); /* Set SOLOCKED */
4776
4777 if (accept) {
4778 bcopy(sti->sti_faddr_sa, name,
4779 MIN(*namelen, sti->sti_faddr_len));
4780 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4781 goto done;
4782 }
4783
4784 if (!(so->so_state & SS_ISCONNECTED)) {
4785 error = ENOTCONN;
4786 goto done;
4787 }
4788 /* Added this check for X/Open */
4789 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4790 error = EINVAL;
4791 if (xnet_check_print) {
4792 printf("sockfs: X/Open getpeername check => EINVAL\n");
4793 }
4794 goto done;
4795 }
4796
4797 if (sti->sti_faddr_valid) {
4798 bcopy(sti->sti_faddr_sa, name,
4799 MIN(*namelen, sti->sti_faddr_len));
4800 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4801 goto done;
4802 }
4803
4804 #ifdef DEBUG
4805 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4806 pr_addr(so->so_family, sti->sti_faddr_sa,
4807 (t_uscalar_t)sti->sti_faddr_len)));
4808 #endif /* DEBUG */
4809
4810 if (so->so_family == AF_UNIX) {
4811 /* Transport has different name space - return local info */
4812 if (sti->sti_faddr_noxlate)
4813 *namelen = 0;
4814 error = 0;
4815 goto done;
4816 }
4817
4818 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4819
4820 ASSERT(sti->sti_faddr_sa);
4821 /* Allocate local buffer to use with ioctl */
4822 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4823 mutex_exit(&so->so_lock);
4824 addr = kmem_alloc(addrlen, KM_SLEEP);
4825
4826 /*
4827 * Issue TI_GETPEERNAME with signals masked.
4828 * Put the result in sti_faddr_sa so that getpeername works after
4829 * a shutdown(output).
4830 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4831 * back to the socket.
4832 */
4833 strbuf.buf = addr;
4834 strbuf.maxlen = addrlen;
4835 strbuf.len = 0;
4836
4837 sigintr(&smask, 0);
4838 res = 0;
4839 ASSERT(cr);
4840 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4841 0, K_TO_K, cr, &res);
4842 sigunintr(&smask);
4843
4844 mutex_enter(&so->so_lock);
4845 /*
4846 * If there is an error record the error in so_error put don't fail
4847 * the getpeername. Instead fallback on the recorded
4848 * sti->sti_faddr_sa.
4849 */
4850 if (error) {
4851 /*
4852 * Various stream head errors can be returned to the ioctl.
4853 * However, it is impossible to determine which ones of
4854 * these are really socket level errors that were incorrectly
4855 * consumed by the ioctl. Thus this code silently ignores the
4856 * error - to code explicitly does not reinstate the error
4857 * using soseterror().
4858 * Experiments have shows that at least this set of
4859 * errors are reported and should not be reinstated on the
4860 * socket:
4861 * EINVAL E.g. if an I_LINK was in effect when
4862 * getpeername was called.
4863 * EPIPE The ioctl error semantics prefer the write
4864 * side error over the read side error.
4865 * ENOTCONN The transport just got disconnected but
4866 * sockfs had not yet seen the T_DISCON_IND
4867 * when issuing the ioctl.
4868 */
4869 error = 0;
4870 } else if (res == 0 && strbuf.len > 0 &&
4871 (so->so_state & SS_ISCONNECTED)) {
4872 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4873 sti->sti_faddr_len = (socklen_t)strbuf.len;
4874 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4875 sti->sti_faddr_valid = 1;
4876
4877 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4878 *namelen = sti->sti_faddr_len;
4879 }
4880 kmem_free(addr, addrlen);
4881 #ifdef DEBUG
4882 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4883 pr_addr(so->so_family, sti->sti_faddr_sa,
4884 (t_uscalar_t)sti->sti_faddr_len)));
4885 #endif /* DEBUG */
4886 done:
4887 so_unlock_single(so, SOLOCKED);
4888 mutex_exit(&so->so_lock);
4889 return (error);
4890 }
4891
4892 /*
4893 * Update sti_laddr by asking the transport (unless AF_UNIX).
4894 */
4895 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)4896 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4897 struct cred *cr)
4898 {
4899 struct strbuf strbuf;
4900 int error = 0, res;
4901 void *addr;
4902 t_uscalar_t addrlen;
4903 k_sigset_t smask;
4904 sotpi_info_t *sti = SOTOTPI(so);
4905
4906 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4907 (void *)so, pr_state(so->so_state, so->so_mode)));
4908
4909 ASSERT(*namelen > 0);
4910 mutex_enter(&so->so_lock);
4911 so_lock_single(so); /* Set SOLOCKED */
4912
4913 #ifdef DEBUG
4914
4915 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4916 pr_addr(so->so_family, sti->sti_laddr_sa,
4917 (t_uscalar_t)sti->sti_laddr_len)));
4918 #endif /* DEBUG */
4919 if (sti->sti_laddr_valid) {
4920 bcopy(sti->sti_laddr_sa, name,
4921 MIN(*namelen, sti->sti_laddr_len));
4922 *namelen = sti->sti_laddr_len;
4923 goto done;
4924 }
4925
4926 if (so->so_family == AF_UNIX) {
4927 /*
4928 * Transport has different name space - return local info. If we
4929 * have enough space, let consumers know the family.
4930 */
4931 if (*namelen >= sizeof (sa_family_t)) {
4932 name->sa_family = AF_UNIX;
4933 *namelen = sizeof (sa_family_t);
4934 } else {
4935 *namelen = 0;
4936 }
4937 error = 0;
4938 goto done;
4939 }
4940 if (!(so->so_state & SS_ISBOUND)) {
4941 /* If not bound, then nothing to return. */
4942 error = 0;
4943 goto done;
4944 }
4945
4946 /* Allocate local buffer to use with ioctl */
4947 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4948 mutex_exit(&so->so_lock);
4949 addr = kmem_alloc(addrlen, KM_SLEEP);
4950
4951 /*
4952 * Issue TI_GETMYNAME with signals masked.
4953 * Put the result in sti_laddr_sa so that getsockname works after
4954 * a shutdown(output).
4955 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4956 * back to the socket.
4957 */
4958 strbuf.buf = addr;
4959 strbuf.maxlen = addrlen;
4960 strbuf.len = 0;
4961
4962 sigintr(&smask, 0);
4963 res = 0;
4964 ASSERT(cr);
4965 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4966 0, K_TO_K, cr, &res);
4967 sigunintr(&smask);
4968
4969 mutex_enter(&so->so_lock);
4970 /*
4971 * If there is an error record the error in so_error put don't fail
4972 * the getsockname. Instead fallback on the recorded
4973 * sti->sti_laddr_sa.
4974 */
4975 if (error) {
4976 /*
4977 * Various stream head errors can be returned to the ioctl.
4978 * However, it is impossible to determine which ones of
4979 * these are really socket level errors that were incorrectly
4980 * consumed by the ioctl. Thus this code silently ignores the
4981 * error - to code explicitly does not reinstate the error
4982 * using soseterror().
4983 * Experiments have shows that at least this set of
4984 * errors are reported and should not be reinstated on the
4985 * socket:
4986 * EINVAL E.g. if an I_LINK was in effect when
4987 * getsockname was called.
4988 * EPIPE The ioctl error semantics prefer the write
4989 * side error over the read side error.
4990 */
4991 error = 0;
4992 } else if (res == 0 && strbuf.len > 0 &&
4993 (so->so_state & SS_ISBOUND)) {
4994 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4995 sti->sti_laddr_len = (socklen_t)strbuf.len;
4996 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4997 sti->sti_laddr_valid = 1;
4998
4999 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5000 *namelen = sti->sti_laddr_len;
5001 }
5002 kmem_free(addr, addrlen);
5003 #ifdef DEBUG
5004 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5005 pr_addr(so->so_family, sti->sti_laddr_sa,
5006 (t_uscalar_t)sti->sti_laddr_len)));
5007 #endif /* DEBUG */
5008 done:
5009 so_unlock_single(so, SOLOCKED);
5010 mutex_exit(&so->so_lock);
5011 return (error);
5012 }
5013
/*
 * Get socket options. For SOL_SOCKET, some options are handled directly
 * by sockfs, while for others the value recorded in the sonode is used
 * only as a fallback should the T_SVR4_OPTMGMT_REQ fail.
 *
 * On return, at most *optlenp bytes are copied to optval.
 */
5021 /* ARGSUSED */
5022 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)5023 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5024 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5025 {
5026 struct T_optmgmt_req optmgmt_req;
5027 struct T_optmgmt_ack *optmgmt_ack;
5028 struct opthdr oh;
5029 struct opthdr *opt_res;
5030 mblk_t *mp = NULL;
5031 int error = 0;
5032 void *option = NULL; /* Set if fallback value */
5033 t_uscalar_t maxlen = *optlenp;
5034 t_uscalar_t len;
5035 uint32_t value;
5036 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5037 struct timeval32 tmo_val32;
5038 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5039
5040 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5041 (void *)so, level, option_name, optval, (void *)optlenp,
5042 pr_state(so->so_state, so->so_mode)));
5043
5044 mutex_enter(&so->so_lock);
5045 so_lock_single(so); /* Set SOLOCKED */
5046
5047 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5048
5049 /*
5050 * Check for SOL_SOCKET options.
5051 * Certain SOL_SOCKET options are returned directly whereas
5052 * others only provide a default (fallback) value should
5053 * the T_SVR4_OPTMGMT_REQ fail.
5054 */
5055 if (level == SOL_SOCKET) {
5056 /* Check parameters */
5057 switch (option_name) {
5058 case SO_TYPE:
5059 case SO_ERROR:
5060 case SO_DEBUG:
5061 case SO_ACCEPTCONN:
5062 case SO_REUSEADDR:
5063 case SO_KEEPALIVE:
5064 case SO_DONTROUTE:
5065 case SO_BROADCAST:
5066 case SO_USELOOPBACK:
5067 case SO_OOBINLINE:
5068 case SO_SNDBUF:
5069 case SO_RCVBUF:
5070 #ifdef notyet
5071 case SO_SNDLOWAT:
5072 case SO_RCVLOWAT:
5073 #endif /* notyet */
5074 case SO_DOMAIN:
5075 case SO_DGRAM_ERRIND:
5076 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5077 error = EINVAL;
5078 eprintsoline(so, error);
5079 goto done2;
5080 }
5081 break;
5082 case SO_RCVTIMEO:
5083 case SO_SNDTIMEO:
5084 if (get_udatamodel() == DATAMODEL_NONE ||
5085 get_udatamodel() == DATAMODEL_NATIVE) {
5086 if (maxlen < sizeof (struct timeval)) {
5087 error = EINVAL;
5088 eprintsoline(so, error);
5089 goto done2;
5090 }
5091 } else {
5092 if (maxlen < sizeof (struct timeval32)) {
5093 error = EINVAL;
5094 eprintsoline(so, error);
5095 goto done2;
5096 }
5097
5098 }
5099 break;
5100 case SO_LINGER:
5101 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5102 error = EINVAL;
5103 eprintsoline(so, error);
5104 goto done2;
5105 }
5106 break;
5107 case SO_SND_BUFINFO:
5108 if (maxlen < (t_uscalar_t)
5109 sizeof (struct so_snd_bufinfo)) {
5110 error = EINVAL;
5111 eprintsoline(so, error);
5112 goto done2;
5113 }
5114 break;
5115 }
5116
5117 switch (option_name) {
5118 case SO_TYPE:
5119 value = so->so_type;
5120 option = &value;
5121 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5122
5123 case SO_ERROR:
5124 value = sogeterr(so, B_TRUE);
5125 option = &value;
5126 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5127
5128 case SO_ACCEPTCONN:
5129 if (so->so_state & SS_ACCEPTCONN)
5130 value = SO_ACCEPTCONN;
5131 else
5132 value = 0;
5133 #ifdef DEBUG
5134 if (value) {
5135 dprintso(so, 1,
5136 ("sotpi_getsockopt: 0x%x is set\n",
5137 option_name));
5138 } else {
5139 dprintso(so, 1,
5140 ("sotpi_getsockopt: 0x%x not set\n",
5141 option_name));
5142 }
5143 #endif /* DEBUG */
5144 option = &value;
5145 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5146
5147 case SO_DEBUG:
5148 case SO_REUSEADDR:
5149 case SO_KEEPALIVE:
5150 case SO_DONTROUTE:
5151 case SO_BROADCAST:
5152 case SO_USELOOPBACK:
5153 case SO_OOBINLINE:
5154 case SO_DGRAM_ERRIND:
5155 value = (so->so_options & option_name);
5156 #ifdef DEBUG
5157 if (value) {
5158 dprintso(so, 1,
5159 ("sotpi_getsockopt: 0x%x is set\n",
5160 option_name));
5161 } else {
5162 dprintso(so, 1,
5163 ("sotpi_getsockopt: 0x%x not set\n",
5164 option_name));
5165 }
5166 #endif /* DEBUG */
5167 option = &value;
5168 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5169
5170 /*
5171 * The following options are only returned by sockfs when the
5172 * T_SVR4_OPTMGMT_REQ fails.
5173 */
5174 case SO_LINGER:
5175 option = &so->so_linger;
5176 len = (t_uscalar_t)sizeof (struct linger);
5177 break;
5178 case SO_SNDBUF: {
5179 ssize_t lvalue;
5180
5181 /*
5182 * If the option has not been set then get a default
5183 * value from the read queue. This value is
5184 * returned if the transport fails
5185 * the T_SVR4_OPTMGMT_REQ.
5186 */
5187 lvalue = so->so_sndbuf;
5188 if (lvalue == 0) {
5189 mutex_exit(&so->so_lock);
5190 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5191 QHIWAT, 0, &lvalue);
5192 mutex_enter(&so->so_lock);
5193 dprintso(so, 1,
5194 ("got SO_SNDBUF %ld from q\n", lvalue));
5195 }
5196 value = (int)lvalue;
5197 option = &value;
5198 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5199 break;
5200 }
5201 case SO_RCVBUF: {
5202 ssize_t lvalue;
5203
5204 /*
5205 * If the option has not been set then get a default
5206 * value from the read queue. This value is
5207 * returned if the transport fails
5208 * the T_SVR4_OPTMGMT_REQ.
5209 *
5210 * XXX If SO_RCVBUF has been set and this is an
5211 * XPG 4.2 application then do not ask the transport
5212 * since the transport might adjust the value and not
5213 * return exactly what was set by the application.
5214 * For non-XPG 4.2 application we return the value
5215 * that the transport is actually using.
5216 */
5217 lvalue = so->so_rcvbuf;
5218 if (lvalue == 0) {
5219 mutex_exit(&so->so_lock);
5220 (void) strqget(RD(strvp2wq(SOTOV(so))),
5221 QHIWAT, 0, &lvalue);
5222 mutex_enter(&so->so_lock);
5223 dprintso(so, 1,
5224 ("got SO_RCVBUF %ld from q\n", lvalue));
5225 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5226 value = (int)lvalue;
5227 option = &value;
5228 goto copyout; /* skip asking transport */
5229 }
5230 value = (int)lvalue;
5231 option = &value;
5232 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5233 break;
5234 }
5235 case SO_DOMAIN:
5236 value = so->so_family;
5237 option = &value;
5238 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5239
5240 #ifdef notyet
5241 /*
5242 * We do not implement the semantics of these options
5243 * thus we shouldn't implement the options either.
5244 */
5245 case SO_SNDLOWAT:
5246 value = so->so_sndlowat;
5247 option = &value;
5248 break;
5249 case SO_RCVLOWAT:
5250 value = so->so_rcvlowat;
5251 option = &value;
5252 break;
5253 #endif /* notyet */
5254 case SO_SNDTIMEO:
5255 case SO_RCVTIMEO: {
5256 clock_t val;
5257
5258 if (option_name == SO_RCVTIMEO)
5259 val = drv_hztousec(so->so_rcvtimeo);
5260 else
5261 val = drv_hztousec(so->so_sndtimeo);
5262 tmo_val.tv_sec = val / (1000 * 1000);
5263 tmo_val.tv_usec = val % (1000 * 1000);
5264 if (get_udatamodel() == DATAMODEL_NONE ||
5265 get_udatamodel() == DATAMODEL_NATIVE) {
5266 option = &tmo_val;
5267 len = sizeof (struct timeval);
5268 } else {
5269 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5270 option = &tmo_val32;
5271 len = sizeof (struct timeval32);
5272 }
5273 break;
5274 }
5275 case SO_SND_BUFINFO: {
5276 snd_bufinfo.sbi_wroff =
5277 (so->so_proto_props).sopp_wroff;
5278 snd_bufinfo.sbi_maxblk =
5279 (so->so_proto_props).sopp_maxblk;
5280 snd_bufinfo.sbi_maxpsz =
5281 (so->so_proto_props).sopp_maxpsz;
5282 snd_bufinfo.sbi_tail =
5283 (so->so_proto_props).sopp_tail;
5284 option = &snd_bufinfo;
5285 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5286 break;
5287 }
5288 }
5289 }
5290
5291 mutex_exit(&so->so_lock);
5292
5293 /* Send request */
5294 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5295 optmgmt_req.MGMT_flags = T_CHECK;
5296 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5297 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5298
5299 oh.level = level;
5300 oh.name = option_name;
5301 oh.len = maxlen;
5302
5303 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5304 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5305 /* Let option management work in the presence of data flow control */
5306 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5307 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5308 mp = NULL;
5309 mutex_enter(&so->so_lock);
5310 if (error) {
5311 eprintsoline(so, error);
5312 goto done2;
5313 }
5314 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5315 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5316 if (error) {
5317 if (option != NULL) {
5318 /* We have a fallback value */
5319 error = 0;
5320 goto copyout;
5321 }
5322 eprintsoline(so, error);
5323 goto done2;
5324 }
5325 ASSERT(mp);
5326 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5327 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5328 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5329 if (opt_res == NULL) {
5330 if (option != NULL) {
5331 /* We have a fallback value */
5332 error = 0;
5333 goto copyout;
5334 }
5335 error = EPROTO;
5336 eprintsoline(so, error);
5337 goto done;
5338 }
5339 option = &opt_res[1];
5340
5341 /* check to ensure that the option is within bounds */
5342 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5343 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5344 if (option != NULL) {
5345 /* We have a fallback value */
5346 error = 0;
5347 goto copyout;
5348 }
5349 error = EPROTO;
5350 eprintsoline(so, error);
5351 goto done;
5352 }
5353
5354 len = opt_res->len;
5355
5356 copyout: {
5357 t_uscalar_t size = MIN(len, maxlen);
5358 bcopy(option, optval, size);
5359 bcopy(&size, optlenp, sizeof (size));
5360 }
5361 done:
5362 freemsg(mp);
5363 done2:
5364 so_unlock_single(so, SOLOCKED);
5365 mutex_exit(&so->so_lock);
5366
5367 return (error);
5368 }
5369
5370 /*
5371 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5372 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5373 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5374 * setsockopt has to work even if the transport does not support the option.
5375 */
5376 /* ARGSUSED */
5377 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5378 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5379 const void *optval, t_uscalar_t optlen, struct cred *cr)
5380 {
5381 struct T_optmgmt_req optmgmt_req;
5382 struct opthdr oh;
5383 mblk_t *mp;
5384 int error = 0;
5385 boolean_t handled = B_FALSE;
5386
5387 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5388 (void *)so, level, option_name, optval, optlen,
5389 pr_state(so->so_state, so->so_mode)));
5390
5391 /* X/Open requires this check */
5392 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5393 if (xnet_check_print)
5394 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5395 return (EINVAL);
5396 }
5397
5398 mutex_enter(&so->so_lock);
5399 so_lock_single(so); /* Set SOLOCKED */
5400 mutex_exit(&so->so_lock);
5401
5402 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5403 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5404 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5405 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5406
5407 oh.level = level;
5408 oh.name = option_name;
5409 oh.len = optlen;
5410
5411 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5412 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5413 /* Let option management work in the presence of data flow control */
5414 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5415 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5416 mp = NULL;
5417 mutex_enter(&so->so_lock);
5418 if (error) {
5419 eprintsoline(so, error);
5420 goto done2;
5421 }
5422 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5423 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5424 if (error) {
5425 eprintsoline(so, error);
5426 goto done;
5427 }
5428 ASSERT(mp);
5429 /* No need to verify T_optmgmt_ack */
5430 freemsg(mp);
5431 done:
5432 /*
5433 * Check for SOL_SOCKET options and record their values.
5434 * If we know about a SOL_SOCKET parameter and the transport
5435 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5436 * EPROTO) we let the setsockopt succeed.
5437 */
5438 if (level == SOL_SOCKET) {
5439 /* Check parameters */
5440 switch (option_name) {
5441 case SO_DEBUG:
5442 case SO_REUSEADDR:
5443 case SO_KEEPALIVE:
5444 case SO_DONTROUTE:
5445 case SO_BROADCAST:
5446 case SO_USELOOPBACK:
5447 case SO_OOBINLINE:
5448 case SO_SNDBUF:
5449 case SO_RCVBUF:
5450 #ifdef notyet
5451 case SO_SNDLOWAT:
5452 case SO_RCVLOWAT:
5453 #endif /* notyet */
5454 case SO_DGRAM_ERRIND:
5455 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5456 error = EINVAL;
5457 eprintsoline(so, error);
5458 goto done2;
5459 }
5460 ASSERT(optval);
5461 handled = B_TRUE;
5462 break;
5463 case SO_SNDTIMEO:
5464 case SO_RCVTIMEO:
5465 if (get_udatamodel() == DATAMODEL_NONE ||
5466 get_udatamodel() == DATAMODEL_NATIVE) {
5467 if (optlen != sizeof (struct timeval)) {
5468 error = EINVAL;
5469 eprintsoline(so, error);
5470 goto done2;
5471 }
5472 } else {
5473 if (optlen != sizeof (struct timeval32)) {
5474 error = EINVAL;
5475 eprintsoline(so, error);
5476 goto done2;
5477 }
5478 }
5479 ASSERT(optval);
5480 handled = B_TRUE;
5481 break;
5482 case SO_LINGER:
5483 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5484 error = EINVAL;
5485 eprintsoline(so, error);
5486 goto done2;
5487 }
5488 ASSERT(optval);
5489 handled = B_TRUE;
5490 break;
5491 }
5492
5493 #define intvalue (*(int32_t *)optval)
5494
5495 switch (option_name) {
5496 case SO_TYPE:
5497 case SO_ERROR:
5498 case SO_ACCEPTCONN:
5499 /* Can't be set */
5500 error = ENOPROTOOPT;
5501 goto done2;
5502 case SO_LINGER: {
5503 struct linger *l = (struct linger *)optval;
5504
5505 so->so_linger.l_linger = l->l_linger;
5506 if (l->l_onoff) {
5507 so->so_linger.l_onoff = SO_LINGER;
5508 so->so_options |= SO_LINGER;
5509 } else {
5510 so->so_linger.l_onoff = 0;
5511 so->so_options &= ~SO_LINGER;
5512 }
5513 break;
5514 }
5515
5516 case SO_DEBUG:
5517 #ifdef SOCK_TEST
5518 if (intvalue & 2)
5519 sock_test_timelimit = 10 * hz;
5520 else
5521 sock_test_timelimit = 0;
5522
5523 if (intvalue & 4)
5524 do_useracc = 0;
5525 else
5526 do_useracc = 1;
5527 #endif /* SOCK_TEST */
5528 /* FALLTHRU */
5529 case SO_REUSEADDR:
5530 case SO_KEEPALIVE:
5531 case SO_DONTROUTE:
5532 case SO_BROADCAST:
5533 case SO_USELOOPBACK:
5534 case SO_OOBINLINE:
5535 case SO_DGRAM_ERRIND:
5536 if (intvalue != 0) {
5537 dprintso(so, 1,
5538 ("socket_setsockopt: setting 0x%x\n",
5539 option_name));
5540 so->so_options |= option_name;
5541 } else {
5542 dprintso(so, 1,
5543 ("socket_setsockopt: clearing 0x%x\n",
5544 option_name));
5545 so->so_options &= ~option_name;
5546 }
5547 break;
5548 /*
5549 * The following options are only returned by us when the
5550 * transport layer fails.
5551 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5552 * since the transport might adjust the value and not
5553 * return exactly what was set by the application.
5554 */
5555 case SO_SNDBUF:
5556 so->so_sndbuf = intvalue;
5557 break;
5558 case SO_RCVBUF:
5559 so->so_rcvbuf = intvalue;
5560 break;
5561 case SO_RCVPSH:
5562 so->so_rcv_timer_interval = intvalue;
5563 break;
5564 #ifdef notyet
5565 /*
5566 * We do not implement the semantics of these options
5567 * thus we shouldn't implement the options either.
5568 */
5569 case SO_SNDLOWAT:
5570 so->so_sndlowat = intvalue;
5571 break;
5572 case SO_RCVLOWAT:
5573 so->so_rcvlowat = intvalue;
5574 break;
5575 #endif /* notyet */
5576 case SO_SNDTIMEO:
5577 case SO_RCVTIMEO: {
5578 struct timeval tl;
5579 clock_t val;
5580
5581 if (get_udatamodel() == DATAMODEL_NONE ||
5582 get_udatamodel() == DATAMODEL_NATIVE)
5583 bcopy(&tl, (struct timeval *)optval,
5584 sizeof (struct timeval));
5585 else
5586 TIMEVAL32_TO_TIMEVAL(&tl,
5587 (struct timeval32 *)optval);
5588 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5589 if (option_name == SO_RCVTIMEO)
5590 so->so_rcvtimeo = drv_usectohz(val);
5591 else
5592 so->so_sndtimeo = drv_usectohz(val);
5593 break;
5594 }
5595 }
5596 #undef intvalue
5597
5598 if (error) {
5599 if ((error == ENOPROTOOPT || error == EPROTO ||
5600 error == EINVAL) && handled) {
5601 dprintso(so, 1,
5602 ("setsockopt: ignoring error %d for 0x%x\n",
5603 error, option_name));
5604 error = 0;
5605 }
5606 }
5607 }
5608 done2:
5609 so_unlock_single(so, SOLOCKED);
5610 mutex_exit(&so->so_lock);
5611 return (error);
5612 }
5613
5614 /*
5615 * sotpi_close() is called when the last open reference goes away.
5616 */
5617 /* ARGSUSED */
5618 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5619 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5620 {
5621 struct vnode *vp = SOTOV(so);
5622 dev_t dev;
5623 int error = 0;
5624 sotpi_info_t *sti = SOTOTPI(so);
5625
5626 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5627 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5628
5629 dev = sti->sti_dev;
5630
5631 ASSERT(STREAMSTAB(getmajor(dev)));
5632
5633 mutex_enter(&so->so_lock);
5634 so_lock_single(so); /* Set SOLOCKED */
5635
5636 ASSERT(so_verify_oobstate(so));
5637
5638 if (vp->v_stream != NULL) {
5639 vnode_t *ux_vp;
5640
5641 if (so->so_family == AF_UNIX) {
5642 /* Could avoid this when CANTSENDMORE for !dgram */
5643 so_unix_close(so);
5644 }
5645
5646 mutex_exit(&so->so_lock);
5647 /*
5648 * Disassemble the linkage from the AF_UNIX underlying file
5649 * system vnode to this socket (by atomically clearing
5650 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5651 * and frees the stream head.
5652 */
5653 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5654 ASSERT(ux_vp->v_stream);
5655 sti->sti_ux_bound_vp = NULL;
5656 vn_rele_stream(ux_vp);
5657 }
5658 error = strclose(vp, flag, cr);
5659 vp->v_stream = NULL;
5660 mutex_enter(&so->so_lock);
5661 }
5662
5663 /*
5664 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5665 */
5666 so_flush_discon_ind(so);
5667
5668 so_unlock_single(so, SOLOCKED);
5669 mutex_exit(&so->so_lock);
5670
5671 /*
5672 * Needed for STREAMs.
5673 * Decrement the device driver's reference count for streams
5674 * opened via the clone dip. The driver was held in clone_open().
5675 * The absence of clone_close() forces this asymmetry.
5676 */
5677 if (so->so_flag & SOCLONE)
5678 ddi_rele_driver(getmajor(dev));
5679
5680 return (error);
5681 }
5682
5683 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5684 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5685 struct cred *cr, int32_t *rvalp)
5686 {
5687 struct vnode *vp = SOTOV(so);
5688 sotpi_info_t *sti = SOTOTPI(so);
5689 int error = 0;
5690
5691 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5692 cmd, arg, pr_state(so->so_state, so->so_mode)));
5693
5694 switch (cmd) {
5695 case SIOCSQPTR:
5696 /*
5697 * SIOCSQPTR is valid only when helper stream is created
5698 * by the protocol.
5699 */
5700 case _I_INSERT:
5701 case _I_REMOVE:
5702 /*
5703 * Since there's no compelling reason to support these ioctls
5704 * on sockets, and doing so would increase the complexity
5705 * markedly, prevent it.
5706 */
5707 return (EOPNOTSUPP);
5708
5709 case I_FIND:
5710 case I_LIST:
5711 case I_LOOK:
5712 case I_POP:
5713 case I_PUSH:
5714 /*
5715 * To prevent races and inconsistencies between the actual
5716 * state of the stream and the state according to the sonode,
5717 * we serialize all operations which modify or operate on the
5718 * list of modules on the socket's stream.
5719 */
5720 mutex_enter(&sti->sti_plumb_lock);
5721 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5722 mutex_exit(&sti->sti_plumb_lock);
5723 return (error);
5724
5725 default:
5726 if (so->so_version != SOV_STREAM)
5727 break;
5728
5729 /*
5730 * The imaginary "sockmod" has been popped; act as a stream.
5731 */
5732 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5733 }
5734
5735 ASSERT(so->so_version != SOV_STREAM);
5736
5737 /*
5738 * Process socket-specific ioctls.
5739 */
5740 switch (cmd) {
5741 case FIONBIO: {
5742 int32_t value;
5743
5744 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5745 (mode & (int)FKIOCTL)))
5746 return (EFAULT);
5747
5748 mutex_enter(&so->so_lock);
5749 if (value) {
5750 so->so_state |= SS_NDELAY;
5751 } else {
5752 so->so_state &= ~SS_NDELAY;
5753 }
5754 mutex_exit(&so->so_lock);
5755 return (0);
5756 }
5757
5758 case FIOASYNC: {
5759 int32_t value;
5760
5761 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5762 (mode & (int)FKIOCTL)))
5763 return (EFAULT);
5764
5765 mutex_enter(&so->so_lock);
5766 /*
5767 * SS_ASYNC flag not already set correctly?
5768 * (!value != !(so->so_state & SS_ASYNC))
5769 * but some engineers find that too hard to read.
5770 */
5771 if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5772 (value != 0 && (so->so_state & SS_ASYNC) == 0))
5773 error = so_flip_async(so, vp, mode, cr);
5774 mutex_exit(&so->so_lock);
5775 return (error);
5776 }
5777
5778 case SIOCSPGRP:
5779 case FIOSETOWN: {
5780 pid_t pgrp;
5781
5782 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5783 (mode & (int)FKIOCTL)))
5784 return (EFAULT);
5785
5786 mutex_enter(&so->so_lock);
5787 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5788 /* Any change? */
5789 if (pgrp != so->so_pgrp)
5790 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5791 mutex_exit(&so->so_lock);
5792 return (error);
5793 }
5794 case SIOCGPGRP:
5795 case FIOGETOWN:
5796 if (so_copyout(&so->so_pgrp, (void *)arg,
5797 sizeof (pid_t), (mode & (int)FKIOCTL)))
5798 return (EFAULT);
5799 return (0);
5800
5801 case SIOCATMARK: {
5802 int retval;
5803 uint_t so_state;
5804
5805 /*
5806 * strwaitmark has a finite timeout after which it
5807 * returns -1 if the mark state is undetermined.
5808 * In order to avoid any race between the mark state
5809 * in sockfs and the mark state in the stream head this
5810 * routine loops until the mark state can be determined
5811 * (or the urgent data indication has been removed by some
5812 * other thread).
5813 */
5814 do {
5815 mutex_enter(&so->so_lock);
5816 so_state = so->so_state;
5817 mutex_exit(&so->so_lock);
5818 if (so_state & SS_RCVATMARK) {
5819 retval = 1;
5820 } else if (!(so_state & SS_OOBPEND)) {
5821 /*
5822 * No SIGURG has been generated -- there is no
5823 * pending or present urgent data. Thus can't
5824 * possibly be at the mark.
5825 */
5826 retval = 0;
5827 } else {
5828 /*
5829 * Have the stream head wait until there is
5830 * either some messages on the read queue, or
5831 * STRATMARK or STRNOTATMARK gets set. The
5832 * STRNOTATMARK flag is used so that the
5833 * transport can send up a MSGNOTMARKNEXT
5834 * M_DATA to indicate that it is not
5835 * at the mark and additional data is not about
5836 * to be send upstream.
5837 *
5838 * If the mark state is undetermined this will
5839 * return -1 and we will loop rechecking the
5840 * socket state.
5841 */
5842 retval = strwaitmark(vp);
5843 }
5844 } while (retval == -1);
5845
5846 if (so_copyout(&retval, (void *)arg, sizeof (int),
5847 (mode & (int)FKIOCTL)))
5848 return (EFAULT);
5849 return (0);
5850 }
5851
5852 case I_FDINSERT:
5853 case I_SENDFD:
5854 case I_RECVFD:
5855 case I_ATMARK:
5856 case _SIOCSOCKFALLBACK:
5857 /*
5858 * These ioctls do not apply to sockets. I_FDINSERT can be
5859 * used to send M_PROTO messages without modifying the socket
5860 * state. I_SENDFD/RECVFD should not be used for socket file
5861 * descriptor passing since they assume a twisted stream.
5862 * SIOCATMARK must be used instead of I_ATMARK.
5863 *
5864 * _SIOCSOCKFALLBACK from an application should never be
5865 * processed. It is only generated by socktpi_open() or
5866 * in response to I_POP or I_PUSH.
5867 */
5868 #ifdef DEBUG
5869 zcmn_err(getzoneid(), CE_WARN,
5870 "Unsupported STREAMS ioctl 0x%x on socket. "
5871 "Pid = %d\n", cmd, curproc->p_pid);
5872 #endif /* DEBUG */
5873 return (EOPNOTSUPP);
5874
5875 case _I_GETPEERCRED:
5876 if ((mode & FKIOCTL) == 0)
5877 return (EINVAL);
5878
5879 mutex_enter(&so->so_lock);
5880 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5881 error = ENOTSUP;
5882 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
5883 error = ENOTCONN;
5884 } else if (so->so_peercred != NULL) {
5885 k_peercred_t *kp = (k_peercred_t *)arg;
5886 kp->pc_cr = so->so_peercred;
5887 kp->pc_cpid = so->so_cpid;
5888 crhold(so->so_peercred);
5889 } else {
5890 error = EINVAL;
5891 }
5892 mutex_exit(&so->so_lock);
5893 return (error);
5894
5895 default:
5896 /*
5897 * Do the higher-order bits of the ioctl cmd indicate
5898 * that it is an I_* streams ioctl?
5899 */
5900 if ((cmd & 0xffffff00U) == STR &&
5901 so->so_version == SOV_SOCKBSD) {
5902 #ifdef DEBUG
5903 zcmn_err(getzoneid(), CE_WARN,
5904 "Unsupported STREAMS ioctl 0x%x on socket. "
5905 "Pid = %d\n", cmd, curproc->p_pid);
5906 #endif /* DEBUG */
5907 return (EOPNOTSUPP);
5908 }
5909 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5910 }
5911 }
5912
5913 /*
5914 * Handle plumbing-related ioctls.
5915 */
5916 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5917 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5918 struct cred *cr, int32_t *rvalp)
5919 {
5920 static const char sockmod_name[] = "sockmod";
5921 struct sonode *so = VTOSO(vp);
5922 char mname[FMNAMESZ + 1];
5923 int error;
5924 sotpi_info_t *sti = SOTOTPI(so);
5925
5926 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5927
5928 if (so->so_version == SOV_SOCKBSD)
5929 return (EOPNOTSUPP);
5930
5931 if (so->so_version == SOV_STREAM) {
5932 /*
5933 * The imaginary "sockmod" has been popped - act as a stream.
5934 * If this is a push of sockmod then change back to a socket.
5935 */
5936 if (cmd == I_PUSH) {
5937 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5938 (void *)arg, mname, sizeof (mname), NULL);
5939
5940 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5941 dprintso(so, 0, ("socktpi_ioctl: going to "
5942 "socket version\n"));
5943 so_stream2sock(so);
5944 return (0);
5945 }
5946 }
5947 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5948 }
5949
5950 switch (cmd) {
5951 case I_PUSH:
5952 if (sti->sti_direct) {
5953 mutex_enter(&so->so_lock);
5954 so_lock_single(so);
5955 mutex_exit(&so->so_lock);
5956
5957 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5958 cr, rvalp);
5959
5960 mutex_enter(&so->so_lock);
5961 if (error == 0)
5962 sti->sti_direct = 0;
5963 so_unlock_single(so, SOLOCKED);
5964 mutex_exit(&so->so_lock);
5965
5966 if (error != 0)
5967 return (error);
5968 }
5969
5970 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5971 if (error == 0)
5972 sti->sti_pushcnt++;
5973 return (error);
5974
5975 case I_POP:
5976 if (sti->sti_pushcnt == 0) {
5977 /* Emulate sockmod being popped */
5978 dprintso(so, 0,
5979 ("socktpi_ioctl: going to STREAMS version\n"));
5980 return (so_sock2stream(so));
5981 }
5982
5983 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5984 if (error == 0)
5985 sti->sti_pushcnt--;
5986 return (error);
5987
5988 case I_LIST: {
5989 struct str_mlist *kmlistp, *umlistp;
5990 struct str_list kstrlist;
5991 ssize_t kstrlistsize;
5992 int i, nmods;
5993
5994 STRUCT_DECL(str_list, ustrlist);
5995 STRUCT_INIT(ustrlist, mode);
5996
5997 if (arg == 0) {
5998 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5999 if (error == 0)
6000 (*rvalp)++; /* Add one for sockmod */
6001 return (error);
6002 }
6003
6004 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6005 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6006 if (error != 0)
6007 return (error);
6008
6009 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6010 if (nmods <= 0)
6011 return (EINVAL);
6012 /*
6013 * Ceiling nmods at nstrpush to prevent someone from
6014 * maliciously consuming lots of kernel memory.
6015 */
6016 nmods = MIN(nmods, nstrpush);
6017
6018 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6019 kstrlist.sl_nmods = nmods;
6020 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6021
6022 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6023 cr, rvalp);
6024 if (error != 0)
6025 goto done;
6026
6027 /*
6028 * Considering the module list as a 0-based array of sl_nmods
6029 * modules, sockmod should conceptually exist at slot
6030 * sti_pushcnt. Insert sockmod at this location by sliding all
6031 * of the module names after so_pushcnt over by one. We know
6032 * that there will be room to do this since we allocated
6033 * sl_modlist with an additional slot.
6034 */
6035 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6036 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6037
6038 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6039 kstrlist.sl_nmods++;
6040
6041 /*
6042 * Copy all of the entries out to ustrlist.
6043 */
6044 kmlistp = kstrlist.sl_modlist;
6045 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6046 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6047 error = so_copyout(kmlistp++, umlistp++,
6048 sizeof (struct str_mlist), mode & FKIOCTL);
6049 if (error != 0)
6050 goto done;
6051 }
6052
6053 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6054 mode & FKIOCTL);
6055 if (error == 0)
6056 *rvalp = 0;
6057 done:
6058 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6059 return (error);
6060 }
6061 case I_LOOK:
6062 if (sti->sti_pushcnt == 0) {
6063 return (so_copyout(sockmod_name, (void *)arg,
6064 sizeof (sockmod_name), mode & FKIOCTL));
6065 }
6066 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6067
6068 case I_FIND:
6069 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6070 if (error && error != EINVAL)
6071 return (error);
6072
6073 /* if not found and string was sockmod return 1 */
6074 if (*rvalp == 0 || error == EINVAL) {
6075 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6076 (void *)arg, mname, sizeof (mname), NULL);
6077 if (error == ENAMETOOLONG)
6078 error = EINVAL;
6079
6080 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6081 *rvalp = 1;
6082 }
6083 return (error);
6084
6085 default:
6086 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6087 break;
6088 }
6089
6090 return (0);
6091 }
6092
6093 /*
6094 * Wrapper around the streams poll routine that implements socket poll
6095 * semantics.
6096 * The sockfs never calls pollwakeup itself - the stream head take care
6097 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6098 * stream head there can never be a deadlock due to holding so_lock across
6099 * pollwakeup and acquiring so_lock in this routine.
6100 *
6101 * However, since the performance of VOP_POLL is critical we avoid
6102 * acquiring so_lock here. This is based on two assumptions:
6103 * - The poll implementation holds locks to serialize the VOP_POLL call
6104 * and a pollwakeup for the same pollhead. This ensures that should
6105 * e.g. so_state change during a socktpi_poll call the pollwakeup
6106 * (which strsock_* and strrput conspire to issue) is issued after
6107 * the state change. Thus the pollwakeup will block until VOP_POLL has
6108 * returned and then wake up poll and have it call VOP_POLL again.
6109 * - The reading of so_state without holding so_lock does not result in
6110 * stale data that is older than the latest state change that has dropped
6111 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6112 * memory barrier to force the data into the coherency domain.
6113 */
6114 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6115 sotpi_poll(
6116 struct sonode *so,
6117 short events,
6118 int anyyet,
6119 short *reventsp,
6120 struct pollhead **phpp)
6121 {
6122 short origevents = events;
6123 struct vnode *vp = SOTOV(so);
6124 int error;
6125 int so_state = so->so_state; /* snapshot */
6126 sotpi_info_t *sti = SOTOTPI(so);
6127
6128 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6129 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6130
6131 ASSERT(vp->v_type == VSOCK);
6132 ASSERT(vp->v_stream != NULL);
6133
6134 if (so->so_version == SOV_STREAM) {
6135 /* The imaginary "sockmod" has been popped - act as a stream */
6136 return (strpoll(vp->v_stream, events, anyyet,
6137 reventsp, phpp));
6138 }
6139
6140 if (!(so_state & SS_ISCONNECTED) &&
6141 (so->so_mode & SM_CONNREQUIRED)) {
6142 /* Not connected yet - turn off write side events */
6143 events &= ~(POLLOUT|POLLWRBAND);
6144 }
6145 /*
6146 * Check for errors without calling strpoll if the caller wants them.
6147 * In sockets the errors are represented as input/output events
6148 * and there is no need to ask the stream head for this information.
6149 */
6150 if (so->so_error != 0 &&
6151 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6152 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6153 return (0);
6154 }
6155 /*
6156 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6157 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6158 * will not trigger a POLLIN event with POLLRDDATA set.
6159 * The handling of urgent data (causing POLLRDBAND) is done by
6160 * inspecting SS_OOBPEND below.
6161 */
6162 events |= POLLRDDATA;
6163
6164 /*
6165 * After shutdown(output) a stream head write error is set.
6166 * However, we should not return output events.
6167 */
6168 events |= POLLNOERR;
6169 error = strpoll(vp->v_stream, events, anyyet,
6170 reventsp, phpp);
6171 if (error)
6172 return (error);
6173
6174 ASSERT(!(*reventsp & POLLERR));
6175
6176 /*
6177 * Notes on T_CONN_IND handling for sockets.
6178 *
6179 * If strpoll() returned without events, SR_POLLIN is guaranteed
6180 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6181 *
6182 * Since the so_lock is not held, soqueueconnind() may have run
6183 * and a T_CONN_IND may be waiting. We now check for any queued
6184 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6185 * to ensure poll returns.
6186 *
6187 * However:
6188 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6189 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6190 * the following actions will occur; taken together they ensure the
6191 * syscall will return.
6192 *
6193 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6194 * the accept() was run on a non-blocking socket sowaitconnind()
6195 * may have already returned EWOULDBLOCK, so not be waiting to
6196 * process the message. Additionally socktpi_poll() has probably
6197 * proceeded past the sti_conn_ind_head check below.
6198 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6199 * this thread, however that could occur before poll_common()
6200 * has entered cv_wait.
6201 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6202 *
6203 * Before proceeding to cv_wait() in poll_common() for an event,
6204 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6205 * and if set, re-calls strpoll() to ensure the late arriving
6206 * T_CONN_IND is recognized, and pollsys() returns.
6207 */
6208
6209 if (sti->sti_conn_ind_head != NULL)
6210 *reventsp |= (POLLIN|POLLRDNORM) & events;
6211
6212 if (so->so_state & SS_CANTRCVMORE) {
6213 *reventsp |= POLLRDHUP & events;
6214
6215 if (so->so_state & SS_CANTSENDMORE)
6216 *reventsp |= POLLHUP;
6217 }
6218
6219 if (so->so_state & SS_OOBPEND)
6220 *reventsp |= POLLRDBAND & events;
6221
6222 return (0);
6223 }
6224
6225 /*ARGSUSED*/
6226 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6227 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6228 {
6229 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6230 int error = 0;
6231
6232 error = sonode_constructor(buf, cdrarg, kmflags);
6233 if (error != 0)
6234 return (error);
6235
6236 error = i_sotpi_info_constructor(&st->st_info);
6237 if (error != 0)
6238 sonode_destructor(buf, cdrarg);
6239
6240 st->st_sonode.so_priv = &st->st_info;
6241
6242 return (error);
6243 }
6244
6245 /*ARGSUSED1*/
6246 static void
socktpi_destructor(void * buf,void * cdrarg)6247 socktpi_destructor(void *buf, void *cdrarg)
6248 {
6249 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6250
6251 ASSERT(st->st_sonode.so_priv == &st->st_info);
6252 st->st_sonode.so_priv = NULL;
6253
6254 i_sotpi_info_destructor(&st->st_info);
6255 sonode_destructor(buf, cdrarg);
6256 }
6257
6258 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6259 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6260 {
6261 int retval;
6262
6263 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6264 struct sonode *so = (struct sonode *)buf;
6265 sotpi_info_t *sti = SOTOTPI(so);
6266
6267 mutex_enter(&socklist.sl_lock);
6268
6269 sti->sti_next_so = socklist.sl_list;
6270 sti->sti_prev_so = NULL;
6271 if (sti->sti_next_so != NULL)
6272 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6273 socklist.sl_list = so;
6274
6275 mutex_exit(&socklist.sl_lock);
6276
6277 }
6278 return (retval);
6279 }
6280
6281 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6282 socktpi_unix_destructor(void *buf, void *cdrarg)
6283 {
6284 struct sonode *so = (struct sonode *)buf;
6285 sotpi_info_t *sti = SOTOTPI(so);
6286
6287 mutex_enter(&socklist.sl_lock);
6288
6289 if (sti->sti_next_so != NULL)
6290 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6291 if (sti->sti_prev_so != NULL)
6292 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6293 else
6294 socklist.sl_list = sti->sti_next_so;
6295
6296 mutex_exit(&socklist.sl_lock);
6297
6298 socktpi_destructor(buf, cdrarg);
6299 }
6300
6301 int
socktpi_init(void)6302 socktpi_init(void)
6303 {
6304 /*
6305 * Create sonode caches. We create a special one for AF_UNIX so
6306 * that we can track them for netstat(8).
6307 */
6308 socktpi_cache = kmem_cache_create("socktpi_cache",
6309 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6310 socktpi_destructor, NULL, NULL, NULL, 0);
6311
6312 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6313 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6314 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6315
6316 return (0);
6317 }
6318
6319 /*
6320 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6321 *
6322 * Caller must still update state and mode using sotpi_update_state().
6323 */
6324 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6325 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6326 boolean_t *direct, queue_t **qp, struct cred *cr)
6327 {
6328 sotpi_info_t *sti;
6329 struct sockparams *origsp = so->so_sockparams;
6330 sock_lower_handle_t handle = so->so_proto_handle;
6331 struct stdata *stp;
6332 struct vnode *vp;
6333 queue_t *q;
6334 int error = 0;
6335
6336 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6337 SS_FALLBACK_PENDING);
6338 ASSERT(SOCK_IS_NONSTR(so));
6339
6340 *qp = NULL;
6341 *direct = B_FALSE;
6342 so->so_sockparams = newsp;
6343 /*
6344 * Allocate and initalize fields required by TPI.
6345 */
6346 (void) sotpi_info_create(so, KM_SLEEP);
6347 sotpi_info_init(so);
6348
6349 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6350 sotpi_info_fini(so);
6351 sotpi_info_destroy(so);
6352 return (error);
6353 }
6354 ASSERT(handle == so->so_proto_handle);
6355 sti = SOTOTPI(so);
6356 if (sti->sti_direct != 0)
6357 *direct = B_TRUE;
6358
6359 /*
6360 * Keep the original sp around so we can properly dispose of the
6361 * sonode when the socket is being closed.
6362 */
6363 sti->sti_orig_sp = origsp;
6364
6365 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6366 so_alloc_addr(so, so->so_max_addr_len);
6367
6368 /*
6369 * If the application has done a SIOCSPGRP, make sure the
6370 * STREAM head is aware. This needs to take place before
6371 * the protocol start sending up messages. Otherwise we
6372 * might miss to generate SIGPOLL.
6373 *
6374 * It is possible that the application will receive duplicate
6375 * signals if some were already generated for either data or
6376 * connection indications.
6377 */
6378 if (so->so_pgrp != 0) {
6379 if (so_set_events(so, so->so_vnode, cr) != 0)
6380 so->so_pgrp = 0;
6381 }
6382
6383 /*
6384 * Determine which queue to use.
6385 */
6386 vp = SOTOV(so);
6387 stp = vp->v_stream;
6388 ASSERT(stp != NULL);
6389 q = stp->sd_wrq->q_next;
6390
6391 /*
6392 * Skip any modules that may have been auto pushed when the device
6393 * was opened
6394 */
6395 while (q->q_next != NULL)
6396 q = q->q_next;
6397 *qp = _RD(q);
6398
6399 /* This is now a STREAMS sockets */
6400 so->so_not_str = B_FALSE;
6401
6402 return (error);
6403 }
6404
6405 /*
6406 * Revert a TPI sonode. It is only allowed to revert the sonode during
6407 * the fallback process.
6408 */
6409 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6410 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6411 {
6412 vnode_t *vp = SOTOV(so);
6413
6414 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6415 SS_FALLBACK_PENDING);
6416 ASSERT(!SOCK_IS_NONSTR(so));
6417 ASSERT(vp->v_stream != NULL);
6418
6419 strclean(vp);
6420 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6421
6422 /*
6423 * Restore the original sockparams. The caller is responsible for
6424 * dropping the ref to the new sp.
6425 */
6426 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6427
6428 sotpi_info_fini(so);
6429 sotpi_info_destroy(so);
6430
6431 /* This is no longer a STREAMS sockets */
6432 so->so_not_str = B_TRUE;
6433 }
6434
6435 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6436 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6437 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6438 socklen_t faddrlen, short opts)
6439 {
6440 sotpi_info_t *sti = SOTOTPI(so);
6441
6442 so_proc_tcapability_ack(so, tcap);
6443
6444 so->so_options |= opts;
6445
6446 /*
6447 * Determine whether the foreign and local address are valid
6448 */
6449 if (laddrlen != 0) {
6450 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6451 sti->sti_laddr_len = laddrlen;
6452 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6453 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6454 }
6455
6456 if (faddrlen != 0) {
6457 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6458 sti->sti_faddr_len = faddrlen;
6459 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6460 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6461 }
6462
6463 }
6464
6465 /*
6466 * Allocate enough space to cache the local and foreign addresses.
6467 */
6468 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6469 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6470 {
6471 sotpi_info_t *sti = SOTOTPI(so);
6472
6473 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6474 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6475 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6476 P2ROUNDUP(maxlen, KMEM_ALIGN);
6477 so->so_max_addr_len = sti->sti_laddr_maxlen;
6478 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6479 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6480 + sti->sti_laddr_maxlen);
6481
6482 if (so->so_family == AF_UNIX) {
6483 /*
6484 * Initialize AF_UNIX related fields.
6485 */
6486 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6487 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6488 }
6489 }
6490
6491
6492 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6493 sotpi_sototpi(struct sonode *so)
6494 {
6495 sotpi_info_t *sti;
6496
6497 ASSERT(so != NULL);
6498
6499 sti = (sotpi_info_t *)so->so_priv;
6500
6501 ASSERT(sti != NULL);
6502 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6503
6504 return (sti);
6505 }
6506
6507 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6508 i_sotpi_info_constructor(sotpi_info_t *sti)
6509 {
6510 sti->sti_magic = SOTPI_INFO_MAGIC;
6511 sti->sti_ack_mp = NULL;
6512 sti->sti_discon_ind_mp = NULL;
6513 sti->sti_ux_bound_vp = NULL;
6514 sti->sti_unbind_mp = NULL;
6515
6516 sti->sti_conn_ind_head = NULL;
6517 sti->sti_conn_ind_tail = NULL;
6518
6519 sti->sti_laddr_sa = NULL;
6520 sti->sti_faddr_sa = NULL;
6521
6522 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6523 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6524
6525 return (0);
6526 }
6527
6528 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6529 i_sotpi_info_destructor(sotpi_info_t *sti)
6530 {
6531 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6532 ASSERT(sti->sti_ack_mp == NULL);
6533 ASSERT(sti->sti_discon_ind_mp == NULL);
6534 ASSERT(sti->sti_ux_bound_vp == NULL);
6535 ASSERT(sti->sti_unbind_mp == NULL);
6536
6537 ASSERT(sti->sti_conn_ind_head == NULL);
6538 ASSERT(sti->sti_conn_ind_tail == NULL);
6539
6540 ASSERT(sti->sti_laddr_sa == NULL);
6541 ASSERT(sti->sti_faddr_sa == NULL);
6542
6543 mutex_destroy(&sti->sti_plumb_lock);
6544 cv_destroy(&sti->sti_ack_cv);
6545 }
6546
6547 /*
6548 * Creates and attaches TPI information to the given sonode
6549 */
6550 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6551 sotpi_info_create(struct sonode *so, int kmflags)
6552 {
6553 sotpi_info_t *sti;
6554
6555 ASSERT(so->so_priv == NULL);
6556
6557 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6558 return (B_FALSE);
6559
6560 if (i_sotpi_info_constructor(sti) != 0) {
6561 kmem_free(sti, sizeof (*sti));
6562 return (B_FALSE);
6563 }
6564
6565 so->so_priv = (void *)sti;
6566 return (B_TRUE);
6567 }
6568
6569 /*
6570 * Initializes the TPI information.
6571 */
6572 static void
sotpi_info_init(struct sonode * so)6573 sotpi_info_init(struct sonode *so)
6574 {
6575 struct vnode *vp = SOTOV(so);
6576 sotpi_info_t *sti = SOTOTPI(so);
6577 time_t now;
6578
6579 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6580 vp->v_rdev = sti->sti_dev;
6581
6582 sti->sti_orig_sp = NULL;
6583
6584 sti->sti_pushcnt = 0;
6585
6586 now = gethrestime_sec();
6587 sti->sti_atime = now;
6588 sti->sti_mtime = now;
6589 sti->sti_ctime = now;
6590
6591 sti->sti_eaddr_mp = NULL;
6592 sti->sti_delayed_error = 0;
6593
6594 sti->sti_provinfo = NULL;
6595
6596 sti->sti_oobcnt = 0;
6597 sti->sti_oobsigcnt = 0;
6598
6599 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6600
6601 sti->sti_laddr_sa = 0;
6602 sti->sti_faddr_sa = 0;
6603 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6604 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6605
6606 sti->sti_laddr_valid = 0;
6607 sti->sti_faddr_valid = 0;
6608 sti->sti_faddr_noxlate = 0;
6609
6610 sti->sti_direct = 0;
6611
6612 ASSERT(sti->sti_ack_mp == NULL);
6613 ASSERT(sti->sti_ux_bound_vp == NULL);
6614 ASSERT(sti->sti_unbind_mp == NULL);
6615
6616 ASSERT(sti->sti_conn_ind_head == NULL);
6617 ASSERT(sti->sti_conn_ind_tail == NULL);
6618 }
6619
6620 /*
6621 * Given a sonode, grab the TPI info and free any data.
6622 */
6623 static void
sotpi_info_fini(struct sonode * so)6624 sotpi_info_fini(struct sonode *so)
6625 {
6626 sotpi_info_t *sti = SOTOTPI(so);
6627 mblk_t *mp;
6628
6629 ASSERT(sti->sti_discon_ind_mp == NULL);
6630
6631 if ((mp = sti->sti_conn_ind_head) != NULL) {
6632 mblk_t *mp1;
6633
6634 while (mp) {
6635 mp1 = mp->b_next;
6636 mp->b_next = NULL;
6637 freemsg(mp);
6638 mp = mp1;
6639 }
6640 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6641 }
6642
6643 /*
6644 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6645 * indirect them. It also uses so_count as a validity test.
6646 */
6647 mutex_enter(&so->so_lock);
6648
6649 if (sti->sti_laddr_sa) {
6650 ASSERT((caddr_t)sti->sti_faddr_sa ==
6651 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6652 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6653 sti->sti_laddr_valid = 0;
6654 sti->sti_faddr_valid = 0;
6655 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6656 sti->sti_laddr_sa = NULL;
6657 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6658 sti->sti_faddr_sa = NULL;
6659 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6660 }
6661
6662 mutex_exit(&so->so_lock);
6663
6664 if ((mp = sti->sti_eaddr_mp) != NULL) {
6665 freemsg(mp);
6666 sti->sti_eaddr_mp = NULL;
6667 sti->sti_delayed_error = 0;
6668 }
6669
6670 if ((mp = sti->sti_ack_mp) != NULL) {
6671 freemsg(mp);
6672 sti->sti_ack_mp = NULL;
6673 }
6674
6675 ASSERT(sti->sti_ux_bound_vp == NULL);
6676 if ((mp = sti->sti_unbind_mp) != NULL) {
6677 freemsg(mp);
6678 sti->sti_unbind_mp = NULL;
6679 }
6680 }
6681
6682 /*
6683 * Destroys the TPI information attached to a sonode.
6684 */
6685 static void
sotpi_info_destroy(struct sonode * so)6686 sotpi_info_destroy(struct sonode *so)
6687 {
6688 sotpi_info_t *sti = SOTOTPI(so);
6689
6690 i_sotpi_info_destructor(sti);
6691 kmem_free(sti, sizeof (*sti));
6692
6693 so->so_priv = NULL;
6694 }
6695
6696 /*
6697 * Create the global sotpi socket module entry. It will never be freed.
6698 */
6699 smod_info_t *
sotpi_smod_create(void)6700 sotpi_smod_create(void)
6701 {
6702 smod_info_t *smodp;
6703
6704 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6705 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6706 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6707 /*
6708 * Initialize the smod_refcnt to 1 so it will never be freed.
6709 */
6710 smodp->smod_refcnt = 1;
6711 smodp->smod_uc_version = SOCK_UC_VERSION;
6712 smodp->smod_dc_version = SOCK_DC_VERSION;
6713 smodp->smod_sock_create_func = &sotpi_create;
6714 smodp->smod_sock_destroy_func = &sotpi_destroy;
6715 return (smodp);
6716 }
6717