xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 7d64f41b87275bdc41b1f4cddb0fe3d951ef64bd)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vfs.h>
#include <sys/policy.h>
#include <sys/modctl.h>

#include <sys/sunddi.h>

#include <sys/strsun.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sodirect.h>
#include <sys/uio.h>

#include <inet/ipclassifier.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/socktpi.h>
#include <inet/ip.h>

extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;

static struct kmem_cache *sock_sod_cache;

/*
 * Common socket access functions.
 *
 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
 * the socket_xxx() functions should be used.
 */
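
/*
 * For example, an in-kernel consumer would typically go through this
 * layer as follows (illustrative sketch only; "laddr" and "laddrlen"
 * are hypothetical):
 *
 *	int error;
 *	struct sonode *so;
 *
 *	so = socket_create(AF_INET, SOCK_STREAM, 0, NULL, NULL,
 *	    SOCKET_SLEEP, SOV_DEFAULT, kcred, &error);
 *	if (so == NULL)
 *		return (error);
 *	error = socket_bind(so, laddr, laddrlen, 0, kcred);
 */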

/*
 * Try to create a new sonode of the requested <family, type, protocol>.
 */
/* ARGSUSED */
struct sonode *
socket_create(int family, int type, int protocol, char *devpath, char *mod,
    int flags, int version, struct cred *cr, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp = NULL;
	int saved_error;

	/*
	 * Look for a sockparams entry that matches the given criteria.
	 * solookup() returns with the entry held.
	 */
	*errorp = solookup(family, type, protocol, &sp);
	saved_error = *errorp;
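	/*
	 * The result of the exact <family, type, protocol> lookup is saved
	 * so that, should the fallback lookups below fail with a generic
	 * protocol error, the error from the original, more specific
	 * request is the one reported.
	 */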
	if (sp == NULL) {
		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
		/*
		 * There is no matching sockparams entry. An ephemeral entry is
		 * created if the caller specifies a device or a socket module.
		 */
		if (devpath != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bydev(family, type,
			    protocol, devpath, kmflags, errorp);
		} else if (mod != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bymod(family, type,
			    protocol, mod, kmflags, errorp);
		} else {
			*errorp = solookup(family, type, 0, &sp);
		}

		if (sp == NULL) {
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			return (NULL);
		}
	}

	ASSERT(sp->sp_smod_info != NULL);
	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
	sp->sp_stats.sps_ncreate.value.ui64++;
	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
	    protocol, version, flags, errorp, cr);
	if (so == NULL) {
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			socket_destroy(so);
			so = NULL;
		}
	}
	return (so);
}

struct sonode *
socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
    sock_downcalls_t *dc, int flags, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp;
	struct cred *cr;

	/*
	 * This function may be called in interrupt context, in which case
	 * CRED() is NULL; fall back to kcred.
	 */
	if ((cr = CRED()) == NULL)
		cr = kcred;

	sp = parent->so_sockparams;
	ASSERT(sp != NULL);

	sp->sp_stats.sps_ncreate.value.ui64++;
	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
	    parent->so_type, parent->so_protocol, parent->so_version, flags,
	    errorp, cr);
	if (so != NULL) {
		SOCKPARAMS_INC_REF(sp);

		so->so_proto_handle = lh;
		so->so_downcalls = dc;
		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			socket_destroy(so);
			so = NULL;
		}
	}

	return (so);
}

/*
 * Bind local endpoint.
 */
int
socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, cred_t *cr)
{
	return (SOP_BIND(so, name, namelen, flags, cr));
}

/*
 * Turn socket into a listen socket.
 */
int
socket_listen(struct sonode *so, int backlog, cred_t *cr)
{
	if (backlog < 0) {
		backlog = 0;
	}

	/*
	 * Use the same qlimit as in BSD. BSD checks the qlimit
	 * before queuing the next connection implying that a
	 * listen(sock, 0) allows one connection to be queued.
	 * BSD also uses 1.5 times the requested backlog.
	 *
	 * XNS Issue 4 required a strict interpretation of the backlog.
	 * This has been waived subsequently for Issue 4 and the change
	 * incorporated in XNS Issue 5. So we aren't required to do
	 * anything special for XPG apps.
	 */
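	/*
	 * For example (illustrative values): a backlog of 0 yields a
	 * qlimit of 1, a backlog of 4 yields 4 * 3 / 2 + 1 = 7, and any
	 * backlog at or above (INT_MAX - 1) / 3 is clamped to INT_MAX so
	 * that the scaling below cannot overflow.
	 */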
	if (backlog >= (INT_MAX - 1) / 3)
		backlog = INT_MAX;
	else
		backlog = backlog * 3 / 2 + 1;

	return (SOP_LISTEN(so, backlog, cr));
}

/*
 * Accept incoming connection.
 */
int
socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
{
	return (SOP_ACCEPT(lso, fflag, cr, nsop));
}

/*
 * Active open.
 */
int
socket_connect(struct sonode *so, const struct sockaddr *name,
    socklen_t namelen, int fflag, int flags, cred_t *cr)
{
	int error;

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}
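
	/*
	 * This corresponds to the user-level idiom (a sketch, not part of
	 * this file):
	 *
	 *	struct sockaddr sa;
	 *
	 *	bzero(&sa, sizeof (sa));
	 *	sa.sa_family = AF_UNSPEC;
	 *	(void) connect(sock, &sa, sizeof (sa));
	 */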

	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);

	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
		/*
		 * The X/Open specification requires that ENETUNREACH be
		 * returned but does not require EHOSTUNREACH. In order to
		 * keep the test suite happy we mess with the errno here.
		 */
		error = ENETUNREACH;
	}

	return (error);
}

/*
 * Get address of remote node.
 */
int
socket_getpeername(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, boolean_t accept, cred_t *cr)
{
	ASSERT(*addrlen > 0);
	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
}

/*
 * Get local address.
 */
int
socket_getsockname(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, cred_t *cr)
{
	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
}

/*
 * Called from shutdown().
 */
int
socket_shutdown(struct sonode *so, int how, cred_t *cr)
{
	return (SOP_SHUTDOWN(so, how, cr));
}

/*
 * Get socket options.
 */
/*ARGSUSED*/
int
socket_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
{
	return (SOP_GETSOCKOPT(so, level, option_name, optval,
	    optlenp, flags, cr));
}

/*
 * Set socket options.
 */
int
socket_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, cred_t *cr)
{
	int val = 1;

	/* Caller allocates aligned optval, or passes null */
	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
	/* If optval is null optlen is 0, and vice-versa */
	ASSERT(optval != NULL || optlen == 0);
	ASSERT(optlen != 0 || optval == NULL);

	if (optval == NULL && optlen == 0)
		optval = &val;
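	/*
	 * The substitution above means the protocol always sees a non-NULL
	 * optval, with val (1) presumably serving as the "enable" value
	 * for zero-length sets; optlen remains 0.
	 */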

	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
}

int
socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error = 0;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache if we are doing a local (AF_UNIX) write,
	 * since the looped-back data will be read again shortly.
	 */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg |= UIO_COPY_CACHED;
	else
		uiop->uio_extflg &= ~UIO_COPY_CACHED;

	error = SOP_SENDMSG(so, msg, uiop, cr);
	switch (error) {
	default:
		break;
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial send */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	case EPIPE:
		if ((so->so_mode & SM_KERNEL) == 0)
			tsignal(curthread, SIGPIPE);
		break;
	}
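
	/*
	 * For example, a nonblocking send of 8192 bytes that queues 4096
	 * before hitting flow control comes back here with EWOULDBLOCK and
	 * a reduced uio_resid; the error is cleared so the caller reports
	 * a partial-send byte count rather than an error.
	 */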

	return (error);
}

int
socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
	int error = 0;

	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
	if (error == EPIPE) {
		tsignal(curthread, SIGPIPE);
	}
	return (error);
}

int
socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache when reading data, as the application
	 * is likely to access the data shortly.
	 */
	uiop->uio_extflg |= UIO_COPY_CACHED;

	error = SOP_RECVMSG(so, msg, uiop, cr);

	switch (error) {
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial read */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	default:
		break;
	}
	return (error);
}

int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
}

int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	return (SOP_POLL(so, events, anyyet, reventsp, phpp));
}

int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
}

int
socket_close_internal(struct sonode *so, int flag, cred_t *cr)
{
	ASSERT(so->so_count == 0);

	return (SOP_CLOSE(so, flag, cr));
}

void
socket_destroy(struct sonode *so)
{
	vn_invalid(SOTOV(so));
	VN_RELE(SOTOV(so));
}

/* ARGSUSED */
void
socket_destroy_internal(struct sonode *so, cred_t *cr)
{
	struct sockparams *sp = so->so_sockparams;
	ASSERT(so->so_count == 0 && sp != NULL);

	sp->sp_smod_info->smod_sock_destroy_func(so);

	SOCKPARAMS_DEC_REF(sp);
}

/*
 * TODO Once the common vnode ops are available, the vnops argument
 * should be removed.
 */
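
/*
 * sonode_constructor()/sonode_destructor() are intended as kmem cache
 * callbacks; a cache using them would be created along these lines
 * (a sketch; the cache name is hypothetical):
 *
 *	cache = kmem_cache_create("sonode_cache", sizeof (struct sonode),
 *	    0, sonode_constructor, sonode_destructor, NULL, NULL, NULL, 0);
 */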
/*ARGSUSED*/
int
sonode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct sonode *so = buf;
	struct vnode *vp;

	vp = so->so_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = so;
	vn_setops(vp, socket_vnodeops);

	so->so_priv		= NULL;
	so->so_oobmsg		= NULL;

	so->so_proto_handle	= NULL;

	so->so_peercred		= NULL;

	so->so_rcv_queued	= 0;
	so->so_rcv_q_head	= NULL;
	so->so_rcv_q_last_head	= NULL;
	so->so_rcv_head		= NULL;
	so->so_rcv_last_head	= NULL;
	so->so_rcv_wanted	= 0;
	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
	so->so_rcv_timer_tid	= 0;
	so->so_rcv_thresh	= 0;

	so->so_acceptq_head	= NULL;
	so->so_acceptq_tail	= &so->so_acceptq_head;
	so->so_acceptq_next	= NULL;
	so->so_acceptq_len	= 0;
	so->so_backlog		= 0;

	so->so_snd_qfull	= B_FALSE;

	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);

	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);

	return (0);
}

/*ARGSUSED*/
void
sonode_destructor(void *buf, void *cdrarg)
{
	struct sonode *so = buf;
	struct vnode *vp = SOTOV(so);

	ASSERT(so->so_priv == NULL);
	ASSERT(so->so_peercred == NULL);

	ASSERT(so->so_oobmsg == NULL);

	ASSERT(so->so_rcv_q_head == NULL);

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

	ASSERT(vp->v_data == so);
	ASSERT(vn_matchops(vp, socket_vnodeops));

	vn_free(vp);

	mutex_destroy(&so->so_lock);
	mutex_destroy(&so->so_acceptq_lock);
	rw_destroy(&so->so_fallback_rwlock);

	cv_destroy(&so->so_state_cv);
	cv_destroy(&so->so_want_cv);
	cv_destroy(&so->so_acceptq_cv);
	cv_destroy(&so->so_snd_cv);
	cv_destroy(&so->so_rcv_cv);
	/* so_copy_cv is initialized in sonode_constructor() as well */
	cv_destroy(&so->so_copy_cv);
	cv_destroy(&so->so_closing_cv);
}

void
sonode_init(struct sonode *so, struct sockparams *sp, int family,
    int type, int protocol, sonodeops_t *sops)
{
	vnode_t *vp;

	vp = SOTOV(so);

	so->so_flag	= 0;

	so->so_state	= 0;
	so->so_mode	= 0;

	so->so_count	= 0;

	so->so_family	= family;
	so->so_type	= type;
	so->so_protocol	= protocol;

	SOCK_CONNID_INIT(so->so_proto_connid);

	so->so_options	= 0;
	so->so_linger.l_onoff	= 0;
	so->so_linger.l_linger	= 0;
	so->so_sndbuf	= 0;
	so->so_error	= 0;
	so->so_rcvtimeo	= 0;
	so->so_sndtimeo	= 0;
	so->so_xpg_rcvbuf = 0;

	ASSERT(so->so_oobmsg == NULL);
	so->so_oobmark	= 0;
	so->so_pgrp	= 0;

	ASSERT(so->so_peercred == NULL);

	so->so_zoneid = getzoneid();

	so->so_sockparams = sp;

	so->so_ops = sops;

	so->so_not_str = (sops != &sotpi_sonodeops);

	so->so_proto_handle = NULL;

	so->so_downcalls = NULL;

	so->so_copyflag = 0;

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

	vn_reinit(vp);
	vp->v_vfsp	= rootvfs;
	vp->v_type	= VSOCK;
	vp->v_rdev	= sockdev;

	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head	= NULL;
	so->so_rcv_last_head = NULL;

	so->so_snd_qfull = B_FALSE;
	so->so_minpsz = 0;

	so->so_rcv_wakeup = B_FALSE;
	so->so_snd_wakeup = B_FALSE;
	so->so_flowctrld = B_FALSE;

	so->so_pollev = 0;
	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));

	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
	so->so_ksock_cb_arg = NULL;

	so->so_max_addr_len = sizeof (struct sockaddr_storage);

	so->so_direct = NULL;

	vn_exists(vp);
}

void
sonode_fini(struct sonode *so)
{
	mblk_t *mp;
	vnode_t *vp;

	ASSERT(so->so_count == 0);

	if (so->so_rcv_timer_tid) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		(void) untimeout(so->so_rcv_timer_tid);
		so->so_rcv_timer_tid = 0;
	}

	so_acceptq_flush(so);

	if ((mp = so->so_oobmsg) != NULL) {
		freemsg(mp);
		so->so_oobmsg = NULL;
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
		    SS_RCVATMARK);
	}

	if (so->so_poll_list.ph_list != NULL) {
		pollwakeup(&so->so_poll_list, POLLERR);
		pollhead_clean(&so->so_poll_list);
	}

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;

		ASSERT(sodp->sod_uioafh == NULL);

		so->so_direct = NULL;
		kmem_cache_free(sock_sod_cache, sodp);
	}

	vp = SOTOV(so);
	vn_invalid(vp);

	if (so->so_peercred != NULL) {
		crfree(so->so_peercred);
		so->so_peercred = NULL;
	}
}
664 
665 /*
666  * This function is called at the beginning of recvmsg().
667  *
668  * If I/OAT is enabled on this sonode, initialize the uioa state machine
669  * with state UIOA_ALLOC.
670  */
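/*
 * For reference, the uioa states as driven from this file (a rough
 * sketch; uioainit()/uioafini() own the authoritative transitions):
 *
 *	UIOA_ALLOC	- idle; set in sod_sock_init() and on completion
 *	UIOA_INIT	- uioainit() succeeded in sod_rcv_init()
 *	UIOA_ENABLED	- asynchronous uioamove()s may be scheduled
 *	UIOA_FINI	- done or aborted; finalized via uioafini()
 */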
uio_t *
sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
{
	struct uio *suiop;
	struct uio *uiop;
	sodirect_t *sodp = so->so_direct;

	if (sodp == NULL)
		return (NULL);

	suiop = NULL;
	uiop = *uiopp;

	mutex_enter(sodp->sod_lockp);
	if (uiop->uio_resid >= uioasync.mincnt &&
	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
	    uioasync.enabled && !(flags & MSG_PEEK) &&
	    !(so->so_state & SS_CANTRCVMORE)) {
		/*
		 * The I/O is big enough for the uioa minimum, this is an
		 * sodirect socket with sodirect and uioa both enabled, I/O
		 * will be done, and we aren't at EOF, so initialize the
		 * sodirect_t uioa_t with "uiop".
		 */
		if (!uioainit(uiop, &sodp->sod_uioa)) {
			/*
			 * Successful uioainit() so the uio_t part of the
			 * uioa_t will be used for all uio_t work to follow;
			 * we return the original "uiop" in "suiop".
			 */
			suiop = uiop;
			*uiopp = (uio_t *)&sodp->sod_uioa;
			/*
			 * Before returning to the caller the passed in uio_t
			 * "uiop" will be updated via a call to uioafini()
			 * below.
			 *
			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
			 * here as first we have to uioamove() any currently
			 * queued M_DATA mblk_t(s), so it will be done later.
			 */
		}
		/*
		 * Whether or not uioainit() succeeded, note the number of
		 * uio bytes the caller wants for the sod framework and/or
		 * transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
		/*
		 * No uioa, but still using sodirect, so note the number of
		 * uio bytes the caller wants for the sodirect framework
		 * and/or transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	}
	mutex_exit(sodp->sod_lockp);

	return (suiop);
}
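
/*
 * A recvmsg() implementation would pair the init/done calls roughly as
 * follows (a sketch; per the asserts, sod_lockp must be held around
 * sod_rcv_done()):
 *
 *	suiop = sod_rcv_init(so, flags, &uiop);
 *	... receive into uiop ...
 *	mutex_enter(so->so_direct->sod_lockp);
 *	error = sod_rcv_done(so, suiop, uiop);
 *	mutex_exit(so->so_direct->sod_lockp);
 */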

/*
 * This function is called at the end of recvmsg(); it finalizes all the
 * I/OAT operations and resets the uioa state to UIOA_ALLOC.
 */
int
sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
{
	int error = 0;
	sodirect_t *sodp = so->so_direct;
	mblk_t *mp;

	if (sodp == NULL) {
		return (0);
	}

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	/* Finish any sodirect and uioa processing */
	if (suiop != NULL) {
		/* Finish any uioa_t processing */

		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
		error = uioafini(suiop, (uioa_t *)uiop);
		if ((mp = sodp->sod_uioafh) != NULL) {
			sodp->sod_uioafh = NULL;
			sodp->sod_uioaft = NULL;
			freemsg(mp);
		}
	}
	ASSERT(sodp->sod_uioafh == NULL);
	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
		/* Awoke */
		sodp->sod_state &= SOD_WAKE_CLR;
		sodp->sod_state |= SOD_WAKE_NOT;
	}
	/* Last, clear sod_want value */
	sodp->sod_want = 0;

	return (error);
}

/*
 * Schedule a uioamove() on an mblk. This is usually called from
 * protocols (e.g. TCP) on an I/OAT enabled sonode.
 */
mblk_t *
sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
{
	uioa_t *uioap = &sodp->sod_uioa;
	mblk_t *mp1 = mp;
	mblk_t *lmp = NULL;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(msg_size == msgdsize(mp));

	/* Caller must have lock held */
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	if (uioap->uioa_state & UIOA_ENABLED) {
		/* Uioa is enabled */

		if (msg_size > uioap->uio_resid) {
			/*
			 * There isn't enough uio space for the mblk_t chain
			 * so disable uioa such that this and any additional
			 * mblk_t data is handled by the socket and schedule
			 * the socket for wakeup to finish this uioa.
			 */
			uioap->uioa_state &= UIOA_CLR;
			uioap->uioa_state |= UIOA_FINI;
			if (sodp->sod_state & SOD_WAKE_NOT) {
				sodp->sod_state &= SOD_WAKE_CLR;
				sodp->sod_state |= SOD_WAKE_NEED;
			}
			return (mp);
		}
		do {
			uint32_t	len = MBLKL(mp1);

			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
				/* Scheduled, mark dblk_t as such */
				DB_FLAGS(mp1) |= DBLK_UIOA;
			} else {
				/* Error, turn off async processing */
				uioap->uioa_state &= UIOA_CLR;
				uioap->uioa_state |= UIOA_FINI;
				break;
			}
			lmp = mp1;
		} while ((mp1 = mp1->b_cont) != NULL);

		if (mp1 != NULL || uioap->uio_resid == 0) {
			/*
			 * Either not all mblk_t(s) were uioamoved (error) or
			 * all uio space has been consumed, so schedule the
			 * socket for wakeup to finish this uio.
			 */
			sodp->sod_state &= SOD_WAKE_CLR;
			sodp->sod_state |= SOD_WAKE_NEED;

			/* Break the mblk chain if necessary. */
			if (mp1 != NULL && lmp != NULL) {
				mp->b_next = mp1;
				lmp->b_cont = NULL;
			}
		}
	}
	return (mp1);
}

/*
 * This function is called on an mblk that has been successfully uioamoved().
 */
void
sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
{
	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
		/*
		 * A uioa flagged mblk_t chain, already uio processed;
		 * add it to the sodirect uioa pending free list.
		 *
		 * Note, a b_cont chain headed by a DBLK_UIOA enabled
		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
		 */
		mblk_t	*bpt = sodp->sod_uioaft;

		ASSERT(sodp != NULL);

		/*
		 * Append the first mblk_t of the "bp" chain to the current
		 * tail of the sodirect uioa free list, or make it the new
		 * head if the list is empty.
		 */
		if (bpt == NULL)
			sodp->sod_uioafh = bp;
		else
			bpt->b_cont = bp;

		/*
		 * Walk the mblk_t "bp" chain to find the tail, adjusting the
		 * rptr of each to reflect that uioamove() has consumed all
		 * data.
		 */
		bpt = bp;
		for (;;) {
			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);

			bpt->b_rptr = bpt->b_wptr;
			if (bpt->b_cont == NULL)
				break;
			bpt = bpt->b_cont;
		}
		/* New sodirect uioa free list tail */
		sodp->sod_uioaft = bpt;

		/* Only dequeue once with data returned per uioa_t */
		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
			sodp->sod_uioa.uioa_state &= UIOA_CLR;
			sodp->sod_uioa.uioa_state |= UIOA_FINI;
		}
	}
}

/*
 * When transitioning from the UIOA_INIT state to the UIOA_ENABLED state
 * in recvmsg(), call this function on a non-STREAMS socket to schedule
 * uioamove() on the data that has already been queued in this socket.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t	*uioap = (uioa_t *)uiop;
	mblk_t	*lbp;
	mblk_t	*wbp;
	mblk_t	*bp;
	int	len;
	int	error;
	boolean_t in_rcv_q = B_TRUE;

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk the first b_cont chain in sod_q
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have a M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once to process the sonode dump area
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamove()d.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()d mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			/*
			 * No need to grab so_lock, since sod_lockp
			 * points to so_lock.
			 */
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}

/*
 * Initialize sodirect data structures on a socket.
 */
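/*
 * A transport would typically wire itself up during socket/stream setup
 * along these lines (a sketch; the enqueue and wakeup callbacks are
 * hypothetical):
 *
 *	sod_sock_init(so, stp, my_sod_enqueue, my_sod_wakeup,
 *	    &so->so_lock);
 */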
void
sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
    sod_wakeup_func wake_func, kmutex_t *lockp)
{
	sodirect_t	*sodp;

	ASSERT(so->so_direct == NULL);

	so->so_state |= SS_SODIRECT;

	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
	sodp->sod_want = 0;
	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
	sodp->sod_enqueue = enq_func;
	sodp->sod_wakeup = wake_func;
	sodp->sod_uioafh = NULL;
	sodp->sod_uioaft = NULL;
	sodp->sod_lockp = lockp;
	/*
	 * The remainder of the sod_uioa members are left uninitialized;
	 * they will be initialized later by uioainit(), before uioa is
	 * enabled.
	 */
	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
	so->so_direct = sodp;
	if (stp != NULL)
		stp->sd_sodirect = sodp;
}

/*
 * Init the sodirect kmem cache while sockfs is loading.
 */
void
sod_init()
{
	/* Allocate sodirect_t kmem_cache */
	sock_sod_cache = kmem_cache_create("sock_sod_cache",
	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

ssize_t
sod_uioa_mblk(struct sonode *so, mblk_t *mp)
{
	sodirect_t *sodp = so->so_direct;

	ASSERT(sodp != NULL);
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	ASSERT(sodp->sod_state & SOD_ENABLED);
	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));

	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));

	if (mp == NULL && so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		ASSERT(mp->b_prev != NULL);
		mp->b_prev = NULL;
		so->so_rcv_q_head = mp->b_next;
		if (so->so_rcv_q_head == NULL) {
			so->so_rcv_q_last_head = NULL;
		}
		mp->b_next = NULL;
	}

	sod_uioa_mblk_done(sodp, mp);

	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
	    DB_TYPE(so->so_rcv_head) == M_DATA &&
	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
		/* more arrived */
		ASSERT(so->so_rcv_q_head == NULL);
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		if (so->so_rcv_head == NULL)
			so->so_rcv_last_head = NULL;
		mp->b_prev = mp->b_next = NULL;
		sod_uioa_mblk_done(sodp, mp);
	}

#ifdef DEBUG
	if (so->so_rcv_q_head != NULL) {
		mblk_t *m = so->so_rcv_q_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_q_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
	if (so->so_rcv_head != NULL) {
		mblk_t *m = so->so_rcv_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
#endif
	return (sodp->sod_uioa.uioa_mbytes);
}
1105