xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 0f1702c5201310f0529cd5abb77652e5e9b241b6)
1*0f1702c5SYu Xiangning /*
2*0f1702c5SYu Xiangning  * CDDL HEADER START
3*0f1702c5SYu Xiangning  *
4*0f1702c5SYu Xiangning  * The contents of this file are subject to the terms of the
5*0f1702c5SYu Xiangning  * Common Development and Distribution License (the "License").
6*0f1702c5SYu Xiangning  * You may not use this file except in compliance with the License.
7*0f1702c5SYu Xiangning  *
8*0f1702c5SYu Xiangning  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*0f1702c5SYu Xiangning  * or http://www.opensolaris.org/os/licensing.
10*0f1702c5SYu Xiangning  * See the License for the specific language governing permissions
11*0f1702c5SYu Xiangning  * and limitations under the License.
12*0f1702c5SYu Xiangning  *
13*0f1702c5SYu Xiangning  * When distributing Covered Code, include this CDDL HEADER in each
14*0f1702c5SYu Xiangning  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*0f1702c5SYu Xiangning  * If applicable, add the following below this CDDL HEADER, with the
16*0f1702c5SYu Xiangning  * fields enclosed by brackets "[]" replaced with your own identifying
17*0f1702c5SYu Xiangning  * information: Portions Copyright [yyyy] [name of copyright owner]
18*0f1702c5SYu Xiangning  *
19*0f1702c5SYu Xiangning  * CDDL HEADER END
20*0f1702c5SYu Xiangning  */
21*0f1702c5SYu Xiangning 
22*0f1702c5SYu Xiangning /*
23*0f1702c5SYu Xiangning  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24*0f1702c5SYu Xiangning  * Use is subject to license terms.
25*0f1702c5SYu Xiangning  */
26*0f1702c5SYu Xiangning 
27*0f1702c5SYu Xiangning #include <sys/types.h>
28*0f1702c5SYu Xiangning #include <sys/param.h>
29*0f1702c5SYu Xiangning #include <sys/systm.h>
30*0f1702c5SYu Xiangning #include <sys/sysmacros.h>
31*0f1702c5SYu Xiangning #include <sys/debug.h>
32*0f1702c5SYu Xiangning #include <sys/cmn_err.h>
33*0f1702c5SYu Xiangning #include <sys/vfs.h>
34*0f1702c5SYu Xiangning #include <sys/policy.h>
35*0f1702c5SYu Xiangning #include <sys/modctl.h>
36*0f1702c5SYu Xiangning 
37*0f1702c5SYu Xiangning #include <sys/sunddi.h>
38*0f1702c5SYu Xiangning 
39*0f1702c5SYu Xiangning #include <sys/strsun.h>
40*0f1702c5SYu Xiangning #include <sys/stropts.h>
41*0f1702c5SYu Xiangning #include <sys/strsubr.h>
42*0f1702c5SYu Xiangning #include <sys/socket.h>
43*0f1702c5SYu Xiangning #include <sys/socketvar.h>
44*0f1702c5SYu Xiangning #include <sys/sodirect.h>
45*0f1702c5SYu Xiangning #include <sys/uio.h>
46*0f1702c5SYu Xiangning 
47*0f1702c5SYu Xiangning #include <inet/ipclassifier.h>
48*0f1702c5SYu Xiangning #include <fs/sockfs/sockcommon.h>
49*0f1702c5SYu Xiangning #include <fs/sockfs/nl7c.h>
50*0f1702c5SYu Xiangning #include <inet/ip.h>
51*0f1702c5SYu Xiangning 
/* XNET checking/printing tunables; defined outside this file. */
extern int xnet_skip_checks;
extern int xnet_check_print;
extern int xnet_truncate_print;

/* kmem cache that sodirect_t structures are returned to (see sonode_fini()). */
static struct kmem_cache *sock_sod_cache;
55*0f1702c5SYu Xiangning 
56*0f1702c5SYu Xiangning /*
57*0f1702c5SYu Xiangning  * Common socket access functions.
58*0f1702c5SYu Xiangning  *
59*0f1702c5SYu Xiangning  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
60*0f1702c5SYu Xiangning  * the socket_xxx() function should be used.
61*0f1702c5SYu Xiangning  */
62*0f1702c5SYu Xiangning 
63*0f1702c5SYu Xiangning /*
64*0f1702c5SYu Xiangning  * Try to create a new sonode of the requested <family, type, protocol>.
65*0f1702c5SYu Xiangning  */
66*0f1702c5SYu Xiangning /* ARGSUSED */
67*0f1702c5SYu Xiangning struct sonode *
68*0f1702c5SYu Xiangning socket_create(int family, int type, int protocol, char *devpath, char *mod,
69*0f1702c5SYu Xiangning     int flags, int version, struct cred *cr, int *errorp)
70*0f1702c5SYu Xiangning {
71*0f1702c5SYu Xiangning 	struct sonode *so;
72*0f1702c5SYu Xiangning 	struct sockparams *sp = NULL;
73*0f1702c5SYu Xiangning 
74*0f1702c5SYu Xiangning 	/*
75*0f1702c5SYu Xiangning 	 * Look for a sockparams entry that match the given criteria.
76*0f1702c5SYu Xiangning 	 * solookup() returns with the entry held.
77*0f1702c5SYu Xiangning 	 */
78*0f1702c5SYu Xiangning 	*errorp = solookup(family, type, protocol, &sp);
79*0f1702c5SYu Xiangning 	if (sp == NULL) {
80*0f1702c5SYu Xiangning 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
81*0f1702c5SYu Xiangning 		/*
82*0f1702c5SYu Xiangning 		 * There is no matching sockparams entry. An ephemeral entry is
83*0f1702c5SYu Xiangning 		 * created if the caller specifies a device or a socket module.
84*0f1702c5SYu Xiangning 		 */
85*0f1702c5SYu Xiangning 		if (devpath != NULL) {
86*0f1702c5SYu Xiangning 			sp = sockparams_hold_ephemeral_bydev(family, type,
87*0f1702c5SYu Xiangning 			    protocol, devpath, kmflags, errorp);
88*0f1702c5SYu Xiangning 		} else if (mod != NULL) {
89*0f1702c5SYu Xiangning 			sp = sockparams_hold_ephemeral_bymod(family, type,
90*0f1702c5SYu Xiangning 			    protocol, mod, kmflags, errorp);
91*0f1702c5SYu Xiangning 		} else {
92*0f1702c5SYu Xiangning 			return (NULL);
93*0f1702c5SYu Xiangning 		}
94*0f1702c5SYu Xiangning 
95*0f1702c5SYu Xiangning 		if (sp == NULL)
96*0f1702c5SYu Xiangning 			return (NULL);
97*0f1702c5SYu Xiangning 	}
98*0f1702c5SYu Xiangning 
99*0f1702c5SYu Xiangning 	ASSERT(sp->sp_smod_info != NULL);
100*0f1702c5SYu Xiangning 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
101*0f1702c5SYu Xiangning 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
102*0f1702c5SYu Xiangning 	    protocol, version, flags, errorp, cr);
103*0f1702c5SYu Xiangning 	if (so == NULL) {
104*0f1702c5SYu Xiangning 		SOCKPARAMS_DEC_REF(sp);
105*0f1702c5SYu Xiangning 	} else {
106*0f1702c5SYu Xiangning 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
107*0f1702c5SYu Xiangning 			/* Cannot fail, only bumps so_count */
108*0f1702c5SYu Xiangning 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
109*0f1702c5SYu Xiangning 		} else {
110*0f1702c5SYu Xiangning 			socket_destroy(so);
111*0f1702c5SYu Xiangning 			so = NULL;
112*0f1702c5SYu Xiangning 		}
113*0f1702c5SYu Xiangning 	}
114*0f1702c5SYu Xiangning 	return (so);
115*0f1702c5SYu Xiangning }
116*0f1702c5SYu Xiangning 
117*0f1702c5SYu Xiangning struct sonode *
118*0f1702c5SYu Xiangning socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
119*0f1702c5SYu Xiangning     sock_downcalls_t *dc, int flags, int *errorp)
120*0f1702c5SYu Xiangning {
121*0f1702c5SYu Xiangning 	struct sonode *so;
122*0f1702c5SYu Xiangning 	struct sockparams *sp;
123*0f1702c5SYu Xiangning 	struct cred *cr;
124*0f1702c5SYu Xiangning 
125*0f1702c5SYu Xiangning 	if ((cr = CRED()) == NULL)
126*0f1702c5SYu Xiangning 		cr = kcred;
127*0f1702c5SYu Xiangning 
128*0f1702c5SYu Xiangning 	sp = parent->so_sockparams;
129*0f1702c5SYu Xiangning 	ASSERT(sp != NULL);
130*0f1702c5SYu Xiangning 
131*0f1702c5SYu Xiangning 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
132*0f1702c5SYu Xiangning 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
133*0f1702c5SYu Xiangning 	    errorp, cr);
134*0f1702c5SYu Xiangning 	if (so != NULL) {
135*0f1702c5SYu Xiangning 		SOCKPARAMS_INC_REF(sp);
136*0f1702c5SYu Xiangning 
137*0f1702c5SYu Xiangning 		so->so_proto_handle = lh;
138*0f1702c5SYu Xiangning 		so->so_downcalls = dc;
139*0f1702c5SYu Xiangning 		/*
140*0f1702c5SYu Xiangning 		 * This function may be called in interrupt context, and CRED()
141*0f1702c5SYu Xiangning 		 * will be NULL. In this case, pass in kcred.
142*0f1702c5SYu Xiangning 		 */
143*0f1702c5SYu Xiangning 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
144*0f1702c5SYu Xiangning 			/* Cannot fail, only bumps so_count */
145*0f1702c5SYu Xiangning 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
146*0f1702c5SYu Xiangning 		} else  {
147*0f1702c5SYu Xiangning 			socket_destroy(so);
148*0f1702c5SYu Xiangning 			so = NULL;
149*0f1702c5SYu Xiangning 		}
150*0f1702c5SYu Xiangning 	}
151*0f1702c5SYu Xiangning 
152*0f1702c5SYu Xiangning 	return (so);
153*0f1702c5SYu Xiangning }
154*0f1702c5SYu Xiangning 
155*0f1702c5SYu Xiangning /*
156*0f1702c5SYu Xiangning  * Bind local endpoint.
157*0f1702c5SYu Xiangning  */
158*0f1702c5SYu Xiangning int
159*0f1702c5SYu Xiangning socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
160*0f1702c5SYu Xiangning     int flags, cred_t *cr)
161*0f1702c5SYu Xiangning {
162*0f1702c5SYu Xiangning 	return (SOP_BIND(so, name, namelen, flags, cr));
163*0f1702c5SYu Xiangning }
164*0f1702c5SYu Xiangning 
165*0f1702c5SYu Xiangning /*
166*0f1702c5SYu Xiangning  * Turn socket into a listen socket.
167*0f1702c5SYu Xiangning  */
168*0f1702c5SYu Xiangning int
169*0f1702c5SYu Xiangning socket_listen(struct sonode *so, int backlog, cred_t *cr)
170*0f1702c5SYu Xiangning {
171*0f1702c5SYu Xiangning 	if (backlog < 0) {
172*0f1702c5SYu Xiangning 		backlog = 0;
173*0f1702c5SYu Xiangning 	}
174*0f1702c5SYu Xiangning 
175*0f1702c5SYu Xiangning 	/*
176*0f1702c5SYu Xiangning 	 * Use the same qlimit as in BSD. BSD checks the qlimit
177*0f1702c5SYu Xiangning 	 * before queuing the next connection implying that a
178*0f1702c5SYu Xiangning 	 * listen(sock, 0) allows one connection to be queued.
179*0f1702c5SYu Xiangning 	 * BSD also uses 1.5 times the requested backlog.
180*0f1702c5SYu Xiangning 	 *
181*0f1702c5SYu Xiangning 	 * XNS Issue 4 required a strict interpretation of the backlog.
182*0f1702c5SYu Xiangning 	 * This has been waived subsequently for Issue 4 and the change
183*0f1702c5SYu Xiangning 	 * incorporated in XNS Issue 5. So we aren't required to do
184*0f1702c5SYu Xiangning 	 * anything special for XPG apps.
185*0f1702c5SYu Xiangning 	 */
186*0f1702c5SYu Xiangning 	if (backlog >= (INT_MAX - 1) / 3)
187*0f1702c5SYu Xiangning 		backlog = INT_MAX;
188*0f1702c5SYu Xiangning 	else
189*0f1702c5SYu Xiangning 		backlog = backlog * 3 / 2 + 1;
190*0f1702c5SYu Xiangning 
191*0f1702c5SYu Xiangning 	return (SOP_LISTEN(so, backlog, cr));
192*0f1702c5SYu Xiangning }
193*0f1702c5SYu Xiangning 
194*0f1702c5SYu Xiangning /*
195*0f1702c5SYu Xiangning  * Accept incoming connection.
196*0f1702c5SYu Xiangning  */
197*0f1702c5SYu Xiangning int
198*0f1702c5SYu Xiangning socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
199*0f1702c5SYu Xiangning {
200*0f1702c5SYu Xiangning 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
201*0f1702c5SYu Xiangning }
202*0f1702c5SYu Xiangning 
203*0f1702c5SYu Xiangning /*
204*0f1702c5SYu Xiangning  * Active open.
205*0f1702c5SYu Xiangning  */
206*0f1702c5SYu Xiangning int
207*0f1702c5SYu Xiangning socket_connect(struct sonode *so, const struct sockaddr *name,
208*0f1702c5SYu Xiangning     socklen_t namelen, int fflag, int flags, cred_t *cr)
209*0f1702c5SYu Xiangning {
210*0f1702c5SYu Xiangning 	int error;
211*0f1702c5SYu Xiangning 
212*0f1702c5SYu Xiangning 	/*
213*0f1702c5SYu Xiangning 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
214*0f1702c5SYu Xiangning 	 * connect to a null address. This is the portable method to
215*0f1702c5SYu Xiangning 	 * unconnect a socket.
216*0f1702c5SYu Xiangning 	 */
217*0f1702c5SYu Xiangning 	if ((namelen >= sizeof (sa_family_t)) &&
218*0f1702c5SYu Xiangning 	    (name->sa_family == AF_UNSPEC)) {
219*0f1702c5SYu Xiangning 		name = NULL;
220*0f1702c5SYu Xiangning 		namelen = 0;
221*0f1702c5SYu Xiangning 	}
222*0f1702c5SYu Xiangning 
223*0f1702c5SYu Xiangning 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
224*0f1702c5SYu Xiangning 
225*0f1702c5SYu Xiangning 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
226*0f1702c5SYu Xiangning 		/*
227*0f1702c5SYu Xiangning 		 * X/Open specification contains a requirement that
228*0f1702c5SYu Xiangning 		 * ENETUNREACH be returned but does not require
229*0f1702c5SYu Xiangning 		 * EHOSTUNREACH. In order to keep the test suite
230*0f1702c5SYu Xiangning 		 * happy we mess with the errno here.
231*0f1702c5SYu Xiangning 		 */
232*0f1702c5SYu Xiangning 		error = ENETUNREACH;
233*0f1702c5SYu Xiangning 	}
234*0f1702c5SYu Xiangning 
235*0f1702c5SYu Xiangning 	return (error);
236*0f1702c5SYu Xiangning }
237*0f1702c5SYu Xiangning 
238*0f1702c5SYu Xiangning /*
239*0f1702c5SYu Xiangning  * Get address of remote node.
240*0f1702c5SYu Xiangning  */
241*0f1702c5SYu Xiangning int
242*0f1702c5SYu Xiangning socket_getpeername(struct sonode *so, struct sockaddr *addr,
243*0f1702c5SYu Xiangning     socklen_t *addrlen, boolean_t accept, cred_t *cr)
244*0f1702c5SYu Xiangning {
245*0f1702c5SYu Xiangning 	ASSERT(*addrlen > 0);
246*0f1702c5SYu Xiangning 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
247*0f1702c5SYu Xiangning 
248*0f1702c5SYu Xiangning }
249*0f1702c5SYu Xiangning 
250*0f1702c5SYu Xiangning /*
251*0f1702c5SYu Xiangning  * Get local address.
252*0f1702c5SYu Xiangning  */
253*0f1702c5SYu Xiangning int
254*0f1702c5SYu Xiangning socket_getsockname(struct sonode *so, struct sockaddr *addr,
255*0f1702c5SYu Xiangning     socklen_t *addrlen, cred_t *cr)
256*0f1702c5SYu Xiangning {
257*0f1702c5SYu Xiangning 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
258*0f1702c5SYu Xiangning 
259*0f1702c5SYu Xiangning }
260*0f1702c5SYu Xiangning 
261*0f1702c5SYu Xiangning /*
262*0f1702c5SYu Xiangning  * Called from shutdown().
263*0f1702c5SYu Xiangning  */
264*0f1702c5SYu Xiangning int
265*0f1702c5SYu Xiangning socket_shutdown(struct sonode *so, int how, cred_t *cr)
266*0f1702c5SYu Xiangning {
267*0f1702c5SYu Xiangning 	return (SOP_SHUTDOWN(so, how, cr));
268*0f1702c5SYu Xiangning }
269*0f1702c5SYu Xiangning 
270*0f1702c5SYu Xiangning /*
271*0f1702c5SYu Xiangning  * Get socket options.
272*0f1702c5SYu Xiangning  */
273*0f1702c5SYu Xiangning /*ARGSUSED*/
274*0f1702c5SYu Xiangning int
275*0f1702c5SYu Xiangning socket_getsockopt(struct sonode *so, int level, int option_name,
276*0f1702c5SYu Xiangning     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
277*0f1702c5SYu Xiangning {
278*0f1702c5SYu Xiangning 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
279*0f1702c5SYu Xiangning 	    optlenp, flags, cr));
280*0f1702c5SYu Xiangning }
281*0f1702c5SYu Xiangning 
282*0f1702c5SYu Xiangning /*
283*0f1702c5SYu Xiangning  * Set socket options
284*0f1702c5SYu Xiangning  */
285*0f1702c5SYu Xiangning int
286*0f1702c5SYu Xiangning socket_setsockopt(struct sonode *so, int level, int option_name,
287*0f1702c5SYu Xiangning     const void *optval, t_uscalar_t optlen, cred_t *cr)
288*0f1702c5SYu Xiangning {
289*0f1702c5SYu Xiangning 	/* Caller allocates aligned optval, or passes null */
290*0f1702c5SYu Xiangning 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
291*0f1702c5SYu Xiangning 	/* If optval is null optlen is 0, and vice-versa */
292*0f1702c5SYu Xiangning 	ASSERT(optval != NULL || optlen == 0);
293*0f1702c5SYu Xiangning 	ASSERT(optlen != 0 || optval == NULL);
294*0f1702c5SYu Xiangning 
295*0f1702c5SYu Xiangning 	/* No options should be zero-length */
296*0f1702c5SYu Xiangning 	if (optlen == 0)
297*0f1702c5SYu Xiangning 		return (EINVAL);
298*0f1702c5SYu Xiangning 
299*0f1702c5SYu Xiangning 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
300*0f1702c5SYu Xiangning }
301*0f1702c5SYu Xiangning 
302*0f1702c5SYu Xiangning int
303*0f1702c5SYu Xiangning socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
304*0f1702c5SYu Xiangning     cred_t *cr)
305*0f1702c5SYu Xiangning {
306*0f1702c5SYu Xiangning 	int error = 0;
307*0f1702c5SYu Xiangning 	ssize_t orig_resid = uiop->uio_resid;
308*0f1702c5SYu Xiangning 
309*0f1702c5SYu Xiangning 	/*
310*0f1702c5SYu Xiangning 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
311*0f1702c5SYu Xiangning 	 */
312*0f1702c5SYu Xiangning 	if (so->so_family == AF_UNIX)
313*0f1702c5SYu Xiangning 		uiop->uio_extflg |= UIO_COPY_CACHED;
314*0f1702c5SYu Xiangning 	else
315*0f1702c5SYu Xiangning 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
316*0f1702c5SYu Xiangning 
317*0f1702c5SYu Xiangning 	error = SOP_SENDMSG(so, msg, uiop, cr);
318*0f1702c5SYu Xiangning 	switch (error) {
319*0f1702c5SYu Xiangning 	default:
320*0f1702c5SYu Xiangning 		break;
321*0f1702c5SYu Xiangning 	case EINTR:
322*0f1702c5SYu Xiangning 	case ETIME:
323*0f1702c5SYu Xiangning 	case EWOULDBLOCK:
324*0f1702c5SYu Xiangning 		/* We did a partial send */
325*0f1702c5SYu Xiangning 		if (uiop->uio_resid != orig_resid)
326*0f1702c5SYu Xiangning 			error = 0;
327*0f1702c5SYu Xiangning 		break;
328*0f1702c5SYu Xiangning 	case EPIPE:
329*0f1702c5SYu Xiangning 		if ((so->so_mode & SM_KERNEL) == 0)
330*0f1702c5SYu Xiangning 			tsignal(curthread, SIGPIPE);
331*0f1702c5SYu Xiangning 		break;
332*0f1702c5SYu Xiangning 	}
333*0f1702c5SYu Xiangning 
334*0f1702c5SYu Xiangning 	return (error);
335*0f1702c5SYu Xiangning }
336*0f1702c5SYu Xiangning 
337*0f1702c5SYu Xiangning int
338*0f1702c5SYu Xiangning socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
339*0f1702c5SYu Xiangning     struct cred *cr, mblk_t **mpp)
340*0f1702c5SYu Xiangning {
341*0f1702c5SYu Xiangning 	int error = 0;
342*0f1702c5SYu Xiangning 
343*0f1702c5SYu Xiangning 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
344*0f1702c5SYu Xiangning 	if (error == EPIPE) {
345*0f1702c5SYu Xiangning 		tsignal(curthread, SIGPIPE);
346*0f1702c5SYu Xiangning 	}
347*0f1702c5SYu Xiangning 	return (error);
348*0f1702c5SYu Xiangning }
349*0f1702c5SYu Xiangning 
350*0f1702c5SYu Xiangning int
351*0f1702c5SYu Xiangning socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
352*0f1702c5SYu Xiangning     cred_t *cr)
353*0f1702c5SYu Xiangning {
354*0f1702c5SYu Xiangning 	int error;
355*0f1702c5SYu Xiangning 	ssize_t orig_resid = uiop->uio_resid;
356*0f1702c5SYu Xiangning 
357*0f1702c5SYu Xiangning 	/*
358*0f1702c5SYu Xiangning 	 * Do not bypass the cache when reading data, as the application
359*0f1702c5SYu Xiangning 	 * is likely to access the data shortly.
360*0f1702c5SYu Xiangning 	 */
361*0f1702c5SYu Xiangning 	uiop->uio_extflg |= UIO_COPY_CACHED;
362*0f1702c5SYu Xiangning 
363*0f1702c5SYu Xiangning 	error = SOP_RECVMSG(so, msg, uiop, cr);
364*0f1702c5SYu Xiangning 
365*0f1702c5SYu Xiangning 	switch (error) {
366*0f1702c5SYu Xiangning 	case EINTR:
367*0f1702c5SYu Xiangning 	case ETIME:
368*0f1702c5SYu Xiangning 	case EWOULDBLOCK:
369*0f1702c5SYu Xiangning 		/* We did a partial read */
370*0f1702c5SYu Xiangning 		if (uiop->uio_resid != orig_resid)
371*0f1702c5SYu Xiangning 			error = 0;
372*0f1702c5SYu Xiangning 		break;
373*0f1702c5SYu Xiangning 	default:
374*0f1702c5SYu Xiangning 		break;
375*0f1702c5SYu Xiangning 	}
376*0f1702c5SYu Xiangning 	return (error);
377*0f1702c5SYu Xiangning }
378*0f1702c5SYu Xiangning 
379*0f1702c5SYu Xiangning int
380*0f1702c5SYu Xiangning socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
381*0f1702c5SYu Xiangning     struct cred *cr, int32_t *rvalp)
382*0f1702c5SYu Xiangning {
383*0f1702c5SYu Xiangning 	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
384*0f1702c5SYu Xiangning }
385*0f1702c5SYu Xiangning 
/*
 * Poll 'so' by dispatching to the sonode's poll operation; its result is
 * returned unchanged.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
392*0f1702c5SYu Xiangning 
393*0f1702c5SYu Xiangning int
394*0f1702c5SYu Xiangning socket_close(struct sonode *so, int flag, struct cred *cr)
395*0f1702c5SYu Xiangning {
396*0f1702c5SYu Xiangning 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
397*0f1702c5SYu Xiangning }
398*0f1702c5SYu Xiangning 
399*0f1702c5SYu Xiangning int
400*0f1702c5SYu Xiangning socket_close_internal(struct sonode *so, int flag, cred_t *cr)
401*0f1702c5SYu Xiangning {
402*0f1702c5SYu Xiangning 	ASSERT(so->so_count == 0);
403*0f1702c5SYu Xiangning 
404*0f1702c5SYu Xiangning 	return (SOP_CLOSE(so, flag, cr));
405*0f1702c5SYu Xiangning }
406*0f1702c5SYu Xiangning 
/*
 * Destroy 'so' via its vnode: the vnode is marked invalid and the hold on
 * it is released.
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
413*0f1702c5SYu Xiangning 
414*0f1702c5SYu Xiangning /* ARGSUSED */
415*0f1702c5SYu Xiangning void
416*0f1702c5SYu Xiangning socket_destroy_internal(struct sonode *so, cred_t *cr)
417*0f1702c5SYu Xiangning {
418*0f1702c5SYu Xiangning 	struct sockparams *sp = so->so_sockparams;
419*0f1702c5SYu Xiangning 	ASSERT(so->so_count == 0 && sp != NULL);
420*0f1702c5SYu Xiangning 
421*0f1702c5SYu Xiangning 	sp->sp_smod_info->smod_sock_destroy_func(so);
422*0f1702c5SYu Xiangning 
423*0f1702c5SYu Xiangning 	SOCKPARAMS_DEC_REF(sp);
424*0f1702c5SYu Xiangning }
425*0f1702c5SYu Xiangning 
426*0f1702c5SYu Xiangning /*
427*0f1702c5SYu Xiangning  * TODO Once the common vnode ops is available, then the vnops argument
428*0f1702c5SYu Xiangning  * should be removed.
429*0f1702c5SYu Xiangning  */
430*0f1702c5SYu Xiangning /*ARGSUSED*/
431*0f1702c5SYu Xiangning int
432*0f1702c5SYu Xiangning sonode_constructor(void *buf, void *cdrarg, int kmflags)
433*0f1702c5SYu Xiangning {
434*0f1702c5SYu Xiangning 	struct sonode *so = buf;
435*0f1702c5SYu Xiangning 	struct vnode *vp;
436*0f1702c5SYu Xiangning 
437*0f1702c5SYu Xiangning 	vp = so->so_vnode = vn_alloc(kmflags);
438*0f1702c5SYu Xiangning 	if (vp == NULL) {
439*0f1702c5SYu Xiangning 		return (-1);
440*0f1702c5SYu Xiangning 	}
441*0f1702c5SYu Xiangning 	vp->v_data = so;
442*0f1702c5SYu Xiangning 	vn_setops(vp, socket_vnodeops);
443*0f1702c5SYu Xiangning 
444*0f1702c5SYu Xiangning 	so->so_priv 		= NULL;
445*0f1702c5SYu Xiangning 	so->so_oobmsg		= NULL;
446*0f1702c5SYu Xiangning 
447*0f1702c5SYu Xiangning 	so->so_proto_handle	= NULL;
448*0f1702c5SYu Xiangning 
449*0f1702c5SYu Xiangning 	so->so_peercred 	= NULL;
450*0f1702c5SYu Xiangning 
451*0f1702c5SYu Xiangning 	so->so_rcv_queued	= 0;
452*0f1702c5SYu Xiangning 	so->so_rcv_q_head 	= NULL;
453*0f1702c5SYu Xiangning 	so->so_rcv_q_last_head 	= NULL;
454*0f1702c5SYu Xiangning 	so->so_rcv_head		= NULL;
455*0f1702c5SYu Xiangning 	so->so_rcv_last_head	= NULL;
456*0f1702c5SYu Xiangning 	so->so_rcv_wanted	= 0;
457*0f1702c5SYu Xiangning 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
458*0f1702c5SYu Xiangning 	so->so_rcv_timer_tid	= 0;
459*0f1702c5SYu Xiangning 	so->so_rcv_thresh	= 0;
460*0f1702c5SYu Xiangning 
461*0f1702c5SYu Xiangning 	so->so_acceptq_head	= NULL;
462*0f1702c5SYu Xiangning 	so->so_acceptq_tail	= &so->so_acceptq_head;
463*0f1702c5SYu Xiangning 	so->so_acceptq_next	= NULL;
464*0f1702c5SYu Xiangning 	so->so_acceptq_len	= 0;
465*0f1702c5SYu Xiangning 	so->so_backlog		= 0;
466*0f1702c5SYu Xiangning 
467*0f1702c5SYu Xiangning 	so->so_snd_qfull	= B_FALSE;
468*0f1702c5SYu Xiangning 
469*0f1702c5SYu Xiangning 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
470*0f1702c5SYu Xiangning 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
471*0f1702c5SYu Xiangning 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
472*0f1702c5SYu Xiangning 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
473*0f1702c5SYu Xiangning 	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
474*0f1702c5SYu Xiangning 
475*0f1702c5SYu Xiangning 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
476*0f1702c5SYu Xiangning 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
477*0f1702c5SYu Xiangning 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
478*0f1702c5SYu Xiangning 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
479*0f1702c5SYu Xiangning 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
480*0f1702c5SYu Xiangning 
481*0f1702c5SYu Xiangning 	return (0);
482*0f1702c5SYu Xiangning }
483*0f1702c5SYu Xiangning 
484*0f1702c5SYu Xiangning /*ARGSUSED*/
485*0f1702c5SYu Xiangning void
486*0f1702c5SYu Xiangning sonode_destructor(void *buf, void *cdrarg)
487*0f1702c5SYu Xiangning {
488*0f1702c5SYu Xiangning 	struct sonode *so = buf;
489*0f1702c5SYu Xiangning 	struct vnode *vp = SOTOV(so);
490*0f1702c5SYu Xiangning 
491*0f1702c5SYu Xiangning 	ASSERT(so->so_priv == NULL);
492*0f1702c5SYu Xiangning 	ASSERT(so->so_peercred == NULL);
493*0f1702c5SYu Xiangning 
494*0f1702c5SYu Xiangning 	ASSERT(so->so_oobmsg == NULL);
495*0f1702c5SYu Xiangning 
496*0f1702c5SYu Xiangning 	ASSERT(so->so_rcv_q_head == NULL);
497*0f1702c5SYu Xiangning 
498*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_head == NULL);
499*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
500*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_next == NULL);
501*0f1702c5SYu Xiangning 
502*0f1702c5SYu Xiangning 	ASSERT(vp->v_data == so);
503*0f1702c5SYu Xiangning 	ASSERT(vn_matchops(vp, socket_vnodeops));
504*0f1702c5SYu Xiangning 
505*0f1702c5SYu Xiangning 	vn_free(vp);
506*0f1702c5SYu Xiangning 
507*0f1702c5SYu Xiangning 	mutex_destroy(&so->so_lock);
508*0f1702c5SYu Xiangning 	mutex_destroy(&so->so_acceptq_lock);
509*0f1702c5SYu Xiangning 	rw_destroy(&so->so_fallback_rwlock);
510*0f1702c5SYu Xiangning 
511*0f1702c5SYu Xiangning 	cv_destroy(&so->so_state_cv);
512*0f1702c5SYu Xiangning 	cv_destroy(&so->so_want_cv);
513*0f1702c5SYu Xiangning 	cv_destroy(&so->so_acceptq_cv);
514*0f1702c5SYu Xiangning 	cv_destroy(&so->so_snd_cv);
515*0f1702c5SYu Xiangning 	cv_destroy(&so->so_rcv_cv);
516*0f1702c5SYu Xiangning 	cv_destroy(&so->so_closing_cv);
517*0f1702c5SYu Xiangning }
518*0f1702c5SYu Xiangning 
519*0f1702c5SYu Xiangning void
520*0f1702c5SYu Xiangning sonode_init(struct sonode *so, struct sockparams *sp, int family,
521*0f1702c5SYu Xiangning     int type, int protocol, sonodeops_t *sops)
522*0f1702c5SYu Xiangning {
523*0f1702c5SYu Xiangning 	vnode_t *vp;
524*0f1702c5SYu Xiangning 
525*0f1702c5SYu Xiangning 	vp = SOTOV(so);
526*0f1702c5SYu Xiangning 
527*0f1702c5SYu Xiangning 	so->so_flag	= 0;
528*0f1702c5SYu Xiangning 
529*0f1702c5SYu Xiangning 	so->so_state	= 0;
530*0f1702c5SYu Xiangning 	so->so_mode	= 0;
531*0f1702c5SYu Xiangning 
532*0f1702c5SYu Xiangning 	so->so_count	= 0;
533*0f1702c5SYu Xiangning 
534*0f1702c5SYu Xiangning 	so->so_family	= family;
535*0f1702c5SYu Xiangning 	so->so_type	= type;
536*0f1702c5SYu Xiangning 	so->so_protocol	= protocol;
537*0f1702c5SYu Xiangning 
538*0f1702c5SYu Xiangning 	SOCK_CONNID_INIT(so->so_proto_connid);
539*0f1702c5SYu Xiangning 
540*0f1702c5SYu Xiangning 	so->so_options	= 0;
541*0f1702c5SYu Xiangning 	so->so_linger.l_onoff   = 0;
542*0f1702c5SYu Xiangning 	so->so_linger.l_linger = 0;
543*0f1702c5SYu Xiangning 	so->so_sndbuf	= 0;
544*0f1702c5SYu Xiangning 	so->so_error	= 0;
545*0f1702c5SYu Xiangning 	so->so_rcvtimeo	= 0;
546*0f1702c5SYu Xiangning 	so->so_sndtimeo = 0;
547*0f1702c5SYu Xiangning 
548*0f1702c5SYu Xiangning 	ASSERT(so->so_oobmsg == NULL);
549*0f1702c5SYu Xiangning 	so->so_oobmark	= 0;
550*0f1702c5SYu Xiangning 	so->so_pgrp	= 0;
551*0f1702c5SYu Xiangning 
552*0f1702c5SYu Xiangning 	ASSERT(so->so_peercred == NULL);
553*0f1702c5SYu Xiangning 
554*0f1702c5SYu Xiangning 	so->so_zoneid = getzoneid();
555*0f1702c5SYu Xiangning 
556*0f1702c5SYu Xiangning 	so->so_sockparams = sp;
557*0f1702c5SYu Xiangning 
558*0f1702c5SYu Xiangning 	so->so_ops = sops;
559*0f1702c5SYu Xiangning 
560*0f1702c5SYu Xiangning 	so->so_proto_handle = NULL;
561*0f1702c5SYu Xiangning 
562*0f1702c5SYu Xiangning 	so->so_downcalls = NULL;
563*0f1702c5SYu Xiangning 
564*0f1702c5SYu Xiangning 	so->so_copyflag = 0;
565*0f1702c5SYu Xiangning 
566*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_head == NULL);
567*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
568*0f1702c5SYu Xiangning 	ASSERT(so->so_acceptq_next == NULL);
569*0f1702c5SYu Xiangning 
570*0f1702c5SYu Xiangning 	vn_reinit(vp);
571*0f1702c5SYu Xiangning 	vp->v_vfsp	= rootvfs;
572*0f1702c5SYu Xiangning 	vp->v_type	= VSOCK;
573*0f1702c5SYu Xiangning 	vp->v_rdev	= sockdev;
574*0f1702c5SYu Xiangning 
575*0f1702c5SYu Xiangning 	so->so_rcv_queued = 0;
576*0f1702c5SYu Xiangning 	so->so_rcv_q_head = NULL;
577*0f1702c5SYu Xiangning 	so->so_rcv_q_last_head = NULL;
578*0f1702c5SYu Xiangning 	so->so_rcv_head	= NULL;
579*0f1702c5SYu Xiangning 	so->so_rcv_last_head = NULL;
580*0f1702c5SYu Xiangning 
581*0f1702c5SYu Xiangning 	so->so_snd_qfull = B_FALSE;
582*0f1702c5SYu Xiangning 	so->so_minpsz = 0;
583*0f1702c5SYu Xiangning 
584*0f1702c5SYu Xiangning 	so->so_rcv_wakeup = B_FALSE;
585*0f1702c5SYu Xiangning 	so->so_snd_wakeup = B_FALSE;
586*0f1702c5SYu Xiangning 	so->so_flowctrld = B_FALSE;
587*0f1702c5SYu Xiangning 
588*0f1702c5SYu Xiangning 	so->so_pollev = 0;
589*0f1702c5SYu Xiangning 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
590*0f1702c5SYu Xiangning 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
591*0f1702c5SYu Xiangning 
592*0f1702c5SYu Xiangning 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
593*0f1702c5SYu Xiangning 	so->so_ksock_cb_arg = NULL;
594*0f1702c5SYu Xiangning 
595*0f1702c5SYu Xiangning 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
596*0f1702c5SYu Xiangning 
597*0f1702c5SYu Xiangning 	so->so_direct = NULL;
598*0f1702c5SYu Xiangning 
599*0f1702c5SYu Xiangning 	vn_exists(vp);
600*0f1702c5SYu Xiangning }
601*0f1702c5SYu Xiangning 
602*0f1702c5SYu Xiangning void
603*0f1702c5SYu Xiangning sonode_fini(struct sonode *so)
604*0f1702c5SYu Xiangning {
605*0f1702c5SYu Xiangning 	mblk_t *mp;
606*0f1702c5SYu Xiangning 	vnode_t *vp;
607*0f1702c5SYu Xiangning 
608*0f1702c5SYu Xiangning 	ASSERT(so->so_count == 0);
609*0f1702c5SYu Xiangning 
610*0f1702c5SYu Xiangning 	if (so->so_rcv_timer_tid) {
611*0f1702c5SYu Xiangning 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
612*0f1702c5SYu Xiangning 		(void) untimeout(so->so_rcv_timer_tid);
613*0f1702c5SYu Xiangning 		so->so_rcv_timer_tid = 0;
614*0f1702c5SYu Xiangning 	}
615*0f1702c5SYu Xiangning 
616*0f1702c5SYu Xiangning 	so_acceptq_flush(so);
617*0f1702c5SYu Xiangning 
618*0f1702c5SYu Xiangning #ifdef DEBUG
619*0f1702c5SYu Xiangning 	mutex_enter(&so->so_lock);
620*0f1702c5SYu Xiangning 	ASSERT(so_verify_oobstate(so));
621*0f1702c5SYu Xiangning 	mutex_exit(&so->so_lock);
622*0f1702c5SYu Xiangning #endif /* DEBUG */
623*0f1702c5SYu Xiangning 	if ((mp = so->so_oobmsg) != NULL) {
624*0f1702c5SYu Xiangning 		freemsg(mp);
625*0f1702c5SYu Xiangning 		so->so_oobmsg = NULL;
626*0f1702c5SYu Xiangning 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
627*0f1702c5SYu Xiangning 		    SS_RCVATMARK);
628*0f1702c5SYu Xiangning 	}
629*0f1702c5SYu Xiangning 
630*0f1702c5SYu Xiangning 	if (so->so_poll_list.ph_list != NULL) {
631*0f1702c5SYu Xiangning 		pollwakeup(&so->so_poll_list, POLLERR);
632*0f1702c5SYu Xiangning 		pollhead_clean(&so->so_poll_list);
633*0f1702c5SYu Xiangning 	}
634*0f1702c5SYu Xiangning 
635*0f1702c5SYu Xiangning 	if (so->so_direct != NULL) {
636*0f1702c5SYu Xiangning 		sodirect_t *sodp = so->so_direct;
637*0f1702c5SYu Xiangning 
638*0f1702c5SYu Xiangning 		ASSERT(sodp->sod_uioafh == NULL);
639*0f1702c5SYu Xiangning 
640*0f1702c5SYu Xiangning 		so->so_direct = NULL;
641*0f1702c5SYu Xiangning 		kmem_cache_free(sock_sod_cache, sodp);
642*0f1702c5SYu Xiangning 	}
643*0f1702c5SYu Xiangning 
644*0f1702c5SYu Xiangning 	vp = SOTOV(so);
645*0f1702c5SYu Xiangning 	vn_invalid(vp);
646*0f1702c5SYu Xiangning 
647*0f1702c5SYu Xiangning 	if (so->so_peercred != NULL) {
648*0f1702c5SYu Xiangning 		crfree(so->so_peercred);
649*0f1702c5SYu Xiangning 		so->so_peercred = NULL;
650*0f1702c5SYu Xiangning 	}
651*0f1702c5SYu Xiangning }
652*0f1702c5SYu Xiangning 
653*0f1702c5SYu Xiangning /*
654*0f1702c5SYu Xiangning  * This function is called at the beginning of recvmsg().
655*0f1702c5SYu Xiangning  *
656*0f1702c5SYu Xiangning  * If I/OAT is enabled on this sonode, initialize the uioa state machine
657*0f1702c5SYu Xiangning  * with state UIOA_ALLOC.
658*0f1702c5SYu Xiangning  */
659*0f1702c5SYu Xiangning uio_t *
660*0f1702c5SYu Xiangning sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
661*0f1702c5SYu Xiangning {
662*0f1702c5SYu Xiangning 	struct uio *suiop;
663*0f1702c5SYu Xiangning 	struct uio *uiop;
664*0f1702c5SYu Xiangning 	sodirect_t *sodp = so->so_direct;
665*0f1702c5SYu Xiangning 
666*0f1702c5SYu Xiangning 	if (sodp == NULL)
667*0f1702c5SYu Xiangning 		return (NULL);
668*0f1702c5SYu Xiangning 
669*0f1702c5SYu Xiangning 	suiop = NULL;
670*0f1702c5SYu Xiangning 	uiop = *uiopp;
671*0f1702c5SYu Xiangning 
672*0f1702c5SYu Xiangning 	mutex_enter(sodp->sod_lockp);
673*0f1702c5SYu Xiangning 	if (uiop->uio_resid >= uioasync.mincnt &&
674*0f1702c5SYu Xiangning 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
675*0f1702c5SYu Xiangning 	    uioasync.enabled && !(flags & MSG_PEEK) &&
676*0f1702c5SYu Xiangning 	    !(so->so_state & SS_CANTRCVMORE)) {
677*0f1702c5SYu Xiangning 		/*
678*0f1702c5SYu Xiangning 		 * Big enough I/O for uioa min setup and an sodirect socket
679*0f1702c5SYu Xiangning 		 * and sodirect enabled and uioa enabled and I/O will be done
680*0f1702c5SYu Xiangning 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
681*0f1702c5SYu Xiangning 		 */
682*0f1702c5SYu Xiangning 		if (!uioainit(uiop, &sodp->sod_uioa)) {
683*0f1702c5SYu Xiangning 			/*
684*0f1702c5SYu Xiangning 			 * Successful uioainit() so the uio_t part of the
685*0f1702c5SYu Xiangning 			 * uioa_t will be used for all uio_t work to follow,
686*0f1702c5SYu Xiangning 			 * we return the original "uiop" in "suiop".
687*0f1702c5SYu Xiangning 			 */
688*0f1702c5SYu Xiangning 			suiop = uiop;
689*0f1702c5SYu Xiangning 			*uiopp = (uio_t *)&sodp->sod_uioa;
690*0f1702c5SYu Xiangning 			/*
691*0f1702c5SYu Xiangning 			 * Before returning to the caller the passed in uio_t
692*0f1702c5SYu Xiangning 			 * "uiop" will be updated via a call to uioafini()
693*0f1702c5SYu Xiangning 			 * below.
694*0f1702c5SYu Xiangning 			 *
695*0f1702c5SYu Xiangning 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
696*0f1702c5SYu Xiangning 			 * here as first we have to uioamove() any currently
697*0f1702c5SYu Xiangning 			 * queued M_DATA mblk_t(s) so it will be done later.
698*0f1702c5SYu Xiangning 			 */
699*0f1702c5SYu Xiangning 		}
700*0f1702c5SYu Xiangning 		/*
701*0f1702c5SYu Xiangning 		 * In either uioainit() success or not case note the number
702*0f1702c5SYu Xiangning 		 * of uio bytes the caller wants for sod framework and/or
703*0f1702c5SYu Xiangning 		 * transport (e.g. TCP) strategy.
704*0f1702c5SYu Xiangning 		 */
705*0f1702c5SYu Xiangning 		sodp->sod_want = uiop->uio_resid;
706*0f1702c5SYu Xiangning 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
707*0f1702c5SYu Xiangning 		/*
708*0f1702c5SYu Xiangning 		 * No uioa but still using sodirect so note the number of
709*0f1702c5SYu Xiangning 		 * uio bytes the caller wants for sodirect framework and/or
710*0f1702c5SYu Xiangning 		 * transport (e.g. TCP) strategy.
711*0f1702c5SYu Xiangning 		 */
712*0f1702c5SYu Xiangning 		sodp->sod_want = uiop->uio_resid;
713*0f1702c5SYu Xiangning 	}
714*0f1702c5SYu Xiangning 	mutex_exit(sodp->sod_lockp);
715*0f1702c5SYu Xiangning 
716*0f1702c5SYu Xiangning 	return (suiop);
717*0f1702c5SYu Xiangning }
718*0f1702c5SYu Xiangning 
719*0f1702c5SYu Xiangning /*
720*0f1702c5SYu Xiangning  * This function is called at the end of recvmsg(), it finializes all the I/OAT
721*0f1702c5SYu Xiangning  * operations, and reset the uioa state to UIOA_ALLOC.
722*0f1702c5SYu Xiangning  */
723*0f1702c5SYu Xiangning int
724*0f1702c5SYu Xiangning sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
725*0f1702c5SYu Xiangning {
726*0f1702c5SYu Xiangning 	int error = 0;
727*0f1702c5SYu Xiangning 	sodirect_t *sodp = so->so_direct;
728*0f1702c5SYu Xiangning 	mblk_t *mp;
729*0f1702c5SYu Xiangning 
730*0f1702c5SYu Xiangning 	if (sodp == NULL) {
731*0f1702c5SYu Xiangning 		return (0);
732*0f1702c5SYu Xiangning 	}
733*0f1702c5SYu Xiangning 
734*0f1702c5SYu Xiangning 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
735*0f1702c5SYu Xiangning 	/* Finish any sodirect and uioa processing */
736*0f1702c5SYu Xiangning 	if (suiop != NULL) {
737*0f1702c5SYu Xiangning 		/* Finish any uioa_t processing */
738*0f1702c5SYu Xiangning 
739*0f1702c5SYu Xiangning 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
740*0f1702c5SYu Xiangning 		error = uioafini(suiop, (uioa_t *)uiop);
741*0f1702c5SYu Xiangning 		if ((mp = sodp->sod_uioafh) != NULL) {
742*0f1702c5SYu Xiangning 			sodp->sod_uioafh = NULL;
743*0f1702c5SYu Xiangning 			sodp->sod_uioaft = NULL;
744*0f1702c5SYu Xiangning 			freemsg(mp);
745*0f1702c5SYu Xiangning 		}
746*0f1702c5SYu Xiangning 	}
747*0f1702c5SYu Xiangning 	ASSERT(sodp->sod_uioafh == NULL);
748*0f1702c5SYu Xiangning 	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
749*0f1702c5SYu Xiangning 		/* Awoke */
750*0f1702c5SYu Xiangning 		sodp->sod_state &= SOD_WAKE_CLR;
751*0f1702c5SYu Xiangning 		sodp->sod_state |= SOD_WAKE_NOT;
752*0f1702c5SYu Xiangning 	}
753*0f1702c5SYu Xiangning 	/* Last, clear sod_want value */
754*0f1702c5SYu Xiangning 	sodp->sod_want = 0;
755*0f1702c5SYu Xiangning 
756*0f1702c5SYu Xiangning 	return (error);
757*0f1702c5SYu Xiangning }
758*0f1702c5SYu Xiangning 
759*0f1702c5SYu Xiangning /*
760*0f1702c5SYu Xiangning  * Schedule a uioamove() on a mblk. This is ususally called from
761*0f1702c5SYu Xiangning  * protocols (e.g. TCP) on a I/OAT enabled sonode.
762*0f1702c5SYu Xiangning  */
763*0f1702c5SYu Xiangning mblk_t *
764*0f1702c5SYu Xiangning sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
765*0f1702c5SYu Xiangning {
766*0f1702c5SYu Xiangning 	uioa_t *uioap = &sodp->sod_uioa;
767*0f1702c5SYu Xiangning 	mblk_t *mp1 = mp;
768*0f1702c5SYu Xiangning 	mblk_t *lmp = NULL;
769*0f1702c5SYu Xiangning 
770*0f1702c5SYu Xiangning 	ASSERT(DB_TYPE(mp) == M_DATA);
771*0f1702c5SYu Xiangning 	ASSERT(msg_size == msgdsize(mp));
772*0f1702c5SYu Xiangning 
773*0f1702c5SYu Xiangning 	/* Caller must have lock held */
774*0f1702c5SYu Xiangning 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
775*0f1702c5SYu Xiangning 
776*0f1702c5SYu Xiangning 	if (uioap->uioa_state & UIOA_ENABLED) {
777*0f1702c5SYu Xiangning 		/* Uioa is enabled */
778*0f1702c5SYu Xiangning 
779*0f1702c5SYu Xiangning 		if (msg_size > uioap->uio_resid) {
780*0f1702c5SYu Xiangning 			/*
781*0f1702c5SYu Xiangning 			 * There isn't enough uio space for the mblk_t chain
782*0f1702c5SYu Xiangning 			 * so disable uioa such that this and any additional
783*0f1702c5SYu Xiangning 			 * mblk_t data is handled by the socket and schedule
784*0f1702c5SYu Xiangning 			 * the socket for wakeup to finish this uioa.
785*0f1702c5SYu Xiangning 			 */
786*0f1702c5SYu Xiangning 			uioap->uioa_state &= UIOA_CLR;
787*0f1702c5SYu Xiangning 			uioap->uioa_state |= UIOA_FINI;
788*0f1702c5SYu Xiangning 			if (sodp->sod_state & SOD_WAKE_NOT) {
789*0f1702c5SYu Xiangning 				sodp->sod_state &= SOD_WAKE_CLR;
790*0f1702c5SYu Xiangning 				sodp->sod_state |= SOD_WAKE_NEED;
791*0f1702c5SYu Xiangning 			}
792*0f1702c5SYu Xiangning 			return (mp);
793*0f1702c5SYu Xiangning 		}
794*0f1702c5SYu Xiangning 		do {
795*0f1702c5SYu Xiangning 			uint32_t	len = MBLKL(mp1);
796*0f1702c5SYu Xiangning 
797*0f1702c5SYu Xiangning 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
798*0f1702c5SYu Xiangning 				/* Scheduled, mark dblk_t as such */
799*0f1702c5SYu Xiangning 				DB_FLAGS(mp1) |= DBLK_UIOA;
800*0f1702c5SYu Xiangning 			} else {
801*0f1702c5SYu Xiangning 				/* Error, turn off async processing */
802*0f1702c5SYu Xiangning 				uioap->uioa_state &= UIOA_CLR;
803*0f1702c5SYu Xiangning 				uioap->uioa_state |= UIOA_FINI;
804*0f1702c5SYu Xiangning 				break;
805*0f1702c5SYu Xiangning 			}
806*0f1702c5SYu Xiangning 			lmp = mp1;
807*0f1702c5SYu Xiangning 		} while ((mp1 = mp1->b_cont) != NULL);
808*0f1702c5SYu Xiangning 
809*0f1702c5SYu Xiangning 		if (mp1 != NULL || uioap->uio_resid == 0) {
810*0f1702c5SYu Xiangning 			/*
811*0f1702c5SYu Xiangning 			 * Not all mblk_t(s) uioamoved (error) or all uio
812*0f1702c5SYu Xiangning 			 * space has been consumed so schedule the socket
813*0f1702c5SYu Xiangning 			 * for wakeup to finish this uio.
814*0f1702c5SYu Xiangning 			 */
815*0f1702c5SYu Xiangning 			sodp->sod_state &= SOD_WAKE_CLR;
816*0f1702c5SYu Xiangning 			sodp->sod_state |= SOD_WAKE_NEED;
817*0f1702c5SYu Xiangning 
818*0f1702c5SYu Xiangning 			/* Break the mblk chain if neccessary. */
819*0f1702c5SYu Xiangning 			if (mp1 != NULL && lmp != NULL) {
820*0f1702c5SYu Xiangning 				mp->b_next = mp1;
821*0f1702c5SYu Xiangning 				lmp->b_cont = NULL;
822*0f1702c5SYu Xiangning 			}
823*0f1702c5SYu Xiangning 		}
824*0f1702c5SYu Xiangning 	}
825*0f1702c5SYu Xiangning 	return (mp1);
826*0f1702c5SYu Xiangning }
827*0f1702c5SYu Xiangning 
828*0f1702c5SYu Xiangning /*
829*0f1702c5SYu Xiangning  * This function is called on a mblk that thas been successfully uioamoved().
830*0f1702c5SYu Xiangning  */
831*0f1702c5SYu Xiangning void
832*0f1702c5SYu Xiangning sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
833*0f1702c5SYu Xiangning {
834*0f1702c5SYu Xiangning 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
835*0f1702c5SYu Xiangning 		/*
836*0f1702c5SYu Xiangning 		 * A uioa flaged mblk_t chain, already uio processed,
837*0f1702c5SYu Xiangning 		 * add it to the sodirect uioa pending free list.
838*0f1702c5SYu Xiangning 		 *
839*0f1702c5SYu Xiangning 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
840*0f1702c5SYu Xiangning 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
841*0f1702c5SYu Xiangning 		 */
842*0f1702c5SYu Xiangning 		mblk_t	*bpt = sodp->sod_uioaft;
843*0f1702c5SYu Xiangning 
844*0f1702c5SYu Xiangning 		ASSERT(sodp != NULL);
845*0f1702c5SYu Xiangning 
846*0f1702c5SYu Xiangning 		/*
847*0f1702c5SYu Xiangning 		 * Add first mblk_t of "bp" chain to current sodirect uioa
848*0f1702c5SYu Xiangning 		 * free list tail mblk_t, if any, else empty list so new head.
849*0f1702c5SYu Xiangning 		 */
850*0f1702c5SYu Xiangning 		if (bpt == NULL)
851*0f1702c5SYu Xiangning 			sodp->sod_uioafh = bp;
852*0f1702c5SYu Xiangning 		else
853*0f1702c5SYu Xiangning 			bpt->b_cont = bp;
854*0f1702c5SYu Xiangning 
855*0f1702c5SYu Xiangning 		/*
856*0f1702c5SYu Xiangning 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
857*0f1702c5SYu Xiangning 		 * each to reflect that uioamove() has consumed all data.
858*0f1702c5SYu Xiangning 		 */
859*0f1702c5SYu Xiangning 		bpt = bp;
860*0f1702c5SYu Xiangning 		for (;;) {
861*0f1702c5SYu Xiangning 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
862*0f1702c5SYu Xiangning 
863*0f1702c5SYu Xiangning 			bpt->b_rptr = bpt->b_wptr;
864*0f1702c5SYu Xiangning 			if (bpt->b_cont == NULL)
865*0f1702c5SYu Xiangning 				break;
866*0f1702c5SYu Xiangning 			bpt = bpt->b_cont;
867*0f1702c5SYu Xiangning 		}
868*0f1702c5SYu Xiangning 		/* New sodirect uioa free list tail */
869*0f1702c5SYu Xiangning 		sodp->sod_uioaft = bpt;
870*0f1702c5SYu Xiangning 
871*0f1702c5SYu Xiangning 		/* Only dequeue once with data returned per uioa_t */
872*0f1702c5SYu Xiangning 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
873*0f1702c5SYu Xiangning 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
874*0f1702c5SYu Xiangning 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
875*0f1702c5SYu Xiangning 		}
876*0f1702c5SYu Xiangning 	}
877*0f1702c5SYu Xiangning }
878*0f1702c5SYu Xiangning 
879*0f1702c5SYu Xiangning /*
880*0f1702c5SYu Xiangning  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
881*0f1702c5SYu Xiangning  * this function on a non-STREAMS socket to schedule uioamove() on the data
882*0f1702c5SYu Xiangning  * that has already queued in this socket.
883*0f1702c5SYu Xiangning  */
884*0f1702c5SYu Xiangning void
885*0f1702c5SYu Xiangning sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
886*0f1702c5SYu Xiangning {
887*0f1702c5SYu Xiangning 	uioa_t	*uioap = (uioa_t *)uiop;
888*0f1702c5SYu Xiangning 	mblk_t	*lbp;
889*0f1702c5SYu Xiangning 	mblk_t	*wbp;
890*0f1702c5SYu Xiangning 	mblk_t	*bp;
891*0f1702c5SYu Xiangning 	int	len;
892*0f1702c5SYu Xiangning 	int	error;
893*0f1702c5SYu Xiangning 	boolean_t in_rcv_q = B_TRUE;
894*0f1702c5SYu Xiangning 
895*0f1702c5SYu Xiangning 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
896*0f1702c5SYu Xiangning 	ASSERT(&sodp->sod_uioa == uioap);
897*0f1702c5SYu Xiangning 
898*0f1702c5SYu Xiangning 	/*
899*0f1702c5SYu Xiangning 	 * Walk first b_cont chain in sod_q
900*0f1702c5SYu Xiangning 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
901*0f1702c5SYu Xiangning 	 */
902*0f1702c5SYu Xiangning 	bp = so->so_rcv_q_head;
903*0f1702c5SYu Xiangning 
904*0f1702c5SYu Xiangning again:
905*0f1702c5SYu Xiangning 	/* Walk the chain */
906*0f1702c5SYu Xiangning 	lbp = NULL;
907*0f1702c5SYu Xiangning 	wbp = bp;
908*0f1702c5SYu Xiangning 
909*0f1702c5SYu Xiangning 	do {
910*0f1702c5SYu Xiangning 		if (bp == NULL)
911*0f1702c5SYu Xiangning 			break;
912*0f1702c5SYu Xiangning 
913*0f1702c5SYu Xiangning 		if (wbp->b_datap->db_type != M_DATA) {
914*0f1702c5SYu Xiangning 			/* Not M_DATA, no more uioa */
915*0f1702c5SYu Xiangning 			goto nouioa;
916*0f1702c5SYu Xiangning 		}
917*0f1702c5SYu Xiangning 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
918*0f1702c5SYu Xiangning 			/* Have a M_DATA mblk_t with data */
919*0f1702c5SYu Xiangning 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
920*0f1702c5SYu Xiangning 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
921*0f1702c5SYu Xiangning 				/* Not enough uio sapce, or beyond oobmark */
922*0f1702c5SYu Xiangning 				goto nouioa;
923*0f1702c5SYu Xiangning 			}
924*0f1702c5SYu Xiangning 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
925*0f1702c5SYu Xiangning 			error = uioamove(wbp->b_rptr, len,
926*0f1702c5SYu Xiangning 			    UIO_READ, uioap);
927*0f1702c5SYu Xiangning 			if (!error) {
928*0f1702c5SYu Xiangning 				/* Scheduled, mark dblk_t as such */
929*0f1702c5SYu Xiangning 				wbp->b_datap->db_flags |= DBLK_UIOA;
930*0f1702c5SYu Xiangning 			} else {
931*0f1702c5SYu Xiangning 				/* Break the mblk chain */
932*0f1702c5SYu Xiangning 				goto nouioa;
933*0f1702c5SYu Xiangning 			}
934*0f1702c5SYu Xiangning 		}
935*0f1702c5SYu Xiangning 		/* Save last wbp processed */
936*0f1702c5SYu Xiangning 		lbp = wbp;
937*0f1702c5SYu Xiangning 	} while ((wbp = wbp->b_cont) != NULL);
938*0f1702c5SYu Xiangning 
939*0f1702c5SYu Xiangning 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
940*0f1702c5SYu Xiangning 		/*
941*0f1702c5SYu Xiangning 		 * We get here only once to process the sonode dump area
942*0f1702c5SYu Xiangning 		 * if so_rcv_q_head is NULL or all the mblks have been
943*0f1702c5SYu Xiangning 		 * successfully uioamoved()ed.
944*0f1702c5SYu Xiangning 		 */
945*0f1702c5SYu Xiangning 		in_rcv_q = B_FALSE;
946*0f1702c5SYu Xiangning 
947*0f1702c5SYu Xiangning 		/* move to dump area */
948*0f1702c5SYu Xiangning 		bp = so->so_rcv_head;
949*0f1702c5SYu Xiangning 		goto again;
950*0f1702c5SYu Xiangning 	}
951*0f1702c5SYu Xiangning 
952*0f1702c5SYu Xiangning 	return;
953*0f1702c5SYu Xiangning 
954*0f1702c5SYu Xiangning nouioa:
955*0f1702c5SYu Xiangning 	/* No more uioa */
956*0f1702c5SYu Xiangning 	uioap->uioa_state &= UIOA_CLR;
957*0f1702c5SYu Xiangning 	uioap->uioa_state |= UIOA_FINI;
958*0f1702c5SYu Xiangning 
959*0f1702c5SYu Xiangning 	/*
960*0f1702c5SYu Xiangning 	 * If we processed 1 or more mblk_t(s) then we need to split the
961*0f1702c5SYu Xiangning 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
962*0f1702c5SYu Xiangning 	 * are in the current chain and the rest are in the following new
963*0f1702c5SYu Xiangning 	 * chain.
964*0f1702c5SYu Xiangning 	 */
965*0f1702c5SYu Xiangning 	if (lbp != NULL) {
966*0f1702c5SYu Xiangning 		/* New end of current chain */
967*0f1702c5SYu Xiangning 		lbp->b_cont = NULL;
968*0f1702c5SYu Xiangning 
969*0f1702c5SYu Xiangning 		/* Insert new chain wbp after bp */
970*0f1702c5SYu Xiangning 		if ((wbp->b_next = bp->b_next) == NULL) {
971*0f1702c5SYu Xiangning 			/*
972*0f1702c5SYu Xiangning 			 * No need to grab so_lock, since sod_lockp
973*0f1702c5SYu Xiangning 			 * points to so_lock.
974*0f1702c5SYu Xiangning 			 */
975*0f1702c5SYu Xiangning 			if (in_rcv_q)
976*0f1702c5SYu Xiangning 				so->so_rcv_q_last_head = wbp;
977*0f1702c5SYu Xiangning 			else
978*0f1702c5SYu Xiangning 				so->so_rcv_last_head = wbp;
979*0f1702c5SYu Xiangning 		}
980*0f1702c5SYu Xiangning 		bp->b_next = wbp;
981*0f1702c5SYu Xiangning 		bp->b_next->b_prev = bp->b_prev;
982*0f1702c5SYu Xiangning 		bp->b_prev = lbp;
983*0f1702c5SYu Xiangning 	}
984*0f1702c5SYu Xiangning }
985*0f1702c5SYu Xiangning 
986*0f1702c5SYu Xiangning /*
987*0f1702c5SYu Xiangning  * Initialize sodirect data structures on a socket.
988*0f1702c5SYu Xiangning  */
989*0f1702c5SYu Xiangning void
990*0f1702c5SYu Xiangning sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
991*0f1702c5SYu Xiangning     sod_wakeup_func wake_func, kmutex_t *lockp)
992*0f1702c5SYu Xiangning {
993*0f1702c5SYu Xiangning 	sodirect_t	*sodp;
994*0f1702c5SYu Xiangning 
995*0f1702c5SYu Xiangning 	ASSERT(so->so_direct == NULL);
996*0f1702c5SYu Xiangning 
997*0f1702c5SYu Xiangning 	so->so_state |= SS_SODIRECT;
998*0f1702c5SYu Xiangning 
999*0f1702c5SYu Xiangning 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
1000*0f1702c5SYu Xiangning 	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
1001*0f1702c5SYu Xiangning 	sodp->sod_want = 0;
1002*0f1702c5SYu Xiangning 	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
1003*0f1702c5SYu Xiangning 	sodp->sod_enqueue = enq_func;
1004*0f1702c5SYu Xiangning 	sodp->sod_wakeup = wake_func;
1005*0f1702c5SYu Xiangning 	sodp->sod_uioafh = NULL;
1006*0f1702c5SYu Xiangning 	sodp->sod_uioaft = NULL;
1007*0f1702c5SYu Xiangning 	sodp->sod_lockp = lockp;
1008*0f1702c5SYu Xiangning 	/*
1009*0f1702c5SYu Xiangning 	 * Remainder of the sod_uioa members are left uninitialized
1010*0f1702c5SYu Xiangning 	 * but will be initialized later by uioainit() before uioa
1011*0f1702c5SYu Xiangning 	 * is enabled.
1012*0f1702c5SYu Xiangning 	 */
1013*0f1702c5SYu Xiangning 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
1014*0f1702c5SYu Xiangning 	so->so_direct = sodp;
1015*0f1702c5SYu Xiangning 	if (stp != NULL)
1016*0f1702c5SYu Xiangning 		stp->sd_sodirect = sodp;
1017*0f1702c5SYu Xiangning }
1018*0f1702c5SYu Xiangning 
1019*0f1702c5SYu Xiangning /*
1020*0f1702c5SYu Xiangning  * Init the sodirect kmem cache while sockfs is loading.
1021*0f1702c5SYu Xiangning  */
1022*0f1702c5SYu Xiangning void
1023*0f1702c5SYu Xiangning sod_init()
1024*0f1702c5SYu Xiangning {
1025*0f1702c5SYu Xiangning 	/* Allocate sodirect_t kmem_cache */
1026*0f1702c5SYu Xiangning 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
1027*0f1702c5SYu Xiangning 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1028*0f1702c5SYu Xiangning }
1029*0f1702c5SYu Xiangning 
1030*0f1702c5SYu Xiangning ssize_t
1031*0f1702c5SYu Xiangning sod_uioa_mblk(struct sonode *so, mblk_t *mp)
1032*0f1702c5SYu Xiangning {
1033*0f1702c5SYu Xiangning 	sodirect_t *sodp = so->so_direct;
1034*0f1702c5SYu Xiangning 
1035*0f1702c5SYu Xiangning 	ASSERT(sodp != NULL);
1036*0f1702c5SYu Xiangning 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
1037*0f1702c5SYu Xiangning 
1038*0f1702c5SYu Xiangning 	ASSERT(sodp->sod_state & SOD_ENABLED);
1039*0f1702c5SYu Xiangning 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
1040*0f1702c5SYu Xiangning 
1041*0f1702c5SYu Xiangning 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
1042*0f1702c5SYu Xiangning 
1043*0f1702c5SYu Xiangning 	if (mp == NULL && so->so_rcv_q_head != NULL) {
1044*0f1702c5SYu Xiangning 		mp = so->so_rcv_q_head;
1045*0f1702c5SYu Xiangning 		ASSERT(mp->b_prev != NULL);
1046*0f1702c5SYu Xiangning 		mp->b_prev = NULL;
1047*0f1702c5SYu Xiangning 		so->so_rcv_q_head = mp->b_next;
1048*0f1702c5SYu Xiangning 		if (so->so_rcv_q_head == NULL) {
1049*0f1702c5SYu Xiangning 			so->so_rcv_q_last_head = NULL;
1050*0f1702c5SYu Xiangning 		}
1051*0f1702c5SYu Xiangning 		mp->b_next = NULL;
1052*0f1702c5SYu Xiangning 	}
1053*0f1702c5SYu Xiangning 
1054*0f1702c5SYu Xiangning 	sod_uioa_mblk_done(sodp, mp);
1055*0f1702c5SYu Xiangning 
1056*0f1702c5SYu Xiangning 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
1057*0f1702c5SYu Xiangning 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
1058*0f1702c5SYu Xiangning 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
1059*0f1702c5SYu Xiangning 		/* more arrived */
1060*0f1702c5SYu Xiangning 		ASSERT(so->so_rcv_q_head == NULL);
1061*0f1702c5SYu Xiangning 		mp = so->so_rcv_head;
1062*0f1702c5SYu Xiangning 		so->so_rcv_head = mp->b_next;
1063*0f1702c5SYu Xiangning 		if (so->so_rcv_head == NULL)
1064*0f1702c5SYu Xiangning 			so->so_rcv_last_head = NULL;
1065*0f1702c5SYu Xiangning 		mp->b_prev = mp->b_next = NULL;
1066*0f1702c5SYu Xiangning 		sod_uioa_mblk_done(sodp, mp);
1067*0f1702c5SYu Xiangning 	}
1068*0f1702c5SYu Xiangning 
1069*0f1702c5SYu Xiangning #ifdef DEBUG
1070*0f1702c5SYu Xiangning 	if (so->so_rcv_q_head != NULL) {
1071*0f1702c5SYu Xiangning 		mblk_t *m = so->so_rcv_q_head;
1072*0f1702c5SYu Xiangning 		while (m != NULL) {
1073*0f1702c5SYu Xiangning 			if (DB_FLAGS(m) & DBLK_UIOA) {
1074*0f1702c5SYu Xiangning 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1075*0f1702c5SYu Xiangning 				    " in so_rcv_q_head.\n", (void *)m);
1076*0f1702c5SYu Xiangning 			}
1077*0f1702c5SYu Xiangning 			m = m->b_next;
1078*0f1702c5SYu Xiangning 		}
1079*0f1702c5SYu Xiangning 	}
1080*0f1702c5SYu Xiangning 	if (so->so_rcv_head != NULL) {
1081*0f1702c5SYu Xiangning 		mblk_t *m = so->so_rcv_head;
1082*0f1702c5SYu Xiangning 		while (m != NULL) {
1083*0f1702c5SYu Xiangning 			if (DB_FLAGS(m) & DBLK_UIOA) {
1084*0f1702c5SYu Xiangning 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1085*0f1702c5SYu Xiangning 				    " in so_rcv_head.\n", (void *)m);
1086*0f1702c5SYu Xiangning 			}
1087*0f1702c5SYu Xiangning 			m = m->b_next;
1088*0f1702c5SYu Xiangning 		}
1089*0f1702c5SYu Xiangning 	}
1090*0f1702c5SYu Xiangning #endif
1091*0f1702c5SYu Xiangning 	return (sodp->sod_uioa.uioa_mbytes);
1092*0f1702c5SYu Xiangning }
1093