xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision f67b7b8d4cffb2612609ecbca47fc3d9e1d65d8a)
10f1702c5SYu Xiangning /*
20f1702c5SYu Xiangning  * CDDL HEADER START
30f1702c5SYu Xiangning  *
40f1702c5SYu Xiangning  * The contents of this file are subject to the terms of the
50f1702c5SYu Xiangning  * Common Development and Distribution License (the "License").
60f1702c5SYu Xiangning  * You may not use this file except in compliance with the License.
70f1702c5SYu Xiangning  *
80f1702c5SYu Xiangning  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90f1702c5SYu Xiangning  * or http://www.opensolaris.org/os/licensing.
100f1702c5SYu Xiangning  * See the License for the specific language governing permissions
110f1702c5SYu Xiangning  * and limitations under the License.
120f1702c5SYu Xiangning  *
130f1702c5SYu Xiangning  * When distributing Covered Code, include this CDDL HEADER in each
140f1702c5SYu Xiangning  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150f1702c5SYu Xiangning  * If applicable, add the following below this CDDL HEADER, with the
160f1702c5SYu Xiangning  * fields enclosed by brackets "[]" replaced with your own identifying
170f1702c5SYu Xiangning  * information: Portions Copyright [yyyy] [name of copyright owner]
180f1702c5SYu Xiangning  *
190f1702c5SYu Xiangning  * CDDL HEADER END
200f1702c5SYu Xiangning  */
210f1702c5SYu Xiangning 
220f1702c5SYu Xiangning /*
233e95bd4aSAnders Persson  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24*f67b7b8dSSebastian Wiedenroth  * Copyright 2017 Sebastian Wiedenroth
250f1702c5SYu Xiangning  */
260f1702c5SYu Xiangning 
270f1702c5SYu Xiangning #include <sys/types.h>
280f1702c5SYu Xiangning #include <sys/param.h>
290f1702c5SYu Xiangning #include <sys/systm.h>
300f1702c5SYu Xiangning #include <sys/sysmacros.h>
310f1702c5SYu Xiangning #include <sys/debug.h>
320f1702c5SYu Xiangning #include <sys/cmn_err.h>
330f1702c5SYu Xiangning #include <sys/vfs.h>
340f1702c5SYu Xiangning #include <sys/policy.h>
350f1702c5SYu Xiangning #include <sys/modctl.h>
360f1702c5SYu Xiangning 
370f1702c5SYu Xiangning #include <sys/sunddi.h>
380f1702c5SYu Xiangning 
390f1702c5SYu Xiangning #include <sys/strsun.h>
400f1702c5SYu Xiangning #include <sys/stropts.h>
410f1702c5SYu Xiangning #include <sys/strsubr.h>
420f1702c5SYu Xiangning #include <sys/socket.h>
430f1702c5SYu Xiangning #include <sys/socketvar.h>
440f1702c5SYu Xiangning #include <sys/uio.h>
450f1702c5SYu Xiangning 
460f1702c5SYu Xiangning #include <inet/ipclassifier.h>
470f1702c5SYu Xiangning #include <fs/sockfs/sockcommon.h>
483e95bd4aSAnders Persson #include <fs/sockfs/sockfilter_impl.h>
490f1702c5SYu Xiangning #include <fs/sockfs/nl7c.h>
50d36be52eSRao Shoaib #include <fs/sockfs/socktpi.h>
51bbc000e5SAnders Persson #include <fs/sockfs/sodirect.h>
520f1702c5SYu Xiangning #include <inet/ip.h>
530f1702c5SYu Xiangning 
540f1702c5SYu Xiangning extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
550f1702c5SYu Xiangning 
560f1702c5SYu Xiangning /*
570f1702c5SYu Xiangning  * Common socket access functions.
580f1702c5SYu Xiangning  *
590f1702c5SYu Xiangning  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
600f1702c5SYu Xiangning  * the socket_xxx() function should be used.
610f1702c5SYu Xiangning  */
620f1702c5SYu Xiangning 
630f1702c5SYu Xiangning /*
640f1702c5SYu Xiangning  * Try to create a new sonode of the requested <family, type, protocol>.
650f1702c5SYu Xiangning  */
660f1702c5SYu Xiangning /* ARGSUSED */
670f1702c5SYu Xiangning struct sonode *
680f1702c5SYu Xiangning socket_create(int family, int type, int protocol, char *devpath, char *mod,
690f1702c5SYu Xiangning     int flags, int version, struct cred *cr, int *errorp)
700f1702c5SYu Xiangning {
710f1702c5SYu Xiangning 	struct sonode *so;
720f1702c5SYu Xiangning 	struct sockparams *sp = NULL;
7322238f73Sshenjian 	int saved_error;
740f1702c5SYu Xiangning 
750f1702c5SYu Xiangning 	/*
760f1702c5SYu Xiangning 	 * Look for a sockparams entry that match the given criteria.
770f1702c5SYu Xiangning 	 * solookup() returns with the entry held.
780f1702c5SYu Xiangning 	 */
790f1702c5SYu Xiangning 	*errorp = solookup(family, type, protocol, &sp);
8022238f73Sshenjian 	saved_error = *errorp;
810f1702c5SYu Xiangning 	if (sp == NULL) {
820f1702c5SYu Xiangning 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
830f1702c5SYu Xiangning 		/*
840f1702c5SYu Xiangning 		 * There is no matching sockparams entry. An ephemeral entry is
850f1702c5SYu Xiangning 		 * created if the caller specifies a device or a socket module.
860f1702c5SYu Xiangning 		 */
870f1702c5SYu Xiangning 		if (devpath != NULL) {
8822238f73Sshenjian 			saved_error = 0;
890f1702c5SYu Xiangning 			sp = sockparams_hold_ephemeral_bydev(family, type,
900f1702c5SYu Xiangning 			    protocol, devpath, kmflags, errorp);
910f1702c5SYu Xiangning 		} else if (mod != NULL) {
9222238f73Sshenjian 			saved_error = 0;
930f1702c5SYu Xiangning 			sp = sockparams_hold_ephemeral_bymod(family, type,
940f1702c5SYu Xiangning 			    protocol, mod, kmflags, errorp);
950f1702c5SYu Xiangning 		} else {
9622238f73Sshenjian 			*errorp = solookup(family, type, 0, &sp);
970f1702c5SYu Xiangning 		}
980f1702c5SYu Xiangning 
9922238f73Sshenjian 		if (sp == NULL) {
10022238f73Sshenjian 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
10122238f73Sshenjian 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
10222238f73Sshenjian 				*errorp = saved_error;
1030f1702c5SYu Xiangning 			return (NULL);
10422238f73Sshenjian 		}
1050f1702c5SYu Xiangning 	}
1060f1702c5SYu Xiangning 
1070f1702c5SYu Xiangning 	ASSERT(sp->sp_smod_info != NULL);
1080f1702c5SYu Xiangning 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
1097d64f41bSAnders Persson 	sp->sp_stats.sps_ncreate.value.ui64++;
1100f1702c5SYu Xiangning 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
1110f1702c5SYu Xiangning 	    protocol, version, flags, errorp, cr);
1120f1702c5SYu Xiangning 	if (so == NULL) {
1130f1702c5SYu Xiangning 		SOCKPARAMS_DEC_REF(sp);
1140f1702c5SYu Xiangning 	} else {
1150f1702c5SYu Xiangning 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
1160f1702c5SYu Xiangning 			/* Cannot fail, only bumps so_count */
1170f1702c5SYu Xiangning 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
1180f1702c5SYu Xiangning 		} else {
11922238f73Sshenjian 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
12022238f73Sshenjian 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
12122238f73Sshenjian 				*errorp = saved_error;
1220f1702c5SYu Xiangning 			socket_destroy(so);
1230f1702c5SYu Xiangning 			so = NULL;
1240f1702c5SYu Xiangning 		}
1250f1702c5SYu Xiangning 	}
1260f1702c5SYu Xiangning 	return (so);
1270f1702c5SYu Xiangning }
1280f1702c5SYu Xiangning 
1290f1702c5SYu Xiangning struct sonode *
1300f1702c5SYu Xiangning socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
1310f1702c5SYu Xiangning     sock_downcalls_t *dc, int flags, int *errorp)
1320f1702c5SYu Xiangning {
1330f1702c5SYu Xiangning 	struct sonode *so;
1340f1702c5SYu Xiangning 	struct sockparams *sp;
1350f1702c5SYu Xiangning 	struct cred *cr;
1360f1702c5SYu Xiangning 
1370f1702c5SYu Xiangning 	if ((cr = CRED()) == NULL)
1380f1702c5SYu Xiangning 		cr = kcred;
1390f1702c5SYu Xiangning 
1400f1702c5SYu Xiangning 	sp = parent->so_sockparams;
1410f1702c5SYu Xiangning 	ASSERT(sp != NULL);
1420f1702c5SYu Xiangning 
1437d64f41bSAnders Persson 	sp->sp_stats.sps_ncreate.value.ui64++;
1440f1702c5SYu Xiangning 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
1450f1702c5SYu Xiangning 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
1460f1702c5SYu Xiangning 	    errorp, cr);
1470f1702c5SYu Xiangning 	if (so != NULL) {
1480f1702c5SYu Xiangning 		SOCKPARAMS_INC_REF(sp);
1490f1702c5SYu Xiangning 
1500f1702c5SYu Xiangning 		so->so_proto_handle = lh;
1510f1702c5SYu Xiangning 		so->so_downcalls = dc;
1520f1702c5SYu Xiangning 		/*
1530f1702c5SYu Xiangning 		 * This function may be called in interrupt context, and CRED()
1540f1702c5SYu Xiangning 		 * will be NULL. In this case, pass in kcred.
1550f1702c5SYu Xiangning 		 */
1560f1702c5SYu Xiangning 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
1570f1702c5SYu Xiangning 			/* Cannot fail, only bumps so_count */
1580f1702c5SYu Xiangning 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
1590f1702c5SYu Xiangning 		} else  {
1600f1702c5SYu Xiangning 			socket_destroy(so);
1610f1702c5SYu Xiangning 			so = NULL;
1620f1702c5SYu Xiangning 		}
1630f1702c5SYu Xiangning 	}
1640f1702c5SYu Xiangning 
1650f1702c5SYu Xiangning 	return (so);
1660f1702c5SYu Xiangning }
1670f1702c5SYu Xiangning 
1680f1702c5SYu Xiangning /*
1690f1702c5SYu Xiangning  * Bind local endpoint.
1700f1702c5SYu Xiangning  */
1710f1702c5SYu Xiangning int
1720f1702c5SYu Xiangning socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1730f1702c5SYu Xiangning     int flags, cred_t *cr)
1740f1702c5SYu Xiangning {
1750f1702c5SYu Xiangning 	return (SOP_BIND(so, name, namelen, flags, cr));
1760f1702c5SYu Xiangning }
1770f1702c5SYu Xiangning 
1780f1702c5SYu Xiangning /*
1790f1702c5SYu Xiangning  * Turn socket into a listen socket.
1800f1702c5SYu Xiangning  */
1810f1702c5SYu Xiangning int
1820f1702c5SYu Xiangning socket_listen(struct sonode *so, int backlog, cred_t *cr)
1830f1702c5SYu Xiangning {
1840f1702c5SYu Xiangning 	if (backlog < 0) {
1850f1702c5SYu Xiangning 		backlog = 0;
1860f1702c5SYu Xiangning 	}
1870f1702c5SYu Xiangning 
1880f1702c5SYu Xiangning 	/*
1890f1702c5SYu Xiangning 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1900f1702c5SYu Xiangning 	 * before queuing the next connection implying that a
1910f1702c5SYu Xiangning 	 * listen(sock, 0) allows one connection to be queued.
1920f1702c5SYu Xiangning 	 * BSD also uses 1.5 times the requested backlog.
1930f1702c5SYu Xiangning 	 *
1940f1702c5SYu Xiangning 	 * XNS Issue 4 required a strict interpretation of the backlog.
1950f1702c5SYu Xiangning 	 * This has been waived subsequently for Issue 4 and the change
1960f1702c5SYu Xiangning 	 * incorporated in XNS Issue 5. So we aren't required to do
1970f1702c5SYu Xiangning 	 * anything special for XPG apps.
1980f1702c5SYu Xiangning 	 */
1990f1702c5SYu Xiangning 	if (backlog >= (INT_MAX - 1) / 3)
2000f1702c5SYu Xiangning 		backlog = INT_MAX;
2010f1702c5SYu Xiangning 	else
2020f1702c5SYu Xiangning 		backlog = backlog * 3 / 2 + 1;
2030f1702c5SYu Xiangning 
2040f1702c5SYu Xiangning 	return (SOP_LISTEN(so, backlog, cr));
2050f1702c5SYu Xiangning }
2060f1702c5SYu Xiangning 
2070f1702c5SYu Xiangning /*
2080f1702c5SYu Xiangning  * Accept incoming connection.
2090f1702c5SYu Xiangning  */
2100f1702c5SYu Xiangning int
2110f1702c5SYu Xiangning socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
2120f1702c5SYu Xiangning {
2130f1702c5SYu Xiangning 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
2140f1702c5SYu Xiangning }
2150f1702c5SYu Xiangning 
2160f1702c5SYu Xiangning /*
2170f1702c5SYu Xiangning  * Active open.
2180f1702c5SYu Xiangning  */
2190f1702c5SYu Xiangning int
2203e95bd4aSAnders Persson socket_connect(struct sonode *so, struct sockaddr *name,
2210f1702c5SYu Xiangning     socklen_t namelen, int fflag, int flags, cred_t *cr)
2220f1702c5SYu Xiangning {
2230f1702c5SYu Xiangning 	int error;
2240f1702c5SYu Xiangning 
2250f1702c5SYu Xiangning 	/*
2260f1702c5SYu Xiangning 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2270f1702c5SYu Xiangning 	 * connect to a null address. This is the portable method to
2280f1702c5SYu Xiangning 	 * unconnect a socket.
2290f1702c5SYu Xiangning 	 */
2300f1702c5SYu Xiangning 	if ((namelen >= sizeof (sa_family_t)) &&
2310f1702c5SYu Xiangning 	    (name->sa_family == AF_UNSPEC)) {
2320f1702c5SYu Xiangning 		name = NULL;
2330f1702c5SYu Xiangning 		namelen = 0;
2340f1702c5SYu Xiangning 	}
2350f1702c5SYu Xiangning 
2360f1702c5SYu Xiangning 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
2370f1702c5SYu Xiangning 
2380f1702c5SYu Xiangning 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
2390f1702c5SYu Xiangning 		/*
2400f1702c5SYu Xiangning 		 * X/Open specification contains a requirement that
2410f1702c5SYu Xiangning 		 * ENETUNREACH be returned but does not require
2420f1702c5SYu Xiangning 		 * EHOSTUNREACH. In order to keep the test suite
2430f1702c5SYu Xiangning 		 * happy we mess with the errno here.
2440f1702c5SYu Xiangning 		 */
2450f1702c5SYu Xiangning 		error = ENETUNREACH;
2460f1702c5SYu Xiangning 	}
2470f1702c5SYu Xiangning 
2480f1702c5SYu Xiangning 	return (error);
2490f1702c5SYu Xiangning }
2500f1702c5SYu Xiangning 
2510f1702c5SYu Xiangning /*
2520f1702c5SYu Xiangning  * Get address of remote node.
2530f1702c5SYu Xiangning  */
2540f1702c5SYu Xiangning int
2550f1702c5SYu Xiangning socket_getpeername(struct sonode *so, struct sockaddr *addr,
2560f1702c5SYu Xiangning     socklen_t *addrlen, boolean_t accept, cred_t *cr)
2570f1702c5SYu Xiangning {
2580f1702c5SYu Xiangning 	ASSERT(*addrlen > 0);
2590f1702c5SYu Xiangning 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
2600f1702c5SYu Xiangning 
2610f1702c5SYu Xiangning }
2620f1702c5SYu Xiangning 
2630f1702c5SYu Xiangning /*
2640f1702c5SYu Xiangning  * Get local address.
2650f1702c5SYu Xiangning  */
2660f1702c5SYu Xiangning int
2670f1702c5SYu Xiangning socket_getsockname(struct sonode *so, struct sockaddr *addr,
2680f1702c5SYu Xiangning     socklen_t *addrlen, cred_t *cr)
2690f1702c5SYu Xiangning {
2700f1702c5SYu Xiangning 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
2710f1702c5SYu Xiangning 
2720f1702c5SYu Xiangning }
2730f1702c5SYu Xiangning 
2740f1702c5SYu Xiangning /*
2750f1702c5SYu Xiangning  * Called from shutdown().
2760f1702c5SYu Xiangning  */
2770f1702c5SYu Xiangning int
2780f1702c5SYu Xiangning socket_shutdown(struct sonode *so, int how, cred_t *cr)
2790f1702c5SYu Xiangning {
2800f1702c5SYu Xiangning 	return (SOP_SHUTDOWN(so, how, cr));
2810f1702c5SYu Xiangning }
2820f1702c5SYu Xiangning 
2830f1702c5SYu Xiangning /*
2840f1702c5SYu Xiangning  * Get socket options.
2850f1702c5SYu Xiangning  */
2860f1702c5SYu Xiangning /*ARGSUSED*/
2870f1702c5SYu Xiangning int
2880f1702c5SYu Xiangning socket_getsockopt(struct sonode *so, int level, int option_name,
2890f1702c5SYu Xiangning     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
2900f1702c5SYu Xiangning {
2910f1702c5SYu Xiangning 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
2920f1702c5SYu Xiangning 	    optlenp, flags, cr));
2930f1702c5SYu Xiangning }
2940f1702c5SYu Xiangning 
2950f1702c5SYu Xiangning /*
2960f1702c5SYu Xiangning  * Set socket options
2970f1702c5SYu Xiangning  */
2980f1702c5SYu Xiangning int
2990f1702c5SYu Xiangning socket_setsockopt(struct sonode *so, int level, int option_name,
3000f1702c5SYu Xiangning     const void *optval, t_uscalar_t optlen, cred_t *cr)
3010f1702c5SYu Xiangning {
30222238f73Sshenjian 	int val = 1;
3030f1702c5SYu Xiangning 	/* Caller allocates aligned optval, or passes null */
3040f1702c5SYu Xiangning 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
3050f1702c5SYu Xiangning 	/* If optval is null optlen is 0, and vice-versa */
3060f1702c5SYu Xiangning 	ASSERT(optval != NULL || optlen == 0);
3070f1702c5SYu Xiangning 	ASSERT(optlen != 0 || optval == NULL);
3080f1702c5SYu Xiangning 
30922238f73Sshenjian 	if (optval == NULL && optlen == 0)
31022238f73Sshenjian 		optval = &val;
3110f1702c5SYu Xiangning 
3120f1702c5SYu Xiangning 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
3130f1702c5SYu Xiangning }
3140f1702c5SYu Xiangning 
3150f1702c5SYu Xiangning int
3160f1702c5SYu Xiangning socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3170f1702c5SYu Xiangning     cred_t *cr)
3180f1702c5SYu Xiangning {
3190f1702c5SYu Xiangning 	int error = 0;
3200f1702c5SYu Xiangning 	ssize_t orig_resid = uiop->uio_resid;
3210f1702c5SYu Xiangning 
3220f1702c5SYu Xiangning 	/*
3230f1702c5SYu Xiangning 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
3240f1702c5SYu Xiangning 	 */
3250f1702c5SYu Xiangning 	if (so->so_family == AF_UNIX)
3260f1702c5SYu Xiangning 		uiop->uio_extflg |= UIO_COPY_CACHED;
3270f1702c5SYu Xiangning 	else
3280f1702c5SYu Xiangning 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
3290f1702c5SYu Xiangning 
3300f1702c5SYu Xiangning 	error = SOP_SENDMSG(so, msg, uiop, cr);
3310f1702c5SYu Xiangning 	switch (error) {
3320f1702c5SYu Xiangning 	default:
3330f1702c5SYu Xiangning 		break;
3340f1702c5SYu Xiangning 	case EINTR:
33526406123SAnders Persson 	case ENOMEM:
33634dfe683Sshenjian 	/* EAGAIN is EWOULDBLOCK */
3370f1702c5SYu Xiangning 	case EWOULDBLOCK:
3380f1702c5SYu Xiangning 		/* We did a partial send */
3390f1702c5SYu Xiangning 		if (uiop->uio_resid != orig_resid)
3400f1702c5SYu Xiangning 			error = 0;
3410f1702c5SYu Xiangning 		break;
3420f1702c5SYu Xiangning 	case EPIPE:
343*f67b7b8dSSebastian Wiedenroth 		if (((so->so_mode & SM_KERNEL) == 0) &&
344*f67b7b8dSSebastian Wiedenroth 		    ((msg->msg_flags & MSG_NOSIGNAL) == 0)) {
3450f1702c5SYu Xiangning 			tsignal(curthread, SIGPIPE);
346*f67b7b8dSSebastian Wiedenroth 		}
3470f1702c5SYu Xiangning 		break;
3480f1702c5SYu Xiangning 	}
3490f1702c5SYu Xiangning 
3500f1702c5SYu Xiangning 	return (error);
3510f1702c5SYu Xiangning }
3520f1702c5SYu Xiangning 
3530f1702c5SYu Xiangning int
3540f1702c5SYu Xiangning socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
3550f1702c5SYu Xiangning     struct cred *cr, mblk_t **mpp)
3560f1702c5SYu Xiangning {
3570f1702c5SYu Xiangning 	int error = 0;
3580f1702c5SYu Xiangning 
3590f1702c5SYu Xiangning 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
3600f1702c5SYu Xiangning 	if (error == EPIPE) {
3610f1702c5SYu Xiangning 		tsignal(curthread, SIGPIPE);
3620f1702c5SYu Xiangning 	}
3630f1702c5SYu Xiangning 	return (error);
3640f1702c5SYu Xiangning }
3650f1702c5SYu Xiangning 
3660f1702c5SYu Xiangning int
3670f1702c5SYu Xiangning socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3680f1702c5SYu Xiangning     cred_t *cr)
3690f1702c5SYu Xiangning {
3700f1702c5SYu Xiangning 	int error;
3710f1702c5SYu Xiangning 	ssize_t orig_resid = uiop->uio_resid;
3720f1702c5SYu Xiangning 
3730f1702c5SYu Xiangning 	/*
3740f1702c5SYu Xiangning 	 * Do not bypass the cache when reading data, as the application
3750f1702c5SYu Xiangning 	 * is likely to access the data shortly.
3760f1702c5SYu Xiangning 	 */
3770f1702c5SYu Xiangning 	uiop->uio_extflg |= UIO_COPY_CACHED;
3780f1702c5SYu Xiangning 
3790f1702c5SYu Xiangning 	error = SOP_RECVMSG(so, msg, uiop, cr);
3800f1702c5SYu Xiangning 
3810f1702c5SYu Xiangning 	switch (error) {
3820f1702c5SYu Xiangning 	case EINTR:
38334dfe683Sshenjian 	/* EAGAIN is EWOULDBLOCK */
3840f1702c5SYu Xiangning 	case EWOULDBLOCK:
3850f1702c5SYu Xiangning 		/* We did a partial read */
3860f1702c5SYu Xiangning 		if (uiop->uio_resid != orig_resid)
3870f1702c5SYu Xiangning 			error = 0;
3880f1702c5SYu Xiangning 		break;
3890f1702c5SYu Xiangning 	default:
3900f1702c5SYu Xiangning 		break;
3910f1702c5SYu Xiangning 	}
3920f1702c5SYu Xiangning 	return (error);
3930f1702c5SYu Xiangning }
3940f1702c5SYu Xiangning 
/*
 * Dispatch an ioctl to the sonode's ioctl operation. All arguments are
 * passed through unchanged and the operation's result is returned.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int error;

	error = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
	return (error);
}
4010f1702c5SYu Xiangning 
/*
 * Dispatch a poll request to the sonode's poll operation. All arguments
 * are passed through unchanged and the operation's result is returned.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
4080f1702c5SYu Xiangning 
4090f1702c5SYu Xiangning int
4100f1702c5SYu Xiangning socket_close(struct sonode *so, int flag, struct cred *cr)
4110f1702c5SYu Xiangning {
4120f1702c5SYu Xiangning 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
4130f1702c5SYu Xiangning }
4140f1702c5SYu Xiangning 
4150f1702c5SYu Xiangning int
4160f1702c5SYu Xiangning socket_close_internal(struct sonode *so, int flag, cred_t *cr)
4170f1702c5SYu Xiangning {
4180f1702c5SYu Xiangning 	ASSERT(so->so_count == 0);
4190f1702c5SYu Xiangning 
4200f1702c5SYu Xiangning 	return (SOP_CLOSE(so, flag, cr));
4210f1702c5SYu Xiangning }
4220f1702c5SYu Xiangning 
/*
 * Destroy `so' via its vnode: mark the vnode invalid, then release the
 * hold on it.
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
4290f1702c5SYu Xiangning 
4300f1702c5SYu Xiangning /* ARGSUSED */
4310f1702c5SYu Xiangning void
4320f1702c5SYu Xiangning socket_destroy_internal(struct sonode *so, cred_t *cr)
4330f1702c5SYu Xiangning {
4340f1702c5SYu Xiangning 	struct sockparams *sp = so->so_sockparams;
4350f1702c5SYu Xiangning 	ASSERT(so->so_count == 0 && sp != NULL);
4360f1702c5SYu Xiangning 
4370f1702c5SYu Xiangning 	sp->sp_smod_info->smod_sock_destroy_func(so);
4380f1702c5SYu Xiangning 
4390f1702c5SYu Xiangning 	SOCKPARAMS_DEC_REF(sp);
4400f1702c5SYu Xiangning }
4410f1702c5SYu Xiangning 
4420f1702c5SYu Xiangning /*
4430f1702c5SYu Xiangning  * TODO Once the common vnode ops is available, then the vnops argument
4440f1702c5SYu Xiangning  * should be removed.
4450f1702c5SYu Xiangning  */
4460f1702c5SYu Xiangning /*ARGSUSED*/
4470f1702c5SYu Xiangning int
4480f1702c5SYu Xiangning sonode_constructor(void *buf, void *cdrarg, int kmflags)
4490f1702c5SYu Xiangning {
4500f1702c5SYu Xiangning 	struct sonode *so = buf;
4510f1702c5SYu Xiangning 	struct vnode *vp;
4520f1702c5SYu Xiangning 
4530f1702c5SYu Xiangning 	vp = so->so_vnode = vn_alloc(kmflags);
4540f1702c5SYu Xiangning 	if (vp == NULL) {
4550f1702c5SYu Xiangning 		return (-1);
4560f1702c5SYu Xiangning 	}
4570f1702c5SYu Xiangning 	vp->v_data = so;
4580f1702c5SYu Xiangning 	vn_setops(vp, socket_vnodeops);
4590f1702c5SYu Xiangning 
4600f1702c5SYu Xiangning 	so->so_priv 		= NULL;
4610f1702c5SYu Xiangning 	so->so_oobmsg		= NULL;
4620f1702c5SYu Xiangning 
4630f1702c5SYu Xiangning 	so->so_proto_handle	= NULL;
4640f1702c5SYu Xiangning 
4650f1702c5SYu Xiangning 	so->so_peercred 	= NULL;
4660f1702c5SYu Xiangning 
4670f1702c5SYu Xiangning 	so->so_rcv_queued	= 0;
4680f1702c5SYu Xiangning 	so->so_rcv_q_head 	= NULL;
4690f1702c5SYu Xiangning 	so->so_rcv_q_last_head 	= NULL;
4700f1702c5SYu Xiangning 	so->so_rcv_head		= NULL;
4710f1702c5SYu Xiangning 	so->so_rcv_last_head	= NULL;
4720f1702c5SYu Xiangning 	so->so_rcv_wanted	= 0;
4730f1702c5SYu Xiangning 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
4740f1702c5SYu Xiangning 	so->so_rcv_timer_tid	= 0;
4750f1702c5SYu Xiangning 	so->so_rcv_thresh	= 0;
4760f1702c5SYu Xiangning 
4773e95bd4aSAnders Persson 	list_create(&so->so_acceptq_list, sizeof (struct sonode),
4783e95bd4aSAnders Persson 	    offsetof(struct sonode, so_acceptq_node));
4793e95bd4aSAnders Persson 	list_create(&so->so_acceptq_defer, sizeof (struct sonode),
4803e95bd4aSAnders Persson 	    offsetof(struct sonode, so_acceptq_node));
4813e95bd4aSAnders Persson 	list_link_init(&so->so_acceptq_node);
4820f1702c5SYu Xiangning 	so->so_acceptq_len	= 0;
4830f1702c5SYu Xiangning 	so->so_backlog		= 0;
4843e95bd4aSAnders Persson 	so->so_listener		= NULL;
4850f1702c5SYu Xiangning 
4860f1702c5SYu Xiangning 	so->so_snd_qfull	= B_FALSE;
4870f1702c5SYu Xiangning 
4883e95bd4aSAnders Persson 	so->so_filter_active	= 0;
4893e95bd4aSAnders Persson 	so->so_filter_tx	= 0;
4903e95bd4aSAnders Persson 	so->so_filter_defertime = 0;
4913e95bd4aSAnders Persson 	so->so_filter_top	= NULL;
4923e95bd4aSAnders Persson 	so->so_filter_bottom	= NULL;
4933e95bd4aSAnders Persson 
4940f1702c5SYu Xiangning 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
4950f1702c5SYu Xiangning 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
4960f1702c5SYu Xiangning 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
4970f1702c5SYu Xiangning 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
4986a571a2dSAnders Persson 	cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
4996a571a2dSAnders Persson 	cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
5000f1702c5SYu Xiangning 
5010f1702c5SYu Xiangning 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
5020f1702c5SYu Xiangning 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
5030f1702c5SYu Xiangning 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
5040f1702c5SYu Xiangning 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
5050f1702c5SYu Xiangning 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
5060f1702c5SYu Xiangning 
5070f1702c5SYu Xiangning 	return (0);
5080f1702c5SYu Xiangning }
5090f1702c5SYu Xiangning 
5100f1702c5SYu Xiangning /*ARGSUSED*/
5110f1702c5SYu Xiangning void
5120f1702c5SYu Xiangning sonode_destructor(void *buf, void *cdrarg)
5130f1702c5SYu Xiangning {
5140f1702c5SYu Xiangning 	struct sonode *so = buf;
5150f1702c5SYu Xiangning 	struct vnode *vp = SOTOV(so);
5160f1702c5SYu Xiangning 
5170f1702c5SYu Xiangning 	ASSERT(so->so_priv == NULL);
5180f1702c5SYu Xiangning 	ASSERT(so->so_peercred == NULL);
5190f1702c5SYu Xiangning 
5200f1702c5SYu Xiangning 	ASSERT(so->so_oobmsg == NULL);
5210f1702c5SYu Xiangning 
5220f1702c5SYu Xiangning 	ASSERT(so->so_rcv_q_head == NULL);
5230f1702c5SYu Xiangning 
5243e95bd4aSAnders Persson 	list_destroy(&so->so_acceptq_list);
5253e95bd4aSAnders Persson 	list_destroy(&so->so_acceptq_defer);
5263e95bd4aSAnders Persson 	ASSERT(!list_link_active(&so->so_acceptq_node));
5273e95bd4aSAnders Persson 	ASSERT(so->so_listener == NULL);
5283e95bd4aSAnders Persson 
5293e95bd4aSAnders Persson 	ASSERT(so->so_filter_active == 0);
5303e95bd4aSAnders Persson 	ASSERT(so->so_filter_tx == 0);
5313e95bd4aSAnders Persson 	ASSERT(so->so_filter_top == NULL);
5323e95bd4aSAnders Persson 	ASSERT(so->so_filter_bottom == NULL);
5330f1702c5SYu Xiangning 
5340f1702c5SYu Xiangning 	ASSERT(vp->v_data == so);
5350f1702c5SYu Xiangning 	ASSERT(vn_matchops(vp, socket_vnodeops));
5360f1702c5SYu Xiangning 
5370f1702c5SYu Xiangning 	vn_free(vp);
5380f1702c5SYu Xiangning 
5390f1702c5SYu Xiangning 	mutex_destroy(&so->so_lock);
5400f1702c5SYu Xiangning 	mutex_destroy(&so->so_acceptq_lock);
5410f1702c5SYu Xiangning 	rw_destroy(&so->so_fallback_rwlock);
5420f1702c5SYu Xiangning 
5430f1702c5SYu Xiangning 	cv_destroy(&so->so_state_cv);
5446a571a2dSAnders Persson 	cv_destroy(&so->so_single_cv);
5456a571a2dSAnders Persson 	cv_destroy(&so->so_read_cv);
5460f1702c5SYu Xiangning 	cv_destroy(&so->so_acceptq_cv);
5470f1702c5SYu Xiangning 	cv_destroy(&so->so_snd_cv);
5480f1702c5SYu Xiangning 	cv_destroy(&so->so_rcv_cv);
5490f1702c5SYu Xiangning 	cv_destroy(&so->so_closing_cv);
5500f1702c5SYu Xiangning }
5510f1702c5SYu Xiangning 
5520f1702c5SYu Xiangning void
5530f1702c5SYu Xiangning sonode_init(struct sonode *so, struct sockparams *sp, int family,
5540f1702c5SYu Xiangning     int type, int protocol, sonodeops_t *sops)
5550f1702c5SYu Xiangning {
5560f1702c5SYu Xiangning 	vnode_t *vp;
5570f1702c5SYu Xiangning 
5580f1702c5SYu Xiangning 	vp = SOTOV(so);
5590f1702c5SYu Xiangning 
5600f1702c5SYu Xiangning 	so->so_flag	= 0;
5610f1702c5SYu Xiangning 
5620f1702c5SYu Xiangning 	so->so_state	= 0;
5630f1702c5SYu Xiangning 	so->so_mode	= 0;
5640f1702c5SYu Xiangning 
5650f1702c5SYu Xiangning 	so->so_count	= 0;
5660f1702c5SYu Xiangning 
5670f1702c5SYu Xiangning 	so->so_family	= family;
5680f1702c5SYu Xiangning 	so->so_type	= type;
5690f1702c5SYu Xiangning 	so->so_protocol	= protocol;
5700f1702c5SYu Xiangning 
5710f1702c5SYu Xiangning 	SOCK_CONNID_INIT(so->so_proto_connid);
5720f1702c5SYu Xiangning 
5730f1702c5SYu Xiangning 	so->so_options	= 0;
5740f1702c5SYu Xiangning 	so->so_linger.l_onoff   = 0;
5750f1702c5SYu Xiangning 	so->so_linger.l_linger = 0;
5760f1702c5SYu Xiangning 	so->so_sndbuf	= 0;
5770f1702c5SYu Xiangning 	so->so_error	= 0;
5780f1702c5SYu Xiangning 	so->so_rcvtimeo	= 0;
5790f1702c5SYu Xiangning 	so->so_sndtimeo = 0;
580a5adac4dSYu Xiangning 	so->so_xpg_rcvbuf = 0;
5810f1702c5SYu Xiangning 
5820f1702c5SYu Xiangning 	ASSERT(so->so_oobmsg == NULL);
5830f1702c5SYu Xiangning 	so->so_oobmark	= 0;
5840f1702c5SYu Xiangning 	so->so_pgrp	= 0;
5850f1702c5SYu Xiangning 
5860f1702c5SYu Xiangning 	ASSERT(so->so_peercred == NULL);
5870f1702c5SYu Xiangning 
5880f1702c5SYu Xiangning 	so->so_zoneid = getzoneid();
5890f1702c5SYu Xiangning 
5900f1702c5SYu Xiangning 	so->so_sockparams = sp;
5910f1702c5SYu Xiangning 
5920f1702c5SYu Xiangning 	so->so_ops = sops;
5930f1702c5SYu Xiangning 
594d36be52eSRao Shoaib 	so->so_not_str = (sops != &sotpi_sonodeops);
595d36be52eSRao Shoaib 
5960f1702c5SYu Xiangning 	so->so_proto_handle = NULL;
5970f1702c5SYu Xiangning 
5980f1702c5SYu Xiangning 	so->so_downcalls = NULL;
5990f1702c5SYu Xiangning 
6000f1702c5SYu Xiangning 	so->so_copyflag = 0;
6010f1702c5SYu Xiangning 
6020f1702c5SYu Xiangning 	vn_reinit(vp);
6030f1702c5SYu Xiangning 	vp->v_vfsp	= rootvfs;
6040f1702c5SYu Xiangning 	vp->v_type	= VSOCK;
6050f1702c5SYu Xiangning 	vp->v_rdev	= sockdev;
6060f1702c5SYu Xiangning 
6070f1702c5SYu Xiangning 	so->so_snd_qfull = B_FALSE;
6080f1702c5SYu Xiangning 	so->so_minpsz = 0;
6090f1702c5SYu Xiangning 
6100f1702c5SYu Xiangning 	so->so_rcv_wakeup = B_FALSE;
6110f1702c5SYu Xiangning 	so->so_snd_wakeup = B_FALSE;
6120f1702c5SYu Xiangning 	so->so_flowctrld = B_FALSE;
6130f1702c5SYu Xiangning 
6140f1702c5SYu Xiangning 	so->so_pollev = 0;
6150f1702c5SYu Xiangning 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
6160f1702c5SYu Xiangning 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
6170f1702c5SYu Xiangning 
6180f1702c5SYu Xiangning 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
6190f1702c5SYu Xiangning 	so->so_ksock_cb_arg = NULL;
6200f1702c5SYu Xiangning 
6210f1702c5SYu Xiangning 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
6220f1702c5SYu Xiangning 
6230f1702c5SYu Xiangning 	so->so_direct = NULL;
6240f1702c5SYu Xiangning 
6250f1702c5SYu Xiangning 	vn_exists(vp);
6260f1702c5SYu Xiangning }
6270f1702c5SYu Xiangning 
6280f1702c5SYu Xiangning void
6290f1702c5SYu Xiangning sonode_fini(struct sonode *so)
6300f1702c5SYu Xiangning {
6310f1702c5SYu Xiangning 	vnode_t *vp;
6320f1702c5SYu Xiangning 
6330f1702c5SYu Xiangning 	ASSERT(so->so_count == 0);
6340f1702c5SYu Xiangning 
6350f1702c5SYu Xiangning 	if (so->so_rcv_timer_tid) {
6360f1702c5SYu Xiangning 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
6370f1702c5SYu Xiangning 		(void) untimeout(so->so_rcv_timer_tid);
6380f1702c5SYu Xiangning 		so->so_rcv_timer_tid = 0;
6390f1702c5SYu Xiangning 	}
6400f1702c5SYu Xiangning 
6410f1702c5SYu Xiangning 	if (so->so_poll_list.ph_list != NULL) {
6420f1702c5SYu Xiangning 		pollwakeup(&so->so_poll_list, POLLERR);
6430f1702c5SYu Xiangning 		pollhead_clean(&so->so_poll_list);
6440f1702c5SYu Xiangning 	}
6450f1702c5SYu Xiangning 
646bbc000e5SAnders Persson 	if (so->so_direct != NULL)
647bbc000e5SAnders Persson 		sod_sock_fini(so);
6480f1702c5SYu Xiangning 
6490f1702c5SYu Xiangning 	vp = SOTOV(so);
6500f1702c5SYu Xiangning 	vn_invalid(vp);
6510f1702c5SYu Xiangning 
6520f1702c5SYu Xiangning 	if (so->so_peercred != NULL) {
6530f1702c5SYu Xiangning 		crfree(so->so_peercred);
6540f1702c5SYu Xiangning 		so->so_peercred = NULL;
6550f1702c5SYu Xiangning 	}
6563e95bd4aSAnders Persson 	/* Detach and destroy filters */
6573e95bd4aSAnders Persson 	if (so->so_filter_top != NULL)
6583e95bd4aSAnders Persson 		sof_sonode_cleanup(so);
6593e95bd4aSAnders Persson 
6603e95bd4aSAnders Persson 	ASSERT(list_is_empty(&so->so_acceptq_list));
6613e95bd4aSAnders Persson 	ASSERT(list_is_empty(&so->so_acceptq_defer));
6623e95bd4aSAnders Persson 	ASSERT(!list_link_active(&so->so_acceptq_node));
6633e95bd4aSAnders Persson 
6643e95bd4aSAnders Persson 	ASSERT(so->so_rcv_queued == 0);
6653e95bd4aSAnders Persson 	ASSERT(so->so_rcv_q_head == NULL);
6663e95bd4aSAnders Persson 	ASSERT(so->so_rcv_q_last_head == NULL);
6673e95bd4aSAnders Persson 	ASSERT(so->so_rcv_head == NULL);
6683e95bd4aSAnders Persson 	ASSERT(so->so_rcv_last_head == NULL);
6690f1702c5SYu Xiangning }
670