10f1702c5SYu Xiangning /* 20f1702c5SYu Xiangning * CDDL HEADER START 30f1702c5SYu Xiangning * 40f1702c5SYu Xiangning * The contents of this file are subject to the terms of the 50f1702c5SYu Xiangning * Common Development and Distribution License (the "License"). 60f1702c5SYu Xiangning * You may not use this file except in compliance with the License. 70f1702c5SYu Xiangning * 80f1702c5SYu Xiangning * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90f1702c5SYu Xiangning * or http://www.opensolaris.org/os/licensing. 100f1702c5SYu Xiangning * See the License for the specific language governing permissions 110f1702c5SYu Xiangning * and limitations under the License. 120f1702c5SYu Xiangning * 130f1702c5SYu Xiangning * When distributing Covered Code, include this CDDL HEADER in each 140f1702c5SYu Xiangning * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150f1702c5SYu Xiangning * If applicable, add the following below this CDDL HEADER, with the 160f1702c5SYu Xiangning * fields enclosed by brackets "[]" replaced with your own identifying 170f1702c5SYu Xiangning * information: Portions Copyright [yyyy] [name of copyright owner] 180f1702c5SYu Xiangning * 190f1702c5SYu Xiangning * CDDL HEADER END 200f1702c5SYu Xiangning */ 210f1702c5SYu Xiangning 220f1702c5SYu Xiangning /* 23*22238f73Sshenjian * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 240f1702c5SYu Xiangning * Use is subject to license terms. 250f1702c5SYu Xiangning */ 260f1702c5SYu Xiangning 270f1702c5SYu Xiangning #include <sys/types.h> 280f1702c5SYu Xiangning #include <sys/param.h> 290f1702c5SYu Xiangning #include <sys/systm.h> 300f1702c5SYu Xiangning #include <sys/sysmacros.h> 310f1702c5SYu Xiangning #include <sys/debug.h> 320f1702c5SYu Xiangning #include <sys/cmn_err.h> 330f1702c5SYu Xiangning #include <sys/vfs.h> 340f1702c5SYu Xiangning #include <sys/policy.h> 350f1702c5SYu Xiangning #include <sys/modctl.h> 360f1702c5SYu Xiangning 370f1702c5SYu Xiangning #include <sys/sunddi.h> 380f1702c5SYu Xiangning 390f1702c5SYu Xiangning #include <sys/strsun.h> 400f1702c5SYu Xiangning #include <sys/stropts.h> 410f1702c5SYu Xiangning #include <sys/strsubr.h> 420f1702c5SYu Xiangning #include <sys/socket.h> 430f1702c5SYu Xiangning #include <sys/socketvar.h> 440f1702c5SYu Xiangning #include <sys/sodirect.h> 450f1702c5SYu Xiangning #include <sys/uio.h> 460f1702c5SYu Xiangning 470f1702c5SYu Xiangning #include <inet/ipclassifier.h> 480f1702c5SYu Xiangning #include <fs/sockfs/sockcommon.h> 490f1702c5SYu Xiangning #include <fs/sockfs/nl7c.h> 50d36be52eSRao Shoaib #include <fs/sockfs/socktpi.h> 510f1702c5SYu Xiangning #include <inet/ip.h> 520f1702c5SYu Xiangning 530f1702c5SYu Xiangning extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; 540f1702c5SYu Xiangning 550f1702c5SYu Xiangning static struct kmem_cache *sock_sod_cache; 560f1702c5SYu Xiangning 570f1702c5SYu Xiangning /* 580f1702c5SYu Xiangning * Common socket access functions. 590f1702c5SYu Xiangning * 600f1702c5SYu Xiangning * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), 610f1702c5SYu Xiangning * the socket_xxx() function should be used. 620f1702c5SYu Xiangning */ 630f1702c5SYu Xiangning 640f1702c5SYu Xiangning /* 650f1702c5SYu Xiangning * Try to create a new sonode of the requested <family, type, protocol>. 660f1702c5SYu Xiangning */ 670f1702c5SYu Xiangning /* ARGSUSED */ 680f1702c5SYu Xiangning struct sonode * 690f1702c5SYu Xiangning socket_create(int family, int type, int protocol, char *devpath, char *mod, 700f1702c5SYu Xiangning int flags, int version, struct cred *cr, int *errorp) 710f1702c5SYu Xiangning { 720f1702c5SYu Xiangning struct sonode *so; 730f1702c5SYu Xiangning struct sockparams *sp = NULL; 74*22238f73Sshenjian int saved_error; 750f1702c5SYu Xiangning 760f1702c5SYu Xiangning /* 770f1702c5SYu Xiangning * Look for a sockparams entry that match the given criteria. 780f1702c5SYu Xiangning * solookup() returns with the entry held. 790f1702c5SYu Xiangning */ 800f1702c5SYu Xiangning *errorp = solookup(family, type, protocol, &sp); 81*22238f73Sshenjian saved_error = *errorp; 820f1702c5SYu Xiangning if (sp == NULL) { 830f1702c5SYu Xiangning int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP; 840f1702c5SYu Xiangning /* 850f1702c5SYu Xiangning * There is no matching sockparams entry. An ephemeral entry is 860f1702c5SYu Xiangning * created if the caller specifies a device or a socket module. 870f1702c5SYu Xiangning */ 880f1702c5SYu Xiangning if (devpath != NULL) { 89*22238f73Sshenjian saved_error = 0; 900f1702c5SYu Xiangning sp = sockparams_hold_ephemeral_bydev(family, type, 910f1702c5SYu Xiangning protocol, devpath, kmflags, errorp); 920f1702c5SYu Xiangning } else if (mod != NULL) { 93*22238f73Sshenjian saved_error = 0; 940f1702c5SYu Xiangning sp = sockparams_hold_ephemeral_bymod(family, type, 950f1702c5SYu Xiangning protocol, mod, kmflags, errorp); 960f1702c5SYu Xiangning } else { 97*22238f73Sshenjian *errorp = solookup(family, type, 0, &sp); 980f1702c5SYu Xiangning } 990f1702c5SYu Xiangning 100*22238f73Sshenjian if (sp == NULL) { 101*22238f73Sshenjian if (saved_error && (*errorp == EPROTONOSUPPORT || 102*22238f73Sshenjian *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT)) 103*22238f73Sshenjian *errorp = saved_error; 1040f1702c5SYu Xiangning return (NULL); 105*22238f73Sshenjian } 1060f1702c5SYu Xiangning } 1070f1702c5SYu Xiangning 1080f1702c5SYu Xiangning ASSERT(sp->sp_smod_info != NULL); 1090f1702c5SYu Xiangning ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP); 1100f1702c5SYu Xiangning so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, 1110f1702c5SYu Xiangning protocol, version, flags, errorp, cr); 1120f1702c5SYu Xiangning if (so == NULL) { 1130f1702c5SYu Xiangning SOCKPARAMS_DEC_REF(sp); 1140f1702c5SYu Xiangning } else { 1150f1702c5SYu Xiangning if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) { 1160f1702c5SYu Xiangning /* Cannot fail, only bumps so_count */ 1170f1702c5SYu Xiangning (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 1180f1702c5SYu Xiangning } else { 119*22238f73Sshenjian if (saved_error && (*errorp == EPROTONOSUPPORT || 120*22238f73Sshenjian *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT)) 121*22238f73Sshenjian *errorp = saved_error; 1220f1702c5SYu Xiangning socket_destroy(so); 1230f1702c5SYu Xiangning so = NULL; 1240f1702c5SYu Xiangning } 1250f1702c5SYu Xiangning } 1260f1702c5SYu Xiangning return (so); 1270f1702c5SYu Xiangning } 1280f1702c5SYu Xiangning 1290f1702c5SYu Xiangning struct sonode * 1300f1702c5SYu Xiangning socket_newconn(struct sonode *parent, sock_lower_handle_t lh, 1310f1702c5SYu Xiangning sock_downcalls_t *dc, int flags, int *errorp) 1320f1702c5SYu Xiangning { 1330f1702c5SYu Xiangning struct sonode *so; 1340f1702c5SYu Xiangning struct sockparams *sp; 1350f1702c5SYu Xiangning struct cred *cr; 1360f1702c5SYu Xiangning 1370f1702c5SYu Xiangning if ((cr = CRED()) == NULL) 1380f1702c5SYu Xiangning cr = kcred; 1390f1702c5SYu Xiangning 1400f1702c5SYu Xiangning sp = parent->so_sockparams; 1410f1702c5SYu Xiangning ASSERT(sp != NULL); 1420f1702c5SYu Xiangning 1430f1702c5SYu Xiangning so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family, 1440f1702c5SYu Xiangning parent->so_type, parent->so_protocol, parent->so_version, flags, 1450f1702c5SYu Xiangning errorp, cr); 1460f1702c5SYu Xiangning if (so != NULL) { 1470f1702c5SYu Xiangning SOCKPARAMS_INC_REF(sp); 1480f1702c5SYu Xiangning 1490f1702c5SYu Xiangning so->so_proto_handle = lh; 1500f1702c5SYu Xiangning so->so_downcalls = dc; 1510f1702c5SYu Xiangning /* 1520f1702c5SYu Xiangning * This function may be called in interrupt context, and CRED() 1530f1702c5SYu Xiangning * will be NULL. In this case, pass in kcred. 1540f1702c5SYu Xiangning */ 1550f1702c5SYu Xiangning if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) { 1560f1702c5SYu Xiangning /* Cannot fail, only bumps so_count */ 1570f1702c5SYu Xiangning (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 1580f1702c5SYu Xiangning } else { 1590f1702c5SYu Xiangning socket_destroy(so); 1600f1702c5SYu Xiangning so = NULL; 1610f1702c5SYu Xiangning } 1620f1702c5SYu Xiangning } 1630f1702c5SYu Xiangning 1640f1702c5SYu Xiangning return (so); 1650f1702c5SYu Xiangning } 1660f1702c5SYu Xiangning 1670f1702c5SYu Xiangning /* 1680f1702c5SYu Xiangning * Bind local endpoint. 1690f1702c5SYu Xiangning */ 1700f1702c5SYu Xiangning int 1710f1702c5SYu Xiangning socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 1720f1702c5SYu Xiangning int flags, cred_t *cr) 1730f1702c5SYu Xiangning { 1740f1702c5SYu Xiangning return (SOP_BIND(so, name, namelen, flags, cr)); 1750f1702c5SYu Xiangning } 1760f1702c5SYu Xiangning 1770f1702c5SYu Xiangning /* 1780f1702c5SYu Xiangning * Turn socket into a listen socket. 1790f1702c5SYu Xiangning */ 1800f1702c5SYu Xiangning int 1810f1702c5SYu Xiangning socket_listen(struct sonode *so, int backlog, cred_t *cr) 1820f1702c5SYu Xiangning { 1830f1702c5SYu Xiangning if (backlog < 0) { 1840f1702c5SYu Xiangning backlog = 0; 1850f1702c5SYu Xiangning } 1860f1702c5SYu Xiangning 1870f1702c5SYu Xiangning /* 1880f1702c5SYu Xiangning * Use the same qlimit as in BSD. BSD checks the qlimit 1890f1702c5SYu Xiangning * before queuing the next connection implying that a 1900f1702c5SYu Xiangning * listen(sock, 0) allows one connection to be queued. 1910f1702c5SYu Xiangning * BSD also uses 1.5 times the requested backlog. 1920f1702c5SYu Xiangning * 1930f1702c5SYu Xiangning * XNS Issue 4 required a strict interpretation of the backlog. 1940f1702c5SYu Xiangning * This has been waived subsequently for Issue 4 and the change 1950f1702c5SYu Xiangning * incorporated in XNS Issue 5. So we aren't required to do 1960f1702c5SYu Xiangning * anything special for XPG apps. 1970f1702c5SYu Xiangning */ 1980f1702c5SYu Xiangning if (backlog >= (INT_MAX - 1) / 3) 1990f1702c5SYu Xiangning backlog = INT_MAX; 2000f1702c5SYu Xiangning else 2010f1702c5SYu Xiangning backlog = backlog * 3 / 2 + 1; 2020f1702c5SYu Xiangning 2030f1702c5SYu Xiangning return (SOP_LISTEN(so, backlog, cr)); 2040f1702c5SYu Xiangning } 2050f1702c5SYu Xiangning 2060f1702c5SYu Xiangning /* 2070f1702c5SYu Xiangning * Accept incoming connection. 2080f1702c5SYu Xiangning */ 2090f1702c5SYu Xiangning int 2100f1702c5SYu Xiangning socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop) 2110f1702c5SYu Xiangning { 2120f1702c5SYu Xiangning return (SOP_ACCEPT(lso, fflag, cr, nsop)); 2130f1702c5SYu Xiangning } 2140f1702c5SYu Xiangning 2150f1702c5SYu Xiangning /* 2160f1702c5SYu Xiangning * Active open. 2170f1702c5SYu Xiangning */ 2180f1702c5SYu Xiangning int 2190f1702c5SYu Xiangning socket_connect(struct sonode *so, const struct sockaddr *name, 2200f1702c5SYu Xiangning socklen_t namelen, int fflag, int flags, cred_t *cr) 2210f1702c5SYu Xiangning { 2220f1702c5SYu Xiangning int error; 2230f1702c5SYu Xiangning 2240f1702c5SYu Xiangning /* 2250f1702c5SYu Xiangning * Handle a connect to a name parameter of type AF_UNSPEC like a 2260f1702c5SYu Xiangning * connect to a null address. This is the portable method to 2270f1702c5SYu Xiangning * unconnect a socket. 2280f1702c5SYu Xiangning */ 2290f1702c5SYu Xiangning if ((namelen >= sizeof (sa_family_t)) && 2300f1702c5SYu Xiangning (name->sa_family == AF_UNSPEC)) { 2310f1702c5SYu Xiangning name = NULL; 2320f1702c5SYu Xiangning namelen = 0; 2330f1702c5SYu Xiangning } 2340f1702c5SYu Xiangning 2350f1702c5SYu Xiangning error = SOP_CONNECT(so, name, namelen, fflag, flags, cr); 2360f1702c5SYu Xiangning 2370f1702c5SYu Xiangning if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) { 2380f1702c5SYu Xiangning /* 2390f1702c5SYu Xiangning * X/Open specification contains a requirement that 2400f1702c5SYu Xiangning * ENETUNREACH be returned but does not require 2410f1702c5SYu Xiangning * EHOSTUNREACH. In order to keep the test suite 2420f1702c5SYu Xiangning * happy we mess with the errno here. 2430f1702c5SYu Xiangning */ 2440f1702c5SYu Xiangning error = ENETUNREACH; 2450f1702c5SYu Xiangning } 2460f1702c5SYu Xiangning 2470f1702c5SYu Xiangning return (error); 2480f1702c5SYu Xiangning } 2490f1702c5SYu Xiangning 2500f1702c5SYu Xiangning /* 2510f1702c5SYu Xiangning * Get address of remote node. 2520f1702c5SYu Xiangning */ 2530f1702c5SYu Xiangning int 2540f1702c5SYu Xiangning socket_getpeername(struct sonode *so, struct sockaddr *addr, 2550f1702c5SYu Xiangning socklen_t *addrlen, boolean_t accept, cred_t *cr) 2560f1702c5SYu Xiangning { 2570f1702c5SYu Xiangning ASSERT(*addrlen > 0); 2580f1702c5SYu Xiangning return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr)); 2590f1702c5SYu Xiangning 2600f1702c5SYu Xiangning } 2610f1702c5SYu Xiangning 2620f1702c5SYu Xiangning /* 2630f1702c5SYu Xiangning * Get local address. 2640f1702c5SYu Xiangning */ 2650f1702c5SYu Xiangning int 2660f1702c5SYu Xiangning socket_getsockname(struct sonode *so, struct sockaddr *addr, 2670f1702c5SYu Xiangning socklen_t *addrlen, cred_t *cr) 2680f1702c5SYu Xiangning { 2690f1702c5SYu Xiangning return (SOP_GETSOCKNAME(so, addr, addrlen, cr)); 2700f1702c5SYu Xiangning 2710f1702c5SYu Xiangning } 2720f1702c5SYu Xiangning 2730f1702c5SYu Xiangning /* 2740f1702c5SYu Xiangning * Called from shutdown(). 2750f1702c5SYu Xiangning */ 2760f1702c5SYu Xiangning int 2770f1702c5SYu Xiangning socket_shutdown(struct sonode *so, int how, cred_t *cr) 2780f1702c5SYu Xiangning { 2790f1702c5SYu Xiangning return (SOP_SHUTDOWN(so, how, cr)); 2800f1702c5SYu Xiangning } 2810f1702c5SYu Xiangning 2820f1702c5SYu Xiangning /* 2830f1702c5SYu Xiangning * Get socket options. 2840f1702c5SYu Xiangning */ 2850f1702c5SYu Xiangning /*ARGSUSED*/ 2860f1702c5SYu Xiangning int 2870f1702c5SYu Xiangning socket_getsockopt(struct sonode *so, int level, int option_name, 2880f1702c5SYu Xiangning void *optval, socklen_t *optlenp, int flags, cred_t *cr) 2890f1702c5SYu Xiangning { 2900f1702c5SYu Xiangning return (SOP_GETSOCKOPT(so, level, option_name, optval, 2910f1702c5SYu Xiangning optlenp, flags, cr)); 2920f1702c5SYu Xiangning } 2930f1702c5SYu Xiangning 2940f1702c5SYu Xiangning /* 2950f1702c5SYu Xiangning * Set socket options 2960f1702c5SYu Xiangning */ 2970f1702c5SYu Xiangning int 2980f1702c5SYu Xiangning socket_setsockopt(struct sonode *so, int level, int option_name, 2990f1702c5SYu Xiangning const void *optval, t_uscalar_t optlen, cred_t *cr) 3000f1702c5SYu Xiangning { 301*22238f73Sshenjian int val = 1; 3020f1702c5SYu Xiangning /* Caller allocates aligned optval, or passes null */ 3030f1702c5SYu Xiangning ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); 3040f1702c5SYu Xiangning /* If optval is null optlen is 0, and vice-versa */ 3050f1702c5SYu Xiangning ASSERT(optval != NULL || optlen == 0); 3060f1702c5SYu Xiangning ASSERT(optlen != 0 || optval == NULL); 3070f1702c5SYu Xiangning 308*22238f73Sshenjian if (optval == NULL && optlen == 0) 309*22238f73Sshenjian optval = &val; 3100f1702c5SYu Xiangning 3110f1702c5SYu Xiangning return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); 3120f1702c5SYu Xiangning } 3130f1702c5SYu Xiangning 3140f1702c5SYu Xiangning int 3150f1702c5SYu Xiangning socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 3160f1702c5SYu Xiangning cred_t *cr) 3170f1702c5SYu Xiangning { 3180f1702c5SYu Xiangning int error = 0; 3190f1702c5SYu Xiangning ssize_t orig_resid = uiop->uio_resid; 3200f1702c5SYu Xiangning 3210f1702c5SYu Xiangning /* 3220f1702c5SYu Xiangning * Do not bypass the cache if we are doing a local (AF_UNIX) write. 3230f1702c5SYu Xiangning */ 3240f1702c5SYu Xiangning if (so->so_family == AF_UNIX) 3250f1702c5SYu Xiangning uiop->uio_extflg |= UIO_COPY_CACHED; 3260f1702c5SYu Xiangning else 3270f1702c5SYu Xiangning uiop->uio_extflg &= ~UIO_COPY_CACHED; 3280f1702c5SYu Xiangning 3290f1702c5SYu Xiangning error = SOP_SENDMSG(so, msg, uiop, cr); 3300f1702c5SYu Xiangning switch (error) { 3310f1702c5SYu Xiangning default: 3320f1702c5SYu Xiangning break; 3330f1702c5SYu Xiangning case EINTR: 3340f1702c5SYu Xiangning case ETIME: 3350f1702c5SYu Xiangning case EWOULDBLOCK: 3360f1702c5SYu Xiangning /* We did a partial send */ 3370f1702c5SYu Xiangning if (uiop->uio_resid != orig_resid) 3380f1702c5SYu Xiangning error = 0; 3390f1702c5SYu Xiangning break; 3400f1702c5SYu Xiangning case EPIPE: 3410f1702c5SYu Xiangning if ((so->so_mode & SM_KERNEL) == 0) 3420f1702c5SYu Xiangning tsignal(curthread, SIGPIPE); 3430f1702c5SYu Xiangning break; 3440f1702c5SYu Xiangning } 3450f1702c5SYu Xiangning 3460f1702c5SYu Xiangning return (error); 3470f1702c5SYu Xiangning } 3480f1702c5SYu Xiangning 3490f1702c5SYu Xiangning int 3500f1702c5SYu Xiangning socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, 3510f1702c5SYu Xiangning struct cred *cr, mblk_t **mpp) 3520f1702c5SYu Xiangning { 3530f1702c5SYu Xiangning int error = 0; 3540f1702c5SYu Xiangning 3550f1702c5SYu Xiangning error = SOP_SENDMBLK(so, msg, fflag, cr, mpp); 3560f1702c5SYu Xiangning if (error == EPIPE) { 3570f1702c5SYu Xiangning tsignal(curthread, SIGPIPE); 3580f1702c5SYu Xiangning } 3590f1702c5SYu Xiangning return (error); 3600f1702c5SYu Xiangning } 3610f1702c5SYu Xiangning 3620f1702c5SYu Xiangning int 3630f1702c5SYu Xiangning socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 3640f1702c5SYu Xiangning cred_t *cr) 3650f1702c5SYu Xiangning { 3660f1702c5SYu Xiangning int error; 3670f1702c5SYu Xiangning ssize_t orig_resid = uiop->uio_resid; 3680f1702c5SYu Xiangning 3690f1702c5SYu Xiangning /* 3700f1702c5SYu Xiangning * Do not bypass the cache when reading data, as the application 3710f1702c5SYu Xiangning * is likely to access the data shortly. 3720f1702c5SYu Xiangning */ 3730f1702c5SYu Xiangning uiop->uio_extflg |= UIO_COPY_CACHED; 3740f1702c5SYu Xiangning 3750f1702c5SYu Xiangning error = SOP_RECVMSG(so, msg, uiop, cr); 3760f1702c5SYu Xiangning 3770f1702c5SYu Xiangning switch (error) { 3780f1702c5SYu Xiangning case EINTR: 3790f1702c5SYu Xiangning case ETIME: 3800f1702c5SYu Xiangning case EWOULDBLOCK: 3810f1702c5SYu Xiangning /* We did a partial read */ 3820f1702c5SYu Xiangning if (uiop->uio_resid != orig_resid) 3830f1702c5SYu Xiangning error = 0; 3840f1702c5SYu Xiangning break; 3850f1702c5SYu Xiangning default: 3860f1702c5SYu Xiangning break; 3870f1702c5SYu Xiangning } 3880f1702c5SYu Xiangning return (error); 3890f1702c5SYu Xiangning } 3900f1702c5SYu Xiangning 3910f1702c5SYu Xiangning int 3920f1702c5SYu Xiangning socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, 3930f1702c5SYu Xiangning struct cred *cr, int32_t *rvalp) 3940f1702c5SYu Xiangning { 3950f1702c5SYu Xiangning return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); 3960f1702c5SYu Xiangning } 3970f1702c5SYu Xiangning 3980f1702c5SYu Xiangning int 3990f1702c5SYu Xiangning socket_poll(struct sonode *so, short events, int anyyet, short *reventsp, 4000f1702c5SYu Xiangning struct pollhead **phpp) 4010f1702c5SYu Xiangning { 4020f1702c5SYu Xiangning return (SOP_POLL(so, events, anyyet, reventsp, phpp)); 4030f1702c5SYu Xiangning } 4040f1702c5SYu Xiangning 4050f1702c5SYu Xiangning int 4060f1702c5SYu Xiangning socket_close(struct sonode *so, int flag, struct cred *cr) 4070f1702c5SYu Xiangning { 4080f1702c5SYu Xiangning return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL)); 4090f1702c5SYu Xiangning } 4100f1702c5SYu Xiangning 4110f1702c5SYu Xiangning int 4120f1702c5SYu Xiangning socket_close_internal(struct sonode *so, int flag, cred_t *cr) 4130f1702c5SYu Xiangning { 4140f1702c5SYu Xiangning ASSERT(so->so_count == 0); 4150f1702c5SYu Xiangning 4160f1702c5SYu Xiangning return (SOP_CLOSE(so, flag, cr)); 4170f1702c5SYu Xiangning } 4180f1702c5SYu Xiangning 4190f1702c5SYu Xiangning void 4200f1702c5SYu Xiangning socket_destroy(struct sonode *so) 4210f1702c5SYu Xiangning { 4220f1702c5SYu Xiangning vn_invalid(SOTOV(so)); 4230f1702c5SYu Xiangning VN_RELE(SOTOV(so)); 4240f1702c5SYu Xiangning } 4250f1702c5SYu Xiangning 4260f1702c5SYu Xiangning /* ARGSUSED */ 4270f1702c5SYu Xiangning void 4280f1702c5SYu Xiangning socket_destroy_internal(struct sonode *so, cred_t *cr) 4290f1702c5SYu Xiangning { 4300f1702c5SYu Xiangning struct sockparams *sp = so->so_sockparams; 4310f1702c5SYu Xiangning ASSERT(so->so_count == 0 && sp != NULL); 4320f1702c5SYu Xiangning 4330f1702c5SYu Xiangning sp->sp_smod_info->smod_sock_destroy_func(so); 4340f1702c5SYu Xiangning 4350f1702c5SYu Xiangning SOCKPARAMS_DEC_REF(sp); 4360f1702c5SYu Xiangning } 4370f1702c5SYu Xiangning 4380f1702c5SYu Xiangning /* 4390f1702c5SYu Xiangning * TODO Once the common vnode ops is available, then the vnops argument 4400f1702c5SYu Xiangning * should be removed. 4410f1702c5SYu Xiangning */ 4420f1702c5SYu Xiangning /*ARGSUSED*/ 4430f1702c5SYu Xiangning int 4440f1702c5SYu Xiangning sonode_constructor(void *buf, void *cdrarg, int kmflags) 4450f1702c5SYu Xiangning { 4460f1702c5SYu Xiangning struct sonode *so = buf; 4470f1702c5SYu Xiangning struct vnode *vp; 4480f1702c5SYu Xiangning 4490f1702c5SYu Xiangning vp = so->so_vnode = vn_alloc(kmflags); 4500f1702c5SYu Xiangning if (vp == NULL) { 4510f1702c5SYu Xiangning return (-1); 4520f1702c5SYu Xiangning } 4530f1702c5SYu Xiangning vp->v_data = so; 4540f1702c5SYu Xiangning vn_setops(vp, socket_vnodeops); 4550f1702c5SYu Xiangning 4560f1702c5SYu Xiangning so->so_priv = NULL; 4570f1702c5SYu Xiangning so->so_oobmsg = NULL; 4580f1702c5SYu Xiangning 4590f1702c5SYu Xiangning so->so_proto_handle = NULL; 4600f1702c5SYu Xiangning 4610f1702c5SYu Xiangning so->so_peercred = NULL; 4620f1702c5SYu Xiangning 4630f1702c5SYu Xiangning so->so_rcv_queued = 0; 4640f1702c5SYu Xiangning so->so_rcv_q_head = NULL; 4650f1702c5SYu Xiangning so->so_rcv_q_last_head = NULL; 4660f1702c5SYu Xiangning so->so_rcv_head = NULL; 4670f1702c5SYu Xiangning so->so_rcv_last_head = NULL; 4680f1702c5SYu Xiangning so->so_rcv_wanted = 0; 4690f1702c5SYu Xiangning so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER; 4700f1702c5SYu Xiangning so->so_rcv_timer_tid = 0; 4710f1702c5SYu Xiangning so->so_rcv_thresh = 0; 4720f1702c5SYu Xiangning 4730f1702c5SYu Xiangning so->so_acceptq_head = NULL; 4740f1702c5SYu Xiangning so->so_acceptq_tail = &so->so_acceptq_head; 4750f1702c5SYu Xiangning so->so_acceptq_next = NULL; 4760f1702c5SYu Xiangning so->so_acceptq_len = 0; 4770f1702c5SYu Xiangning so->so_backlog = 0; 4780f1702c5SYu Xiangning 4790f1702c5SYu Xiangning so->so_snd_qfull = B_FALSE; 4800f1702c5SYu Xiangning 4810f1702c5SYu Xiangning mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); 4820f1702c5SYu Xiangning mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); 4830f1702c5SYu Xiangning rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); 4840f1702c5SYu Xiangning cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); 4850f1702c5SYu Xiangning cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); 4860f1702c5SYu Xiangning 4870f1702c5SYu Xiangning cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL); 4880f1702c5SYu Xiangning cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL); 4890f1702c5SYu Xiangning cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL); 4900f1702c5SYu Xiangning cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); 4910f1702c5SYu Xiangning cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); 4920f1702c5SYu Xiangning 4930f1702c5SYu Xiangning return (0); 4940f1702c5SYu Xiangning } 4950f1702c5SYu Xiangning 4960f1702c5SYu Xiangning /*ARGSUSED*/ 4970f1702c5SYu Xiangning void 4980f1702c5SYu Xiangning sonode_destructor(void *buf, void *cdrarg) 4990f1702c5SYu Xiangning { 5000f1702c5SYu Xiangning struct sonode *so = buf; 5010f1702c5SYu Xiangning struct vnode *vp = SOTOV(so); 5020f1702c5SYu Xiangning 5030f1702c5SYu Xiangning ASSERT(so->so_priv == NULL); 5040f1702c5SYu Xiangning ASSERT(so->so_peercred == NULL); 5050f1702c5SYu Xiangning 5060f1702c5SYu Xiangning ASSERT(so->so_oobmsg == NULL); 5070f1702c5SYu Xiangning 5080f1702c5SYu Xiangning ASSERT(so->so_rcv_q_head == NULL); 5090f1702c5SYu Xiangning 5100f1702c5SYu Xiangning ASSERT(so->so_acceptq_head == NULL); 5110f1702c5SYu Xiangning ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 5120f1702c5SYu Xiangning ASSERT(so->so_acceptq_next == NULL); 5130f1702c5SYu Xiangning 5140f1702c5SYu Xiangning ASSERT(vp->v_data == so); 5150f1702c5SYu Xiangning ASSERT(vn_matchops(vp, socket_vnodeops)); 5160f1702c5SYu Xiangning 5170f1702c5SYu Xiangning vn_free(vp); 5180f1702c5SYu Xiangning 5190f1702c5SYu Xiangning mutex_destroy(&so->so_lock); 5200f1702c5SYu Xiangning mutex_destroy(&so->so_acceptq_lock); 5210f1702c5SYu Xiangning rw_destroy(&so->so_fallback_rwlock); 5220f1702c5SYu Xiangning 5230f1702c5SYu Xiangning cv_destroy(&so->so_state_cv); 5240f1702c5SYu Xiangning cv_destroy(&so->so_want_cv); 5250f1702c5SYu Xiangning cv_destroy(&so->so_acceptq_cv); 5260f1702c5SYu Xiangning cv_destroy(&so->so_snd_cv); 5270f1702c5SYu Xiangning cv_destroy(&so->so_rcv_cv); 5280f1702c5SYu Xiangning cv_destroy(&so->so_closing_cv); 5290f1702c5SYu Xiangning } 5300f1702c5SYu Xiangning 5310f1702c5SYu Xiangning void 5320f1702c5SYu Xiangning sonode_init(struct sonode *so, struct sockparams *sp, int family, 5330f1702c5SYu Xiangning int type, int protocol, sonodeops_t *sops) 5340f1702c5SYu Xiangning { 5350f1702c5SYu Xiangning vnode_t *vp; 5360f1702c5SYu Xiangning 5370f1702c5SYu Xiangning vp = SOTOV(so); 5380f1702c5SYu Xiangning 5390f1702c5SYu Xiangning so->so_flag = 0; 5400f1702c5SYu Xiangning 5410f1702c5SYu Xiangning so->so_state = 0; 5420f1702c5SYu Xiangning so->so_mode = 0; 5430f1702c5SYu Xiangning 5440f1702c5SYu Xiangning so->so_count = 0; 5450f1702c5SYu Xiangning 5460f1702c5SYu Xiangning so->so_family = family; 5470f1702c5SYu Xiangning so->so_type = type; 5480f1702c5SYu Xiangning so->so_protocol = protocol; 5490f1702c5SYu Xiangning 5500f1702c5SYu Xiangning SOCK_CONNID_INIT(so->so_proto_connid); 5510f1702c5SYu Xiangning 5520f1702c5SYu Xiangning so->so_options = 0; 5530f1702c5SYu Xiangning so->so_linger.l_onoff = 0; 5540f1702c5SYu Xiangning so->so_linger.l_linger = 0; 5550f1702c5SYu Xiangning so->so_sndbuf = 0; 5560f1702c5SYu Xiangning so->so_error = 0; 5570f1702c5SYu Xiangning so->so_rcvtimeo = 0; 5580f1702c5SYu Xiangning so->so_sndtimeo = 0; 559a5adac4dSYu Xiangning so->so_xpg_rcvbuf = 0; 5600f1702c5SYu Xiangning 5610f1702c5SYu Xiangning ASSERT(so->so_oobmsg == NULL); 5620f1702c5SYu Xiangning so->so_oobmark = 0; 5630f1702c5SYu Xiangning so->so_pgrp = 0; 5640f1702c5SYu Xiangning 5650f1702c5SYu Xiangning ASSERT(so->so_peercred == NULL); 5660f1702c5SYu Xiangning 5670f1702c5SYu Xiangning so->so_zoneid = getzoneid(); 5680f1702c5SYu Xiangning 5690f1702c5SYu Xiangning so->so_sockparams = sp; 5700f1702c5SYu Xiangning 5710f1702c5SYu Xiangning so->so_ops = sops; 5720f1702c5SYu Xiangning 573d36be52eSRao Shoaib so->so_not_str = (sops != &sotpi_sonodeops); 574d36be52eSRao Shoaib 5750f1702c5SYu Xiangning so->so_proto_handle = NULL; 5760f1702c5SYu Xiangning 5770f1702c5SYu Xiangning so->so_downcalls = NULL; 5780f1702c5SYu Xiangning 5790f1702c5SYu Xiangning so->so_copyflag = 0; 5800f1702c5SYu Xiangning 5810f1702c5SYu Xiangning ASSERT(so->so_acceptq_head == NULL); 5820f1702c5SYu Xiangning ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 5830f1702c5SYu Xiangning ASSERT(so->so_acceptq_next == NULL); 5840f1702c5SYu Xiangning 5850f1702c5SYu Xiangning vn_reinit(vp); 5860f1702c5SYu Xiangning vp->v_vfsp = rootvfs; 5870f1702c5SYu Xiangning vp->v_type = VSOCK; 5880f1702c5SYu Xiangning vp->v_rdev = sockdev; 5890f1702c5SYu Xiangning 5900f1702c5SYu Xiangning so->so_rcv_queued = 0; 5910f1702c5SYu Xiangning so->so_rcv_q_head = NULL; 5920f1702c5SYu Xiangning so->so_rcv_q_last_head = NULL; 5930f1702c5SYu Xiangning so->so_rcv_head = NULL; 5940f1702c5SYu Xiangning so->so_rcv_last_head = NULL; 5950f1702c5SYu Xiangning 5960f1702c5SYu Xiangning so->so_snd_qfull = B_FALSE; 5970f1702c5SYu Xiangning so->so_minpsz = 0; 5980f1702c5SYu Xiangning 5990f1702c5SYu Xiangning so->so_rcv_wakeup = B_FALSE; 6000f1702c5SYu Xiangning so->so_snd_wakeup = B_FALSE; 6010f1702c5SYu Xiangning so->so_flowctrld = B_FALSE; 6020f1702c5SYu Xiangning 6030f1702c5SYu Xiangning so->so_pollev = 0; 6040f1702c5SYu Xiangning bzero(&so->so_poll_list, sizeof (so->so_poll_list)); 6050f1702c5SYu Xiangning bzero(&so->so_proto_props, sizeof (struct sock_proto_props)); 6060f1702c5SYu Xiangning 6070f1702c5SYu Xiangning bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); 6080f1702c5SYu Xiangning so->so_ksock_cb_arg = NULL; 6090f1702c5SYu Xiangning 6100f1702c5SYu Xiangning so->so_max_addr_len = sizeof (struct sockaddr_storage); 6110f1702c5SYu Xiangning 6120f1702c5SYu Xiangning so->so_direct = NULL; 6130f1702c5SYu Xiangning 6140f1702c5SYu Xiangning vn_exists(vp); 6150f1702c5SYu Xiangning } 6160f1702c5SYu Xiangning 6170f1702c5SYu Xiangning void 6180f1702c5SYu Xiangning sonode_fini(struct sonode *so) 6190f1702c5SYu Xiangning { 6200f1702c5SYu Xiangning mblk_t *mp; 6210f1702c5SYu Xiangning vnode_t *vp; 6220f1702c5SYu Xiangning 6230f1702c5SYu Xiangning ASSERT(so->so_count == 0); 6240f1702c5SYu Xiangning 6250f1702c5SYu Xiangning if (so->so_rcv_timer_tid) { 6260f1702c5SYu Xiangning ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 6270f1702c5SYu Xiangning (void) untimeout(so->so_rcv_timer_tid); 6280f1702c5SYu Xiangning so->so_rcv_timer_tid = 0; 6290f1702c5SYu Xiangning } 6300f1702c5SYu Xiangning 6310f1702c5SYu Xiangning so_acceptq_flush(so); 6320f1702c5SYu Xiangning 6330f1702c5SYu Xiangning if ((mp = so->so_oobmsg) != NULL) { 6340f1702c5SYu Xiangning freemsg(mp); 6350f1702c5SYu Xiangning so->so_oobmsg = NULL; 6360f1702c5SYu Xiangning so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| 6370f1702c5SYu Xiangning SS_RCVATMARK); 6380f1702c5SYu Xiangning } 6390f1702c5SYu Xiangning 6400f1702c5SYu Xiangning if (so->so_poll_list.ph_list != NULL) { 6410f1702c5SYu Xiangning pollwakeup(&so->so_poll_list, POLLERR); 6420f1702c5SYu Xiangning pollhead_clean(&so->so_poll_list); 6430f1702c5SYu Xiangning } 6440f1702c5SYu Xiangning 6450f1702c5SYu Xiangning if (so->so_direct != NULL) { 6460f1702c5SYu Xiangning sodirect_t *sodp = so->so_direct; 6470f1702c5SYu Xiangning 6480f1702c5SYu Xiangning ASSERT(sodp->sod_uioafh == NULL); 6490f1702c5SYu Xiangning 6500f1702c5SYu Xiangning so->so_direct = NULL; 6510f1702c5SYu Xiangning kmem_cache_free(sock_sod_cache, sodp); 6520f1702c5SYu Xiangning } 6530f1702c5SYu Xiangning 6540f1702c5SYu Xiangning vp = SOTOV(so); 6550f1702c5SYu Xiangning vn_invalid(vp); 6560f1702c5SYu Xiangning 6570f1702c5SYu Xiangning if (so->so_peercred != NULL) { 6580f1702c5SYu Xiangning crfree(so->so_peercred); 6590f1702c5SYu Xiangning so->so_peercred = NULL; 6600f1702c5SYu Xiangning } 6610f1702c5SYu Xiangning } 6620f1702c5SYu Xiangning 6630f1702c5SYu Xiangning /* 6640f1702c5SYu Xiangning * This function is called at the beginning of recvmsg(). 6650f1702c5SYu Xiangning * 6660f1702c5SYu Xiangning * If I/OAT is enabled on this sonode, initialize the uioa state machine 6670f1702c5SYu Xiangning * with state UIOA_ALLOC. 6680f1702c5SYu Xiangning */ 6690f1702c5SYu Xiangning uio_t * 6700f1702c5SYu Xiangning sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp) 6710f1702c5SYu Xiangning { 6720f1702c5SYu Xiangning struct uio *suiop; 6730f1702c5SYu Xiangning struct uio *uiop; 6740f1702c5SYu Xiangning sodirect_t *sodp = so->so_direct; 6750f1702c5SYu Xiangning 6760f1702c5SYu Xiangning if (sodp == NULL) 6770f1702c5SYu Xiangning return (NULL); 6780f1702c5SYu Xiangning 6790f1702c5SYu Xiangning suiop = NULL; 6800f1702c5SYu Xiangning uiop = *uiopp; 6810f1702c5SYu Xiangning 6820f1702c5SYu Xiangning mutex_enter(sodp->sod_lockp); 6830f1702c5SYu Xiangning if (uiop->uio_resid >= uioasync.mincnt && 6840f1702c5SYu Xiangning sodp != NULL && (sodp->sod_state & SOD_ENABLED) && 6850f1702c5SYu Xiangning uioasync.enabled && !(flags & MSG_PEEK) && 6860f1702c5SYu Xiangning !(so->so_state & SS_CANTRCVMORE)) { 6870f1702c5SYu Xiangning /* 6880f1702c5SYu Xiangning * Big enough I/O for uioa min setup and an sodirect socket 6890f1702c5SYu Xiangning * and sodirect enabled and uioa enabled and I/O will be done 6900f1702c5SYu Xiangning * and not EOF so initialize the sodirect_t uioa_t with "uiop". 6910f1702c5SYu Xiangning */ 6920f1702c5SYu Xiangning if (!uioainit(uiop, &sodp->sod_uioa)) { 6930f1702c5SYu Xiangning /* 6940f1702c5SYu Xiangning * Successful uioainit() so the uio_t part of the 6950f1702c5SYu Xiangning * uioa_t will be used for all uio_t work to follow, 6960f1702c5SYu Xiangning * we return the original "uiop" in "suiop". 6970f1702c5SYu Xiangning */ 6980f1702c5SYu Xiangning suiop = uiop; 6990f1702c5SYu Xiangning *uiopp = (uio_t *)&sodp->sod_uioa; 7000f1702c5SYu Xiangning /* 7010f1702c5SYu Xiangning * Before returning to the caller the passed in uio_t 7020f1702c5SYu Xiangning * "uiop" will be updated via a call to uioafini() 7030f1702c5SYu Xiangning * below. 7040f1702c5SYu Xiangning * 7050f1702c5SYu Xiangning * Note, the uioa.uioa_state isn't set to UIOA_ENABLED 7060f1702c5SYu Xiangning * here as first we have to uioamove() any currently 7070f1702c5SYu Xiangning * queued M_DATA mblk_t(s) so it will be done later. 7080f1702c5SYu Xiangning */ 7090f1702c5SYu Xiangning } 7100f1702c5SYu Xiangning /* 7110f1702c5SYu Xiangning * In either uioainit() success or not case note the number 7120f1702c5SYu Xiangning * of uio bytes the caller wants for sod framework and/or 7130f1702c5SYu Xiangning * transport (e.g. TCP) strategy. 7140f1702c5SYu Xiangning */ 7150f1702c5SYu Xiangning sodp->sod_want = uiop->uio_resid; 7160f1702c5SYu Xiangning } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { 7170f1702c5SYu Xiangning /* 7180f1702c5SYu Xiangning * No uioa but still using sodirect so note the number of 7190f1702c5SYu Xiangning * uio bytes the caller wants for sodirect framework and/or 7200f1702c5SYu Xiangning * transport (e.g. TCP) strategy. 7210f1702c5SYu Xiangning */ 7220f1702c5SYu Xiangning sodp->sod_want = uiop->uio_resid; 7230f1702c5SYu Xiangning } 7240f1702c5SYu Xiangning mutex_exit(sodp->sod_lockp); 7250f1702c5SYu Xiangning 7260f1702c5SYu Xiangning return (suiop); 7270f1702c5SYu Xiangning } 7280f1702c5SYu Xiangning 7290f1702c5SYu Xiangning /* 7300f1702c5SYu Xiangning * This function is called at the end of recvmsg(), it finializes all the I/OAT 7310f1702c5SYu Xiangning * operations, and reset the uioa state to UIOA_ALLOC. 7320f1702c5SYu Xiangning */ 7330f1702c5SYu Xiangning int 7340f1702c5SYu Xiangning sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop) 7350f1702c5SYu Xiangning { 7360f1702c5SYu Xiangning int error = 0; 7370f1702c5SYu Xiangning sodirect_t *sodp = so->so_direct; 7380f1702c5SYu Xiangning mblk_t *mp; 7390f1702c5SYu Xiangning 7400f1702c5SYu Xiangning if (sodp == NULL) { 7410f1702c5SYu Xiangning return (0); 7420f1702c5SYu Xiangning } 7430f1702c5SYu Xiangning 7440f1702c5SYu Xiangning ASSERT(MUTEX_HELD(sodp->sod_lockp)); 7450f1702c5SYu Xiangning /* Finish any sodirect and uioa processing */ 7460f1702c5SYu Xiangning if (suiop != NULL) { 7470f1702c5SYu Xiangning /* Finish any uioa_t processing */ 7480f1702c5SYu Xiangning 7490f1702c5SYu Xiangning ASSERT(uiop == (uio_t *)&sodp->sod_uioa); 7500f1702c5SYu Xiangning error = uioafini(suiop, (uioa_t *)uiop); 7510f1702c5SYu Xiangning if ((mp = sodp->sod_uioafh) != NULL) { 7520f1702c5SYu Xiangning sodp->sod_uioafh = NULL; 7530f1702c5SYu Xiangning sodp->sod_uioaft = NULL; 7540f1702c5SYu Xiangning freemsg(mp); 7550f1702c5SYu Xiangning } 7560f1702c5SYu Xiangning } 7570f1702c5SYu Xiangning ASSERT(sodp->sod_uioafh == NULL); 7580f1702c5SYu Xiangning if (!(sodp->sod_state & SOD_WAKE_NOT)) { 7590f1702c5SYu Xiangning /* Awoke */ 7600f1702c5SYu Xiangning sodp->sod_state &= SOD_WAKE_CLR; 7610f1702c5SYu Xiangning sodp->sod_state |= SOD_WAKE_NOT; 7620f1702c5SYu Xiangning } 7630f1702c5SYu Xiangning /* Last, clear sod_want value */ 7640f1702c5SYu Xiangning sodp->sod_want = 0; 7650f1702c5SYu Xiangning 7660f1702c5SYu Xiangning return (error); 7670f1702c5SYu Xiangning } 7680f1702c5SYu Xiangning 7690f1702c5SYu Xiangning /* 7700f1702c5SYu Xiangning * Schedule a uioamove() on a mblk. This is ususally called from 7710f1702c5SYu Xiangning * protocols (e.g. TCP) on a I/OAT enabled sonode. 7720f1702c5SYu Xiangning */ 7730f1702c5SYu Xiangning mblk_t * 7740f1702c5SYu Xiangning sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size) 7750f1702c5SYu Xiangning { 7760f1702c5SYu Xiangning uioa_t *uioap = &sodp->sod_uioa; 7770f1702c5SYu Xiangning mblk_t *mp1 = mp; 7780f1702c5SYu Xiangning mblk_t *lmp = NULL; 7790f1702c5SYu Xiangning 7800f1702c5SYu Xiangning ASSERT(DB_TYPE(mp) == M_DATA); 7810f1702c5SYu Xiangning ASSERT(msg_size == msgdsize(mp)); 7820f1702c5SYu Xiangning 7830f1702c5SYu Xiangning /* Caller must have lock held */ 7840f1702c5SYu Xiangning ASSERT(MUTEX_HELD(sodp->sod_lockp)); 7850f1702c5SYu Xiangning 7860f1702c5SYu Xiangning if (uioap->uioa_state & UIOA_ENABLED) { 7870f1702c5SYu Xiangning /* Uioa is enabled */ 7880f1702c5SYu Xiangning 7890f1702c5SYu Xiangning if (msg_size > uioap->uio_resid) { 7900f1702c5SYu Xiangning /* 7910f1702c5SYu Xiangning * There isn't enough uio space for the mblk_t chain 7920f1702c5SYu Xiangning * so disable uioa such that this and any additional 7930f1702c5SYu Xiangning * mblk_t data is handled by the socket and schedule 7940f1702c5SYu Xiangning * the socket for wakeup to finish this uioa. 7950f1702c5SYu Xiangning */ 7960f1702c5SYu Xiangning uioap->uioa_state &= UIOA_CLR; 7970f1702c5SYu Xiangning uioap->uioa_state |= UIOA_FINI; 7980f1702c5SYu Xiangning if (sodp->sod_state & SOD_WAKE_NOT) { 7990f1702c5SYu Xiangning sodp->sod_state &= SOD_WAKE_CLR; 8000f1702c5SYu Xiangning sodp->sod_state |= SOD_WAKE_NEED; 8010f1702c5SYu Xiangning } 8020f1702c5SYu Xiangning return (mp); 8030f1702c5SYu Xiangning } 8040f1702c5SYu Xiangning do { 8050f1702c5SYu Xiangning uint32_t len = MBLKL(mp1); 8060f1702c5SYu Xiangning 8070f1702c5SYu Xiangning if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { 8080f1702c5SYu Xiangning /* Scheduled, mark dblk_t as such */ 8090f1702c5SYu Xiangning DB_FLAGS(mp1) |= DBLK_UIOA; 8100f1702c5SYu Xiangning } else { 8110f1702c5SYu Xiangning /* Error, turn off async processing */ 8120f1702c5SYu Xiangning uioap->uioa_state &= UIOA_CLR; 8130f1702c5SYu Xiangning uioap->uioa_state |= UIOA_FINI; 8140f1702c5SYu Xiangning break; 8150f1702c5SYu Xiangning } 8160f1702c5SYu Xiangning lmp = mp1; 8170f1702c5SYu Xiangning } while ((mp1 = mp1->b_cont) != NULL); 8180f1702c5SYu Xiangning 8190f1702c5SYu Xiangning if (mp1 != NULL || uioap->uio_resid == 0) { 8200f1702c5SYu Xiangning /* 8210f1702c5SYu Xiangning * Not all mblk_t(s) uioamoved (error) or all uio 8220f1702c5SYu Xiangning * space has been consumed so schedule the socket 8230f1702c5SYu Xiangning * for wakeup to finish this uio. 8240f1702c5SYu Xiangning */ 8250f1702c5SYu Xiangning sodp->sod_state &= SOD_WAKE_CLR; 8260f1702c5SYu Xiangning sodp->sod_state |= SOD_WAKE_NEED; 8270f1702c5SYu Xiangning 8280f1702c5SYu Xiangning /* Break the mblk chain if neccessary. */ 8290f1702c5SYu Xiangning if (mp1 != NULL && lmp != NULL) { 8300f1702c5SYu Xiangning mp->b_next = mp1; 8310f1702c5SYu Xiangning lmp->b_cont = NULL; 8320f1702c5SYu Xiangning } 8330f1702c5SYu Xiangning } 8340f1702c5SYu Xiangning } 8350f1702c5SYu Xiangning return (mp1); 8360f1702c5SYu Xiangning } 8370f1702c5SYu Xiangning 8380f1702c5SYu Xiangning /* 8390f1702c5SYu Xiangning * This function is called on a mblk that thas been successfully uioamoved(). 8400f1702c5SYu Xiangning */ 8410f1702c5SYu Xiangning void 8420f1702c5SYu Xiangning sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp) 8430f1702c5SYu Xiangning { 8440f1702c5SYu Xiangning if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { 8450f1702c5SYu Xiangning /* 8460f1702c5SYu Xiangning * A uioa flaged mblk_t chain, already uio processed, 8470f1702c5SYu Xiangning * add it to the sodirect uioa pending free list. 8480f1702c5SYu Xiangning * 8490f1702c5SYu Xiangning * Note, a b_cont chain headed by a DBLK_UIOA enable 8500f1702c5SYu Xiangning * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. 8510f1702c5SYu Xiangning */ 8520f1702c5SYu Xiangning mblk_t *bpt = sodp->sod_uioaft; 8530f1702c5SYu Xiangning 8540f1702c5SYu Xiangning ASSERT(sodp != NULL); 8550f1702c5SYu Xiangning 8560f1702c5SYu Xiangning /* 8570f1702c5SYu Xiangning * Add first mblk_t of "bp" chain to current sodirect uioa 8580f1702c5SYu Xiangning * free list tail mblk_t, if any, else empty list so new head. 8590f1702c5SYu Xiangning */ 8600f1702c5SYu Xiangning if (bpt == NULL) 8610f1702c5SYu Xiangning sodp->sod_uioafh = bp; 8620f1702c5SYu Xiangning else 8630f1702c5SYu Xiangning bpt->b_cont = bp; 8640f1702c5SYu Xiangning 8650f1702c5SYu Xiangning /* 8660f1702c5SYu Xiangning * Walk mblk_t "bp" chain to find tail and adjust rptr of 8670f1702c5SYu Xiangning * each to reflect that uioamove() has consumed all data. 8680f1702c5SYu Xiangning */ 8690f1702c5SYu Xiangning bpt = bp; 8700f1702c5SYu Xiangning for (;;) { 8710f1702c5SYu Xiangning ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); 8720f1702c5SYu Xiangning 8730f1702c5SYu Xiangning bpt->b_rptr = bpt->b_wptr; 8740f1702c5SYu Xiangning if (bpt->b_cont == NULL) 8750f1702c5SYu Xiangning break; 8760f1702c5SYu Xiangning bpt = bpt->b_cont; 8770f1702c5SYu Xiangning } 8780f1702c5SYu Xiangning /* New sodirect uioa free list tail */ 8790f1702c5SYu Xiangning sodp->sod_uioaft = bpt; 8800f1702c5SYu Xiangning 8810f1702c5SYu Xiangning /* Only dequeue once with data returned per uioa_t */ 8820f1702c5SYu Xiangning if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { 8830f1702c5SYu Xiangning sodp->sod_uioa.uioa_state &= UIOA_CLR; 8840f1702c5SYu Xiangning sodp->sod_uioa.uioa_state |= UIOA_FINI; 8850f1702c5SYu Xiangning } 8860f1702c5SYu Xiangning } 8870f1702c5SYu Xiangning } 8880f1702c5SYu Xiangning 8890f1702c5SYu Xiangning /* 8900f1702c5SYu Xiangning * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call 8910f1702c5SYu Xiangning * this function on a non-STREAMS socket to schedule uioamove() on the data 8920f1702c5SYu Xiangning * that has already queued in this socket. 8930f1702c5SYu Xiangning */ 8940f1702c5SYu Xiangning void 8950f1702c5SYu Xiangning sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop) 8960f1702c5SYu Xiangning { 8970f1702c5SYu Xiangning uioa_t *uioap = (uioa_t *)uiop; 8980f1702c5SYu Xiangning mblk_t *lbp; 8990f1702c5SYu Xiangning mblk_t *wbp; 9000f1702c5SYu Xiangning mblk_t *bp; 9010f1702c5SYu Xiangning int len; 9020f1702c5SYu Xiangning int error; 9030f1702c5SYu Xiangning boolean_t in_rcv_q = B_TRUE; 9040f1702c5SYu Xiangning 9050f1702c5SYu Xiangning ASSERT(MUTEX_HELD(sodp->sod_lockp)); 9060f1702c5SYu Xiangning ASSERT(&sodp->sod_uioa == uioap); 9070f1702c5SYu Xiangning 9080f1702c5SYu Xiangning /* 9090f1702c5SYu Xiangning * Walk first b_cont chain in sod_q 9100f1702c5SYu Xiangning * and schedule any M_DATA mblk_t's for uio asynchronous move. 9110f1702c5SYu Xiangning */ 9120f1702c5SYu Xiangning bp = so->so_rcv_q_head; 9130f1702c5SYu Xiangning 9140f1702c5SYu Xiangning again: 9150f1702c5SYu Xiangning /* Walk the chain */ 9160f1702c5SYu Xiangning lbp = NULL; 9170f1702c5SYu Xiangning wbp = bp; 9180f1702c5SYu Xiangning 9190f1702c5SYu Xiangning do { 9200f1702c5SYu Xiangning if (bp == NULL) 9210f1702c5SYu Xiangning break; 9220f1702c5SYu Xiangning 9230f1702c5SYu Xiangning if (wbp->b_datap->db_type != M_DATA) { 9240f1702c5SYu Xiangning /* Not M_DATA, no more uioa */ 9250f1702c5SYu Xiangning goto nouioa; 9260f1702c5SYu Xiangning } 9270f1702c5SYu Xiangning if ((len = wbp->b_wptr - wbp->b_rptr) > 0) { 9280f1702c5SYu Xiangning /* Have a M_DATA mblk_t with data */ 9290f1702c5SYu Xiangning if (len > uioap->uio_resid || (so->so_oobmark > 0 && 9300f1702c5SYu Xiangning len + uioap->uioa_mbytes >= so->so_oobmark)) { 9310f1702c5SYu Xiangning /* Not enough uio sapce, or beyond oobmark */ 9320f1702c5SYu Xiangning goto nouioa; 9330f1702c5SYu Xiangning } 9340f1702c5SYu Xiangning ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA)); 9350f1702c5SYu Xiangning error = uioamove(wbp->b_rptr, len, 9360f1702c5SYu Xiangning UIO_READ, uioap); 9370f1702c5SYu Xiangning if (!error) { 9380f1702c5SYu Xiangning /* Scheduled, mark dblk_t as such */ 9390f1702c5SYu Xiangning wbp->b_datap->db_flags |= DBLK_UIOA; 9400f1702c5SYu Xiangning } else { 9410f1702c5SYu Xiangning /* Break the mblk chain */ 9420f1702c5SYu Xiangning goto nouioa; 9430f1702c5SYu Xiangning } 9440f1702c5SYu Xiangning } 9450f1702c5SYu Xiangning /* Save last wbp processed */ 9460f1702c5SYu Xiangning lbp = wbp; 9470f1702c5SYu Xiangning } while ((wbp = wbp->b_cont) != NULL); 9480f1702c5SYu Xiangning 9490f1702c5SYu Xiangning if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) { 9500f1702c5SYu Xiangning /* 9510f1702c5SYu Xiangning * We get here only once to process the sonode dump area 9520f1702c5SYu Xiangning * if so_rcv_q_head is NULL or all the mblks have been 9530f1702c5SYu Xiangning * successfully uioamoved()ed. 9540f1702c5SYu Xiangning */ 9550f1702c5SYu Xiangning in_rcv_q = B_FALSE; 9560f1702c5SYu Xiangning 9570f1702c5SYu Xiangning /* move to dump area */ 9580f1702c5SYu Xiangning bp = so->so_rcv_head; 9590f1702c5SYu Xiangning goto again; 9600f1702c5SYu Xiangning } 9610f1702c5SYu Xiangning 9620f1702c5SYu Xiangning return; 9630f1702c5SYu Xiangning 9640f1702c5SYu Xiangning nouioa: 9650f1702c5SYu Xiangning /* No more uioa */ 9660f1702c5SYu Xiangning uioap->uioa_state &= UIOA_CLR; 9670f1702c5SYu Xiangning uioap->uioa_state |= UIOA_FINI; 9680f1702c5SYu Xiangning 9690f1702c5SYu Xiangning /* 9700f1702c5SYu Xiangning * If we processed 1 or more mblk_t(s) then we need to split the 9710f1702c5SYu Xiangning * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s) 9720f1702c5SYu Xiangning * are in the current chain and the rest are in the following new 9730f1702c5SYu Xiangning * chain. 9740f1702c5SYu Xiangning */ 9750f1702c5SYu Xiangning if (lbp != NULL) { 9760f1702c5SYu Xiangning /* New end of current chain */ 9770f1702c5SYu Xiangning lbp->b_cont = NULL; 9780f1702c5SYu Xiangning 9790f1702c5SYu Xiangning /* Insert new chain wbp after bp */ 9800f1702c5SYu Xiangning if ((wbp->b_next = bp->b_next) == NULL) { 9810f1702c5SYu Xiangning /* 9820f1702c5SYu Xiangning * No need to grab so_lock, since sod_lockp 9830f1702c5SYu Xiangning * points to so_lock. 9840f1702c5SYu Xiangning */ 9850f1702c5SYu Xiangning if (in_rcv_q) 9860f1702c5SYu Xiangning so->so_rcv_q_last_head = wbp; 9870f1702c5SYu Xiangning else 9880f1702c5SYu Xiangning so->so_rcv_last_head = wbp; 9890f1702c5SYu Xiangning } 9900f1702c5SYu Xiangning bp->b_next = wbp; 9910f1702c5SYu Xiangning bp->b_next->b_prev = bp->b_prev; 9920f1702c5SYu Xiangning bp->b_prev = lbp; 9930f1702c5SYu Xiangning } 9940f1702c5SYu Xiangning } 9950f1702c5SYu Xiangning 9960f1702c5SYu Xiangning /* 9970f1702c5SYu Xiangning * Initialize sodirect data structures on a socket. 9980f1702c5SYu Xiangning */ 9990f1702c5SYu Xiangning void 10000f1702c5SYu Xiangning sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func, 10010f1702c5SYu Xiangning sod_wakeup_func wake_func, kmutex_t *lockp) 10020f1702c5SYu Xiangning { 10030f1702c5SYu Xiangning sodirect_t *sodp; 10040f1702c5SYu Xiangning 10050f1702c5SYu Xiangning ASSERT(so->so_direct == NULL); 10060f1702c5SYu Xiangning 10070f1702c5SYu Xiangning so->so_state |= SS_SODIRECT; 10080f1702c5SYu Xiangning 10090f1702c5SYu Xiangning sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP); 10100f1702c5SYu Xiangning sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; 10110f1702c5SYu Xiangning sodp->sod_want = 0; 10120f1702c5SYu Xiangning sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL; 10130f1702c5SYu Xiangning sodp->sod_enqueue = enq_func; 10140f1702c5SYu Xiangning sodp->sod_wakeup = wake_func; 10150f1702c5SYu Xiangning sodp->sod_uioafh = NULL; 10160f1702c5SYu Xiangning sodp->sod_uioaft = NULL; 10170f1702c5SYu Xiangning sodp->sod_lockp = lockp; 10180f1702c5SYu Xiangning /* 10190f1702c5SYu Xiangning * Remainder of the sod_uioa members are left uninitialized 10200f1702c5SYu Xiangning * but will be initialized later by uioainit() before uioa 10210f1702c5SYu Xiangning * is enabled. 10220f1702c5SYu Xiangning */ 10230f1702c5SYu Xiangning sodp->sod_uioa.uioa_state = UIOA_ALLOC; 10240f1702c5SYu Xiangning so->so_direct = sodp; 10250f1702c5SYu Xiangning if (stp != NULL) 10260f1702c5SYu Xiangning stp->sd_sodirect = sodp; 10270f1702c5SYu Xiangning } 10280f1702c5SYu Xiangning 10290f1702c5SYu Xiangning /* 10300f1702c5SYu Xiangning * Init the sodirect kmem cache while sockfs is loading. 10310f1702c5SYu Xiangning */ 10320f1702c5SYu Xiangning void 10330f1702c5SYu Xiangning sod_init() 10340f1702c5SYu Xiangning { 10350f1702c5SYu Xiangning /* Allocate sodirect_t kmem_cache */ 10360f1702c5SYu Xiangning sock_sod_cache = kmem_cache_create("sock_sod_cache", 10370f1702c5SYu Xiangning sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 10380f1702c5SYu Xiangning } 10390f1702c5SYu Xiangning 10400f1702c5SYu Xiangning ssize_t 10410f1702c5SYu Xiangning sod_uioa_mblk(struct sonode *so, mblk_t *mp) 10420f1702c5SYu Xiangning { 10430f1702c5SYu Xiangning sodirect_t *sodp = so->so_direct; 10440f1702c5SYu Xiangning 10450f1702c5SYu Xiangning ASSERT(sodp != NULL); 10460f1702c5SYu Xiangning ASSERT(MUTEX_HELD(sodp->sod_lockp)); 10470f1702c5SYu Xiangning 10480f1702c5SYu Xiangning ASSERT(sodp->sod_state & SOD_ENABLED); 10490f1702c5SYu Xiangning ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT)); 10500f1702c5SYu Xiangning 10510f1702c5SYu Xiangning ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI)); 10520f1702c5SYu Xiangning 10530f1702c5SYu Xiangning if (mp == NULL && so->so_rcv_q_head != NULL) { 10540f1702c5SYu Xiangning mp = so->so_rcv_q_head; 10550f1702c5SYu Xiangning ASSERT(mp->b_prev != NULL); 10560f1702c5SYu Xiangning mp->b_prev = NULL; 10570f1702c5SYu Xiangning so->so_rcv_q_head = mp->b_next; 10580f1702c5SYu Xiangning if (so->so_rcv_q_head == NULL) { 10590f1702c5SYu Xiangning so->so_rcv_q_last_head = NULL; 10600f1702c5SYu Xiangning } 10610f1702c5SYu Xiangning mp->b_next = NULL; 10620f1702c5SYu Xiangning } 10630f1702c5SYu Xiangning 10640f1702c5SYu Xiangning sod_uioa_mblk_done(sodp, mp); 10650f1702c5SYu Xiangning 10660f1702c5SYu Xiangning if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL && 10670f1702c5SYu Xiangning DB_TYPE(so->so_rcv_head) == M_DATA && 10680f1702c5SYu Xiangning (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) { 10690f1702c5SYu Xiangning /* more arrived */ 10700f1702c5SYu Xiangning ASSERT(so->so_rcv_q_head == NULL); 10710f1702c5SYu Xiangning mp = so->so_rcv_head; 10720f1702c5SYu Xiangning so->so_rcv_head = mp->b_next; 10730f1702c5SYu Xiangning if (so->so_rcv_head == NULL) 10740f1702c5SYu Xiangning so->so_rcv_last_head = NULL; 10750f1702c5SYu Xiangning mp->b_prev = mp->b_next = NULL; 10760f1702c5SYu Xiangning sod_uioa_mblk_done(sodp, mp); 10770f1702c5SYu Xiangning } 10780f1702c5SYu Xiangning 10790f1702c5SYu Xiangning #ifdef DEBUG 10800f1702c5SYu Xiangning if (so->so_rcv_q_head != NULL) { 10810f1702c5SYu Xiangning mblk_t *m = so->so_rcv_q_head; 10820f1702c5SYu Xiangning while (m != NULL) { 10830f1702c5SYu Xiangning if (DB_FLAGS(m) & DBLK_UIOA) { 10840f1702c5SYu Xiangning cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 10850f1702c5SYu Xiangning " in so_rcv_q_head.\n", (void *)m); 10860f1702c5SYu Xiangning } 10870f1702c5SYu Xiangning m = m->b_next; 10880f1702c5SYu Xiangning } 10890f1702c5SYu Xiangning } 10900f1702c5SYu Xiangning if (so->so_rcv_head != NULL) { 10910f1702c5SYu Xiangning mblk_t *m = so->so_rcv_head; 10920f1702c5SYu Xiangning while (m != NULL) { 10930f1702c5SYu Xiangning if (DB_FLAGS(m) & DBLK_UIOA) { 10940f1702c5SYu Xiangning cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 10950f1702c5SYu Xiangning " in so_rcv_head.\n", (void *)m); 10960f1702c5SYu Xiangning } 10970f1702c5SYu Xiangning m = m->b_next; 10980f1702c5SYu Xiangning } 10990f1702c5SYu Xiangning } 11000f1702c5SYu Xiangning #endif 11010f1702c5SYu Xiangning return (sodp->sod_uioa.uioa_mbytes); 11020f1702c5SYu Xiangning } 1103