/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>

#include <sys/socket.h>
#include <sys/socketvar.h>

#include <sys/isa_defs.h>
#include <sys/inttypes.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/filio.h>
#include <sys/sendfile.h>
#include <sys/ddi.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>

#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>

#ifdef SOCK_TEST
int do_useracc = 1;		/* Controlled by setting SO_DEBUG to 4 */
#else
#define	do_useracc	1
#endif /* SOCK_TEST */

extern int	xnet_truncate_print;

extern void	nl7c_init(void);
extern int	sockfs_defer_nl7c_init;

/*
 * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
 *	 as there isn't a formal definition of IOV_MAX.
 */
#define	MSG_MAXIOVLEN	16

/*
 * Kernel component of socket creation.
 *
 * The socket library determines which version number to use.
 * First the library calls this with a NULL devpath. If this fails
 * to find a transport (using solookup) the library will look in /etc/netconfig
 * for the appropriate transport. If one is found it will pass in the
 * devpath for the kernel to use.
 */
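/*
 * Illustrative sketch of that fallback sequence as seen from userland
 * (not the actual libsocket source; names are for illustration only):
 *
 *	fd = _so_socket(family, type, protocol, NULL, version);
 *	if (fd == -1 && errno indicates no transport was found) {
 *		devpath = transport device from /etc/netconfig;
 *		fd = _so_socket(family, type, protocol, devpath, version);
 *	}
 */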
int
so_socket(int family, int type_w_flags, int protocol, char *devpath,
    int version)
{
	struct sonode *so;
	vnode_t *vp;
	struct file *fp;
	int fd;
	int error;
	int type;

	type = type_w_flags & SOCK_TYPE_MASK;
	type_w_flags &= ~SOCK_TYPE_MASK;
	if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
		return (set_errno(EINVAL));

	if (devpath != NULL) {
		char *buf;
		size_t kdevpathlen = 0;

		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		if ((error = copyinstr(devpath, buf,
		    MAXPATHLEN, &kdevpathlen)) != 0) {
			kmem_free(buf, MAXPATHLEN);
			return (set_errno(error));
		}
		so = socket_create(family, type, protocol, buf, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
		kmem_free(buf, MAXPATHLEN);
	} else {
		so = socket_create(family, type, protocol, NULL, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
	}
	if (so == NULL)
		return (set_errno(error));

	/* Allocate a file descriptor for the socket */
	vp = SOTOV(so);
	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
		(void) socket_close(so, 0, CRED());
		socket_destroy(so);
		return (set_errno(error));
	}

	/*
	 * Now fill in the entries that falloc reserved
	 */
	if (type_w_flags & SOCK_NDELAY) {
		so->so_state |= SS_NDELAY;
		fp->f_flag |= FNDELAY;
	}
	if (type_w_flags & SOCK_NONBLOCK) {
		so->so_state |= SS_NONBLOCK;
		fp->f_flag |= FNONBLOCK;
	}
	mutex_exit(&fp->f_tlock);
	setf(fd, fp);
	if ((type_w_flags & SOCK_CLOEXEC) != 0) {
		f_setfd(fd, FD_CLOEXEC);
	}

	return (fd);
}

/*
 * Map from a file descriptor to a socket node.
 * Returns with the file descriptor held, i.e. the caller has to
 * use releasef when done with the file descriptor.
 */
struct sonode *
getsonode(int sock, int *errorp, file_t **fpp)
{
	file_t *fp;
	vnode_t *vp;
	struct sonode *so;

	if ((fp = getf(sock)) == NULL) {
		*errorp = EBADF;
		eprintline(*errorp);
		return (NULL);
	}
	vp = fp->f_vnode;
	/* Check if it is a socket */
	if (vp->v_type != VSOCK) {
		releasef(sock);
		*errorp = ENOTSOCK;
		eprintline(*errorp);
		return (NULL);
	}
	/*
	 * Use the stream head to find the real socket vnode.
	 * This is needed when namefs sits above sockfs.
	 */
	if (vp->v_stream) {
		ASSERT(vp->v_stream->sd_vnode);
		vp = vp->v_stream->sd_vnode;

		so = VTOSO(vp);
		if (so->so_version == SOV_STREAM) {
			releasef(sock);
			*errorp = ENOTSOCK;
			eprintsoline(so, *errorp);
			return (NULL);
		}
	} else {
		so = VTOSO(vp);
	}
	if (fpp)
		*fpp = fp;
	return (so);
}

/*
 * Allocate and copyin a sockaddr.
 * Ensures NUL termination for AF_UNIX addresses by extending them
 * with one NUL byte if need be. Verifies that the length is not
 * excessive to prevent an application from consuming all of kernel
 * memory. Returns NULL when an error occurs.
 */
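/*
 * Example of the AF_UNIX extension (hypothetical values): a caller
 * passing sun_path = "/tmp/s" with a namelen that covers exactly those
 * six path bytes and no trailing NUL gets one NUL byte appended, and
 * *namelenp grows by one to match.
 */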
static struct sockaddr *
copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
	    int *errorp)
{
	char	*faddr;
	size_t	namelen = (size_t)*namelenp;

	ASSERT(namelen != 0);
	if (namelen > SO_MAXARGSIZE) {
		*errorp = EINVAL;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
	if (copyin(name, faddr, namelen)) {
		kmem_free(faddr, namelen);
		*errorp = EFAULT;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	/*
	 * Add space for NULL termination if needed.
	 * Do a quick check if the last byte is NUL.
	 */
	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
		/* Check if there is any NULL termination */
		size_t	i;
		int foundnull = 0;

		for (i = sizeof (name->sa_family); i < namelen; i++) {
			if (faddr[i] == '\0') {
				foundnull = 1;
				break;
			}
		}
		if (!foundnull) {
			/* Add extra byte for NUL padding */
			char *nfaddr;

			nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
			bcopy(faddr, nfaddr, namelen);
			kmem_free(faddr, namelen);

			/* NUL terminate */
			nfaddr[namelen] = '\0';
			namelen++;
			ASSERT((socklen_t)namelen == namelen);
			*namelenp = (socklen_t)namelen;
			faddr = nfaddr;
		}
	}
	return ((struct sockaddr *)faddr);
}

/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 */
static int
copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
		void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen > klen)
			ulen = klen;

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		}
	} else
		ulen = 0;

	if (ulenp != NULL) {
		if (copyout(&ulen, ulenp, sizeof (ulen)))
			return (EFAULT);
	}
	return (0);
}

/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 * If klen is greater than ulen it still uses the non-truncated
 * klen to update ulenp.
 */
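/*
 * For example (hypothetical lengths): with klen == 16 and ulen == 8,
 * only 8 bytes are copied out to uaddr, but *ulenp is set to 16 so the
 * application can detect the truncation, per the XNET semantics noted
 * in the printf below.
 */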
static int
copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
		void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen >= klen)
			ulen = klen;
		else if (ulen != 0 && xnet_truncate_print) {
			printf("sockfs: truncating copyout of address using "
			    "XNET semantics for pid = %d. Lengths %d, %d\n",
			    curproc->p_pid, klen, ulen);
		}

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		} else
			klen = 0;
	} else
		klen = 0;

	if (ulenp != NULL) {
		if (copyout(&klen, ulenp, sizeof (klen)))
			return (EFAULT);
	}
	return (0);
}

/*
 * The socketpair() code in libsocket creates two sockets (using
 * the /etc/netconfig fallback if needed) before calling this routine
 * to connect the two sockets together.
 *
 * For a SOCK_STREAM socketpair a listener is needed - in that case this
 * routine will create a new file descriptor as part of accepting the
 * connection. The library socketpair() will check if sv[] has changed,
 * in which case it will close the changed fd.
 *
 * Note that this code could use the TPI feature of accepting the connection
 * on the listening endpoint. However, that would require significant changes
 * to soaccept.
 */
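/*
 * Sketch of the SOCK_STREAM case handled below, with so1/so2 the two
 * pre-created sockets from sv[0]/sv[1]:
 *
 *	bind so1 (unspecified address, listening)
 *	bind so2 (unspecified address)
 *	connect so2 to so1, nonblocking
 *	nso = accept on so1
 *	wait until so2 reaches SS_CONNECTED
 *	sv[0] = new fd for nso; the library closes the original sv[0]
 */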
int
so_socketpair(int sv[2])
{
	int svs[2];
	struct sonode *so1, *so2;
	int error;
	int orig_flags;
	struct sockaddr_ux *name;
	size_t namelen;
	sotpi_info_t *sti1;
	sotpi_info_t *sti2;

	dprint(1, ("so_socketpair(%p)\n", (void *)sv));

	error = useracc(sv, sizeof (svs), B_WRITE);
	if (error && do_useracc)
		return (set_errno(EFAULT));

	if (copyin(sv, svs, sizeof (svs)))
		return (set_errno(EFAULT));

	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
		return (set_errno(error));

	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
		releasef(svs[0]);
		return (set_errno(error));
	}

	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
		error = EOPNOTSUPP;
		goto done;
	}

	sti1 = SOTOTPI(so1);
	sti2 = SOTOTPI(so2);

	/*
	 * The code below makes assumptions about the "sockfs" implementation.
	 * So make sure that the correct implementation is really used.
	 */
	ASSERT(so1->so_ops == &sotpi_sonodeops);
	ASSERT(so2->so_ops == &sotpi_sonodeops);

	if (so1->so_type == SOCK_DGRAM) {
		/*
		 * Bind both sockets and connect them with each other.
		 * Need to allocate name/namelen for soconnect.
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti2->sti_ux_laddr;
		error = socket_connect(so1,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		if (error) {
			kmem_free(name, namelen);
			eprintsoline(so1, error);
			goto done;
		}
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		releasef(svs[0]);
		releasef(svs[1]);
	} else {
		/*
		 * Bind both sockets, with so1 being a listener.
		 * Connect so2 to so1 - nonblocking to avoid waiting for
		 * soaccept to complete.
		 * Accept a connection on so1. Pass out the new fd as sv[0].
		 * The library will detect the changed fd and close
		 * the original one.
		 */
		struct sonode *nso;
		struct vnode *nvp;
		struct file *nfp;
		int nfd;

		/*
		 * We could simply call socket_listen() here (which would do the
		 * binding automatically) if the code didn't rely on passing
		 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
		    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
		    CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}

		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			if (error != EINPROGRESS) {
				eprintsoline(so2, error);
				goto done;
			}
		}

		error = socket_accept(so1, 0, CRED(), &nso);
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}

		/* Wait for so2 to reach SS_CONNECTED, ignoring signals */
		mutex_enter(&so2->so_lock);
		error = sowaitconnected(so2, 0, 1);
		mutex_exit(&so2->so_lock);
		if (error != 0) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(so2, error);
			goto done;
		}

		nvp = SOTOV(nso);
		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(nso, error);
			goto done;
		}
		/*
		 * Copy over the FNONBLOCK and FNDELAY flags if they are set.
		 */
		if (so1->so_state & SS_NONBLOCK)
			nfp->f_flag |= FNONBLOCK;
		if (so1->so_state & SS_NDELAY)
			nfp->f_flag |= FNDELAY;

		/*
		 * fill in the entries that falloc reserved
		 */
		mutex_exit(&nfp->f_tlock);
		setf(nfd, nfp);

		/*
		 * get the original flags before we release
		 */
		VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);

		releasef(svs[0]);
		releasef(svs[1]);

		/*
		 * If FD_CLOEXEC was set on the file descriptor we're
		 * swapping out, we should set it on the new one too.
		 */
		if (orig_flags & FD_CLOEXEC) {
			f_setfd(nfd, FD_CLOEXEC);
		}

		/*
		 * The socketpair library routine will close the original
		 * svs[0] when this code passes out a different file
		 * descriptor.
		 */
		svs[0] = nfd;

		if (copyout(svs, sv, sizeof (svs))) {
			(void) closeandsetf(nfd, NULL);
			eprintline(EFAULT);
			return (set_errno(EFAULT));
		}
	}
	return (0);

done:
	releasef(svs[0]);
	releasef(svs[1]);
	return (set_errno(error));
}

int
bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("bind(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	/*
	 * X/Open test does not expect EFAULT with NULL name and non-zero
	 * namelen.
	 */
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else {
		name = NULL;
		namelen = 0;
	}

	switch (version) {
	default:
		error = socket_bind(so, name, namelen, 0, CRED());
		break;
	case SOV_XPG4_2:
		error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
		break;
	case SOV_SOCKBSD:
		error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
		break;
	}
done:
	releasef(sock);
	if (name != NULL)
		kmem_free(name, (size_t)namelen);

	if (error)
		return (set_errno(error));
	return (0);
}

/* ARGSUSED2 */
int
listen(int sock, int backlog, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("listen(%d, %d)\n",
	    sock, backlog));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_listen(so, backlog, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED3*/
int
accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
    int flags)
{
	struct sonode *so;
	file_t *fp;
	int error;
	socklen_t namelen;
	struct sonode *nso;
	struct vnode *nvp;
	struct file *nfp;
	int nfd;
	int ssflags;
	struct sockaddr *addrp;
	socklen_t addrlen;

	dprint(1, ("accept(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
		return (set_errno(EINVAL));
	}

	/* Translate SOCK_ flags to their SS_ variant */
	ssflags = 0;
	if (flags & SOCK_NONBLOCK)
		ssflags |= SS_NONBLOCK;
	if (flags & SOCK_NDELAY)
		ssflags |= SS_NDELAY;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	if (name != NULL) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(namelenp, &namelen, sizeof (namelen))) {
			releasef(sock);
			return (set_errno(EFAULT));
		}
		if (namelen != 0) {
			error = useracc(name, (size_t)namelen, B_WRITE);
			if (error && do_useracc) {
				releasef(sock);
				return (set_errno(EFAULT));
			}
		} else
			name = NULL;
	} else {
		namelen = 0;
	}

	/*
	 * Allocate the user fd before socket_accept() so that EMFILE
	 * is caught before a connection is accepted.
	 */
	if ((nfd = ufalloc(0)) == -1) {
		eprintsoline(so, EMFILE);
		releasef(sock);
		return (set_errno(EMFILE));
	}
	error = socket_accept(so, fp->f_flag, CRED(), &nso);
	if (error) {
		setf(nfd, NULL);
		releasef(sock);
		return (set_errno(error));
	}

	nvp = SOTOV(nso);

	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
	if (namelen != 0) {
		addrlen = so->so_max_addr_len;
		addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);

		if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
		    &addrlen, B_TRUE, CRED())) == 0) {
			error = copyout_name(name, namelen, namelenp,
			    addrp, addrlen);
		} else {
			ASSERT(error == EINVAL || error == ENOTCONN);
			error = ECONNABORTED;
		}
		kmem_free(addrp, so->so_max_addr_len);
	}

	if (error) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		releasef(sock);
		return (set_errno(error));
	}
	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		eprintsoline(so, error);
		releasef(sock);
		return (set_errno(error));
	}
	/*
	 * fill in the entries that falloc reserved
	 */
	nfp->f_vnode = nvp;
	mutex_exit(&nfp->f_tlock);
	setf(nfd, nfp);

	/*
	 * Act on SOCK_CLOEXEC from flags
	 */
	if (flags & SOCK_CLOEXEC) {
		f_setfd(nfd, FD_CLOEXEC);
	}

	/*
	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
	 * and from ssflags
	 */
	if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
		uint_t oflag = nfp->f_flag;
		int arg = 0;

		if ((ssflags | so->so_state) & SS_NONBLOCK)
			arg |= FNONBLOCK;
		else if ((ssflags | so->so_state) & SS_NDELAY)
			arg |= FNDELAY;

		/*
		 * This code is a simplification of the F_SETFL code in
		 * fcntl(). Ignore any errors from VOP_SETFL.
		 */
		if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
		    != 0) {
			eprintsoline(so, error);
			error = 0;
		} else {
			mutex_enter(&nfp->f_tlock);
			nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
			nfp->f_flag |= arg;
			mutex_exit(&nfp->f_tlock);
		}
	}
	releasef(sock);
	return (nfd);
}

int
connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
{
	struct sonode *so;
	file_t *fp;
	int error;

	dprint(1, ("connect(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	if (namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else
		name = NULL;

	error = socket_connect(so, name, namelen, fp->f_flag,
	    (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
	releasef(sock);
	if (name)
		kmem_free(name, (size_t)namelen);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED2*/
int
shutdown(int sock, int how, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("shutdown(%d, %d)\n",
	    sock, how));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_shutdown(so, how, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

828
829/*
830 * Common receive routine.
831 */
832static ssize_t
833recvit(int sock,
834	struct nmsghdr *msg,
835	struct uio *uiop,
836	int flags,
837	socklen_t *namelenp,
838	socklen_t *controllenp,
839	int *flagsp)
840{
841	struct sonode *so;
842	file_t *fp;
843	void *name;
844	socklen_t namelen;
845	void *control;
846	socklen_t controllen;
847	ssize_t len;
848	int error;
849
850	if ((so = getsonode(sock, &error, &fp)) == NULL)
851		return (set_errno(error));
852
853	len = uiop->uio_resid;
854	uiop->uio_fmode = fp->f_flag;
855	uiop->uio_extflg = UIO_COPY_CACHED;
856
857	name = msg->msg_name;
858	namelen = msg->msg_namelen;
859	control = msg->msg_control;
860	controllen = msg->msg_controllen;
861
862	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
863	    MSG_DONTWAIT | MSG_XPG4_2);
864
865	error = socket_recvmsg(so, msg, uiop, CRED());
866	if (error) {
867		releasef(sock);
868		return (set_errno(error));
869	}
870	lwp_stat_update(LWP_STAT_MSGRCV, 1);
871	releasef(sock);
872
873	error = copyout_name(name, namelen, namelenp,
874	    msg->msg_name, msg->msg_namelen);
875	if (error)
876		goto err;
877
878	if (flagsp != NULL) {
879		/*
880		 * Clear internal flag.
881		 */
882		msg->msg_flags &= ~MSG_XPG4_2;
883
884		/*
885		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
886		 * when controllen is zero and there is control data to
887		 * copy out.
888		 */
889		if (controllen != 0 &&
890		    (msg->msg_controllen > controllen || control == NULL)) {
891			dprint(1, ("recvit: CTRUNC %d %d %p\n",
892			    msg->msg_controllen, controllen, control));
893
894			msg->msg_flags |= MSG_CTRUNC;
895		}
896		if (copyout(&msg->msg_flags, flagsp,
897		    sizeof (msg->msg_flags))) {
898			error = EFAULT;
899			goto err;
900		}
901	}
902	/*
903	 * Note: This MUST be done last. There can be no "goto err" after this
904	 * point since it could make so_closefds run twice on some part
905	 * of the file descriptor array.
906	 */
907	if (controllen != 0) {
908		if (!(flags & MSG_XPG4_2)) {
909			/*
910			 * Good old msg_accrights can only return a multiple
911			 * of 4 bytes.
912			 */
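			/*
			 * E.g. (illustration) a controllen of 7 is rounded
			 * down to 4 by the mask below, since
			 * ~(sizeof (uint32_t) - 1) clears the low two bits.
			 */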
			controllen &= ~((int)sizeof (uint32_t) - 1);
		}
		error = copyout_arg(control, controllen, controllenp,
		    msg->msg_control, msg->msg_controllen);
		if (error)
			goto err;

		if (msg->msg_controllen > controllen || control == NULL) {
			if (control == NULL)
				controllen = 0;
			so_closefds(msg->msg_control, msg->msg_controllen,
			    !(flags & MSG_XPG4_2), controllen);
		}
	}
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (len - uiop->uio_resid);

err:
	/*
	 * If we fail and the control part contains file descriptors
	 * we have to close the fd's.
	 */
	if (msg->msg_controllen != 0)
		so_closefds(msg->msg_control, msg->msg_controllen,
		    !(flags & MSG_XPG4_2), 0);
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (set_errno(error));
}

/*
 * Native system call
 */
ssize_t
recv(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recv(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_namelen = 0;
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;
	return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
}

ssize_t
recvfrom(int sock, void *buffer, size_t len, int flags,
	struct sockaddr *name, socklen_t *namelenp)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
	    sock, buffer, len, flags, (void *)name, (void *)namelenp));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	if (namelenp != NULL) {
		if (copyin(namelenp, &lmsg.msg_namelen,
		    sizeof (lmsg.msg_namelen)))
			return (set_errno(EFAULT));
	} else {
		lmsg.msg_namelen = 0;
	}
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;

	return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
}

/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 */
ssize_t
recvmsg(int sock, struct nmsghdr *msg, int flags)
{
	STRUCT_DECL(nmsghdr, u_lmsg);
	STRUCT_HANDLE(nmsghdr, umsgptr);
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	int *flagsp;
	model_t	model;

	dprint(1, ("recvmsg(%d, %p, %d)\n",
	    sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);
	STRUCT_SET_HANDLE(umsgptr, model, msg);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		STRUCT_FSET(u_lmsg, msg_flags, 0);
		flagsp = NULL;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	if (lmsg.msg_control != NULL &&
	    (do_useracc == 0 ||
	    useracc(lmsg.msg_control, lmsg.msg_controllen,
	    B_WRITE) != 0)) {
		return (set_errno(EFAULT));
	}

	return (recvit(sock, &lmsg, &auio, flags,
	    STRUCT_FADDR(umsgptr, msg_namelen),
	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
}

/*
 * Common send function.
 */
static ssize_t
sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	uiop->uio_fmode = fp->f_flag;

	if (so->so_family == AF_UNIX)
		uiop->uio_extflg = UIO_COPY_CACHED;
	else
		uiop->uio_extflg = UIO_COPY_DEFAULT;

	/* Allocate and copyin name and control */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so,
		    (struct sockaddr *)name,
		    &namelen, &error);
		if (name == NULL)
			goto done3;
		/* copyin_name null terminates addresses for AF_UNIX */
		msg->msg_namelen = namelen;
		msg->msg_name = name;
	} else {
		msg->msg_name = name = NULL;
		msg->msg_namelen = namelen = 0;
	}

	control = msg->msg_control;
	controllen = msg->msg_controllen;
	if ((control != NULL) && (controllen != 0)) {
		/*
		 * Verify that the length is not excessive to prevent
		 * an application from consuming all of kernel memory.
		 */
		if (controllen > SO_MAXARGSIZE) {
			error = EINVAL;
			goto done2;
		}
		control = kmem_alloc(controllen, KM_SLEEP);

		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(msg->msg_control, control, controllen)) {
			error = EFAULT;
			goto done1;
		}
		msg->msg_control = control;
	} else {
		msg->msg_control = control = NULL;
		msg->msg_controllen = controllen = 0;
	}

	len = uiop->uio_resid;
	msg->msg_flags = flags;

	error = socket_sendmsg(so, msg, uiop, CRED());
done1:
	if (control != NULL)
		kmem_free(control, controllen);
done2:
	if (name != NULL)
		kmem_free(name, namelen);
done3:
	if (error != 0) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGSND, 1);
	releasef(sock);
	return (len - uiop->uio_resid);
}

/*
 * Native system call
 */
ssize_t
send(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("send(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = NULL;
	lmsg.msg_control = NULL;
	if (!(flags & MSG_XPG4_2)) {
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}
	return (sendit(sock, &lmsg, &auio, flags));
}

/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 */
ssize_t
sendmsg(int sock, struct nmsghdr *msg, int flags)
{
	struct nmsghdr lmsg;
	STRUCT_DECL(nmsghdr, u_lmsg);
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	model_t	model;

	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		/*
		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
		 * be compatible with SunOS 4.X and 4.4BSD.
		 */
		if (iovcnt != 0 || (flags & MSG_XPG4_2))
			return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (iovcnt != 0 &&
		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (iovcnt != 0 &&
	    copyin(lmsg.msg_iov, aiov,
	    (unsigned)iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	return (sendit(sock, &lmsg, &auio, flags));
}

ssize_t
sendto(int sock, void *buffer, size_t len, int flags,
    struct sockaddr *name, socklen_t namelen)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
	    sock, buffer, len, flags, (void *)name, namelen));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	lmsg.msg_namelen = namelen;
	lmsg.msg_control = NULL;
	if (!(flags & MSG_XPG4_2)) {
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}
	return (sendit(sock, &lmsg, &auio, flags));
}

/*ARGSUSED3*/
int
getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen;
	socklen_t sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getpeername(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}
	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);

	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
	    B_FALSE, CRED())) == 0) {
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}

/*ARGSUSED3*/
int
getsockname(int sock, struct sockaddr *name,
		socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen, sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getsockname(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}

	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
	    CRED())) == 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}

/*ARGSUSED5*/
int
getsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t *option_lenp,
	int version)
{
	struct sonode *so;
	socklen_t optlen, optlen_res;
	void *optval;
	int error;

	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
	    sock, level, option_name, option_value, (void *)option_lenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
		releasef(sock);
		return (set_errno(EFAULT));
	}
	/*
	 * Verify that the length is not excessive to prevent
	 * an application from consuming all of kernel memory.
	 */
	if (optlen > SO_MAXARGSIZE) {
		error = EINVAL;
		releasef(sock);
		return (set_errno(error));
	}
	optval = kmem_alloc(optlen, KM_SLEEP);
	optlen_res = optlen;
	error = socket_getsockopt(so, level, option_name, optval,
	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
	    CRED());
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED5*/
int
setsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t option_len,
	int version)
{
	struct sonode *so;
	intptr_t buffer[2];
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
	    sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				goto done2;
			}
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = socket_setsockopt(so, level, option_name, optval,
	    (t_uscalar_t)option_len, CRED());
done1:
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

static int
sockconf_add_sock(int family, int type, int protocol, char *name)
{
	int error = 0;
	char *kdevpath = NULL;
	char *kmodule = NULL;
	char *buf = NULL;
	size_t pathlen = 0;
	struct sockparams *sp;

	if (name == NULL)
		return (EINVAL);
	/*
	 * Copyin the name.
	 * This also makes it possible to check for overly long pathnames.
	 * Compress the space needed for the name before passing it
	 * to soconfig - soconfig will store the string until
	 * the configuration is removed.
	 */
	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
		kmem_free(buf, MAXPATHLEN);
		return (error);
	}
	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
		/* For device */

		/*
		 * Special handling for NCA:
		 *
		 * DEV_NCA is never opened even if an application
		 * requests for AF_NCA. The device opened is instead a
		 * predefined AF_INET transport (NCA_INET_DEV).
		 *
		 * Prior to Volo (PSARC/2007/587) NCA would determine
		 * the device using a lookup, which worked then because
		 * all protocols were based on TPI. Since TPI is no
		 * longer the default, we have to explicitly state
		 * which device to use.
		 */
		if (strcmp(buf, NCA_DEV) == 0) {
			/* only support entry <28, 2, 0> */
			if (family != AF_NCA || type != SOCK_STREAM ||
			    protocol != 0) {
				kmem_free(buf, MAXPATHLEN);
				return (EINVAL);
			}

			pathlen = strlen(NCA_INET_DEV) + 1;
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(NCA_INET_DEV, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		} else {
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(buf, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		}
	} else {
		/* For socket module */
		kmodule = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kmodule, pathlen);
		kmodule[pathlen - 1] = '\0';
		pathlen = 0;
	}
	kmem_free(buf, MAXPATHLEN);

	/* sockparams_create frees mod name and devpath upon failure */
	sp = sockparams_create(family, type, protocol, kmodule,
	    kdevpath, pathlen, 0, KM_SLEEP, &error);
	if (sp != NULL) {
		error = sockparams_add(sp);
		if (error != 0)
			sockparams_destroy(sp);
	}

	return (error);
}

static int
sockconf_remove_sock(int family, int type, int protocol)
{
	return (sockparams_delete(family, type, protocol));
}

static int
sockconfig_remove_filter(const char *uname)
{
	char kname[SOF_MAXNAMELEN];
	size_t len;
	int error;
	sof_entry_t *ent;

	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
		return (error);

	ent = sof_entry_remove_by_name(kname);
	if (ent == NULL)
		return (ENXIO);

	mutex_enter(&ent->sofe_lock);
	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
	if (ent->sofe_refcnt == 0) {
		mutex_exit(&ent->sofe_lock);
		sof_entry_free(ent);
	} else {
		/* let the last socket free the filter */
		ent->sofe_flags |= SOFEF_CONDEMED;
		mutex_exit(&ent->sofe_lock);
	}

	return (0);
}

static int
sockconfig_add_filter(const char *uname, void *ufilpropp)
{
	struct sockconfig_filter_props filprop;
	sof_entry_t *ent;
	int error;
	size_t tuplesz, len;
	char hintbuf[SOF_MAXNAMELEN];

	ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
	mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
	    &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		struct sockconfig_filter_props32 filprop32;

		if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
		filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
		filprop.sfp_autoattach = filprop32.sfp_autoattach;
		filprop.sfp_hint = filprop32.sfp_hint;
		filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
		filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
		filprop.sfp_socktuple =
		    (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
	}
#endif	/* _SYSCALL32_IMPL */

	if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
	    sizeof (ent->sofe_modname), &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	/*
	 * A filter must specify at least one socket tuple.
	 */
	if (filprop.sfp_socktuple_cnt == 0 ||
	    filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
		sof_entry_free(ent);
		return (EINVAL);
	}
	ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
	ent->sofe_hint = filprop.sfp_hint;

	/*
	 * Verify the hint, and copy in the hint argument, if necessary.
	 */
	switch (ent->sofe_hint) {
	case SOF_HINT_BEFORE:
	case SOF_HINT_AFTER:
		if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
		    sizeof (hintbuf), &len)) != 0) {
			sof_entry_free(ent);
			return (error);
		}
		ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
		bcopy(hintbuf, ent->sofe_hintarg, len);
		/* FALLTHRU */
	case SOF_HINT_TOP:
	case SOF_HINT_BOTTOM:
		/* hints cannot be used with programmatic filters */
		if (ent->sofe_flags & SOFEF_PROG) {
			sof_entry_free(ent);
			return (EINVAL);
		}
		break;
	case SOF_HINT_NONE:
		break;
	default:
		/* bad hint value */
		sof_entry_free(ent);
		return (EINVAL);
	}

	ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
	tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
	ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
		    tuplesz)) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		int i;
		caddr_t data = (caddr_t)filprop.sfp_socktuple;
		sof_socktuple_t	*tup = ent->sofe_socktuple;
		sof_socktuple32_t tup32;

		tup = ent->sofe_socktuple;
		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
			ASSERT(tup < ent->sofe_socktuple + tuplesz);

			if (copyin(data, &tup32, sizeof (tup32)) != 0) {
				sof_entry_free(ent);
				return (EFAULT);
			}
			tup->sofst_family = tup32.sofst_family;
			tup->sofst_type = tup32.sofst_type;
			tup->sofst_protocol = tup32.sofst_protocol;

			data += sizeof (tup32);
		}
	}
#endif	/* _SYSCALL32_IMPL */

	/* Sockets can start using the filter as soon as the filter is added */
	if ((error = sof_entry_add(ent)) != 0)
		sof_entry_free(ent);

	return (error);
}

/*
 * Socket configuration system call. It is used to add and remove
 * socket types.
 */
int
sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	int error = 0;

	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
		return (set_errno(EPERM));

	if (sockfs_defer_nl7c_init) {
		nl7c_init();
		sockfs_defer_nl7c_init = 0;
	}

	switch (cmd) {
	case SOCKCONFIG_ADD_SOCK:
		error = sockconf_add_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
		break;
	case SOCKCONFIG_REMOVE_SOCK:
		error = sockconf_remove_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
		break;
	case SOCKCONFIG_ADD_FILTER:
		error = sockconfig_add_filter((const char *)arg1, arg2);
		break;
	case SOCKCONFIG_REMOVE_FILTER:
		error = sockconfig_remove_filter((const char *)arg1);
		break;
	case SOCKCONFIG_GET_SOCKTABLE:
		error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
		break;
	default:
#ifdef	DEBUG
		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
#endif
		error = EINVAL;
		break;
	}

	if (error != 0) {
		eprintline(error);
		return (set_errno(error));
	}
	return (0);
}


1887/*
1888 * Sendfile is implemented through two schemes, direct I/O or by
1889 * caching in the filesystem page cache. We cache the input file by
1890 * default and use direct I/O only if sendfile_max_size is set
1891 * appropriately as explained below. Note that this logic is consistent
1892 * with other filesystems where caching is turned on by default
1893 * unless explicitly turned off by using the DIRECTIO ioctl.
1894 *
1895 * We choose a slightly different scheme here. One can turn off
1896 * caching by setting sendfile_max_size to 0. One can also enable
1897 * caching of files <= sendfile_max_size by setting sendfile_max_size
1898 * to an appropriate value. By default sendfile_max_size is set to the
1899 * maximum value so that all files are cached. In future, we may provide
1900 * better interfaces for caching the file.
1901 *
1902 * Sendfile through Direct I/O (Zero copy)
1903 * --------------------------------------
1904 *
1905 * As disks are normally slower than the network, we can't have a
1906 * single thread that reads the disk and writes to the network. We
1907 * need to have parallelism. This is done by having the sendfile
1908 * thread create another thread that reads from the filesystem
1909 * and queues it for network processing. In this scheme, the data
1910 * is never copied anywhere i.e it is zero copy unlike the other
1911 * scheme.
1912 *
1913 * We have a sendfile queue (snfq) where each sendfile
1914 * request (snf_req_t) is queued for processing by a thread. Number
1915 * of threads is dynamically allocated and they exit if they are idling
1916 * beyond a specified amount of time. When each request (snf_req_t) is
1917 * processed by a thread, it produces a number of mblk_t structures to
1918 * be consumed by the sendfile thread. snf_deque and snf_enque are
1919 * used for consuming and producing mblks. Size of the filesystem
1920 * read is determined by the tunable (sendfile_read_size). A single
1921 * mblk holds sendfile_read_size worth of data (except the last
1922 * read of the file) which is sent down as a whole to the network.
1923 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1924 * value for the UFS filesystem backed by a striped storage array.
1925 *
1926 * Synchronisation between read (producer) and write (consumer) threads.
1927 * --------------------------------------------------------------------
1928 *
1929 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1930 * adding and deleting items in this list. Error can happen anytime
1931 * during read or write. There could be unprocessed mblks in the
1932 * sr_ib_XXX list when a read or write error occurs. Whenever error
1933 * is encountered, we need two things to happen :
1934 *
1935 * a) One of the threads need to clean the mblks.
1936 * b) When one thread encounters an error, the other should stop.
1937 *
1938 * For (a), we don't want to penalize the reader thread as it could do
1939 * some useful work processing other requests. For (b), the error can
1940 * be detected by examining sr_read_error or sr_write_error.
1941 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1942 * writer encounters error, we need to report the write error back to
1943 * the application as that's what would have happened if the operations
1944 * were done sequentially. With this in mind, following should work :
1945 *
1946 * 	- Check for errors before read or write.
1947 *	- If the reader encounters error, set the error in sr_read_error.
1948 *	  Check sr_write_error, if it is set, send cv_signal as it is
1949 *	  waiting for reader to complete. If it is not set, the writer
1950 *	  is either running sinking data to the network or blocked
1951 *        because of flow control. For handling the latter case, we
1952 *	  always send a signal. In any case, it will examine sr_read_error
1953 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
1954 *	  the writer that the reader is done in all the cases.
1955 *	- If the writer encounters error, set the error in sr_write_error.
1956 *	  The reader thread is either blocked because of flow control or
1957 *	  running reading data from the disk. For the former, we need to
1958 *	  wakeup the thread. Again to keep it simple, we always wake up
1959 *	  the reader thread. Then, wait for the read thread to complete
1960 *	  if it is not done yet. Cleanup and return.
1961 *
 * High and low water marks for the read thread.
 * --------------------------------------------
 *
 * If sendfile() is used to send data over a slow network, we need to
 * make sure that the read thread does not produce data at a faster
 * rate than the network can consume it. This can happen if the disk
 * is faster than the network, in which case we don't want to build a
 * very large queue. But we would still like to get all of the network
 * throughput possible, which implies that the network should never
 * block waiting for data. As there are many possible combinations of
 * disk and network throughput, it is difficult to come up with an
 * accurate number. A typical 10K RPM disk has a maximum seek latency
 * of 17ms and a rotational latency of 3ms for reading a disk block,
 * so the total latency to initiate a new read, transfer data from the
 * disk and queue it for transmission would be at most about 25ms.
 * Today's maximum network transfer rate is on the order of 100MB/sec.
 * If the read thread is blocked because of flow control, it would take
 * 25ms to get new data ready for transmission. To keep the network
 * busy while we are initiating new transfers, we would therefore need
 * 2.5MB of data queued at 100MB/sec. Rounding up, we set the low water
 * mark to 3MB of data. The high water mark must be picked so that the
 * woken-up thread does considerable work before blocking again, to
 * prevent thrashing. Currently, we pick it to be 10 times the low
 * water mark.
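 *
 * In short, with the (tunable) defaults below:
 *
 *	lowat ~= bandwidth * refill latency = 100MB/s * 25ms = 2.5MB,
 *	         rounded up to 3MB (sendfile_req_lowat)
 *	hiwat  = 10 * lowat = 30MB (sendfile_req_hiwat)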
 *
 * Sendfile with segmap caching (One copy from page cache to mblks).
 * ----------------------------------------------------------------
 *
 * We use the segmap cache for caching the file if the size of the file
 * is <= sendfile_max_size. In this case we don't use threads, as VM is
 * fast enough to keep up with the network. If the underlying transport
 * allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth of
 * data into segmap space, and use the virtual address from segmap
 * directly through desballoca() to avoid a copy. Once the transport is
 * done with the data, the mapping will be released through
 * segmap_release(), called by the call-back routine.
 *
 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
 * to copy the data from the filesystem into our temporary network buffer.
 *
 * To disable caching, set sendfile_max_size to 0.
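 * For example, via /etc/system (assuming the standard tunable syntax
 * and that this file is built into the sockfs module):
 *
 *	set sockfs:sendfile_max_size = 0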
 */

uint_t sendfile_read_size = 1024 * 1024;
#define	SENDFILE_REQ_LOWAT	(3 * 1024 * 1024)
uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
struct sendfile_stats sf_stats;
struct sendfile_queue *snfq;
clock_t snfq_timeout;
off64_t sendfile_max_size;

static void snf_enque(snf_req_t *, mblk_t *);
static mblk_t *snf_deque(snf_req_t *);

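/*
 * One-time initialization of the sendfile request queue and its
 * tunables.
 */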
void
sendfile_init(void)
{
	snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);

	mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
	snfq->snfq_max_threads = max_ncpus;
	snfq_timeout = SNFQ_TIMEOUT;
	/* Cache all files by default. */
	sendfile_max_size = MAXOFFSET_T;
}

/*
 * Queues an mblk_t for the writer (network) thread to consume,
 * blocking while the queue is above the high water mark.
 */
static void
snf_enque(snf_req_t *sr, mblk_t *mp)
{
	mp->b_next = NULL;
	mutex_enter(&sr->sr_lock);
	if (sr->sr_mp_head == NULL) {
		sr->sr_mp_head = sr->sr_mp_tail = mp;
		cv_signal(&sr->sr_cv);
	} else {
		sr->sr_mp_tail->b_next = mp;
		sr->sr_mp_tail = mp;
	}
	sr->sr_qlen += MBLKL(mp);
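	/*
	 * Flow control: block the reader while the queue is above the
	 * high water mark; snf_deque() signals sr_cv once the queue
	 * drains below the low water mark, and snf_direct_io() signals
	 * it when a write error is flagged.
	 */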
	while ((sr->sr_qlen > sr->sr_hiwat) &&
	    (sr->sr_write_error == 0)) {
		sf_stats.ss_full_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	mutex_exit(&sr->sr_lock);
}

/*
 * Dequeues an mblk_t for network processing; returns NULL once the
 * reader has failed, or has finished and the queue is empty.
 */
static mblk_t *
snf_deque(snf_req_t *sr)
{
	mblk_t *mp;

	mutex_enter(&sr->sr_lock);
	/*
	 * If we have encountered an error on read, or the read has
	 * completed and there are no more mblks, return NULL.
	 * We also need to check for a NULL sr_mp_head, as the reads
	 * could have completed with nothing more to come.
	 */
	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
	    ((sr->sr_read_error & SR_READ_DONE) &&
	    sr->sr_mp_head == NULL)) {
		mutex_exit(&sr->sr_lock);
		return (NULL);
	}
	/*
	 * To start with, neither SR_READ_DONE is marked nor is any
	 * error set. When we wake up from cv_wait, the following are
	 * the possibilities:
	 *
	 *	a) sr_read_error is zero and mblks are queued.
	 *	b) sr_read_error is set to SR_READ_DONE
	 *	   and mblks are queued.
	 *	c) sr_read_error is set to SR_READ_DONE
	 *	   and no mblks.
	 *	d) sr_read_error is set to some error other
	 *	   than SR_READ_DONE.
	 */

	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
		sf_stats.ss_empty_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* Handle (a) and (b) first - the normal case. */
	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
	    (sr->sr_mp_head != NULL)) {
		mp = sr->sr_mp_head;
		sr->sr_mp_head = mp->b_next;
		sr->sr_qlen -= MBLKL(mp);
		if (sr->sr_qlen < sr->sr_lowat)
			cv_signal(&sr->sr_cv);
		mutex_exit(&sr->sr_lock);
		mp->b_next = NULL;
		return (mp);
	}
	/* Handle (c) and (d). */
	mutex_exit(&sr->sr_lock);
	return (NULL);
}

/*
 * Reads data from the filesystem and queues it for network processing.
 */
void
snf_async_read(snf_req_t *sr)
{
	size_t iosize;
	u_offset_t fileoff;
	u_offset_t size;
	int ret_size;
	int error = 0;
	file_t *fp;
	mblk_t *mp;
	struct vnode *vp;
	int extra = 0;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;

	fp = sr->sr_fp;
	size = sr->sr_file_size;
	fileoff = sr->sr_file_off;

	/*
	 * Ignore the error for filesystems that don't support DIRECTIO.
	 */
	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
	    kcred, NULL, NULL);

	vp = sr->sr_vp;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}

	while ((size != 0) && (sr->sr_write_error == 0)) {

		iosize = (int)MIN(sr->sr_maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

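		/* Leave room for the transport to prepend its header. */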
		mp->b_rptr += wroff;

		ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);

		/* Error or reached EOF? */
		if ((error != 0) || (ret_size == 0)) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + ret_size;

		snf_enque(sr, mp);
		size -= ret_size;
		fileoff += ret_size;
	}
	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
	    kcred, NULL, NULL);
	mutex_enter(&sr->sr_lock);
	sr->sr_read_error = error;
	sr->sr_read_error |= SR_READ_DONE;
	cv_signal(&sr->sr_cv);
	mutex_exit(&sr->sr_lock);
}
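/*
 * Worker thread servicing the sendfile request queue: dequeues requests
 * and runs snf_async_read() on each; exits after being idle for
 * snfq_timeout.
 */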
void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	clock_t time_left = 1;

	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find an entry, then block until woken up
		 * again, and then look through the queue again.
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			if (time_left <= 0) {
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}
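/*
 * Allocates and queues an asynchronous read request, creating a new
 * service thread first if the existing ones are all busy.
 */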
snf_req_t *
create_thread(int operation, struct vnode *vp, file_t *fp,
    u_offset_t fileoff, u_offset_t size)
{
	snf_req_t *sr;
	stdata_t *stp;

	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);

	sr->sr_vp = vp;
	sr->sr_fp = fp;
	stp = vp->v_stream;

	/*
	 * Store sd_qn_maxpsz into sr_maxpsz while we still have the
	 * stream head; the stream might be closed before the thread
	 * returns from snf_async_read().
	 */
	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
	} else {
		sr->sr_maxpsz = MAXBSIZE;
	}

	sr->sr_operation = operation;
	sr->sr_file_off = fileoff;
	sr->sr_file_size = size;
	sr->sr_hiwat = sendfile_req_hiwat;
	sr->sr_lowat = sendfile_req_lowat;
	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread to service this
	 * request. If enough requests are already queued for the
	 * existing threads, create a new one, as long as we do not
	 * exceed snfq_max_threads.
	 */
	mutex_enter(&snfq->snfq_lock);
	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
		    TS_RUN, minclsyspri);
		snfq->snfq_svc_threads++;
	}
	if (snfq->snfq_req_head == NULL) {
		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
		cv_signal(&snfq->snfq_cv);
	} else {
		snfq->snfq_req_tail->sr_next = sr;
		snfq->snfq_req_tail = sr;
	}
	snfq->snfq_req_cnt++;
	mutex_exit(&snfq->snfq_lock);
	return (sr);
}

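/*
 * Direct-I/O sendfile path: hands the file read off to an asynchronous
 * reader thread (see create_thread()) and sinks the resulting mblks to
 * the socket from the calling thread.
 */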
int
snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
    ssize_t *count)
{
	snf_req_t *sr;
	mblk_t *mp;
	int iosize;
	int error = 0;
	short fflag;
	struct vnode *vp;
	int ksize;
	struct nmsghdr msg;

	ksize = 0;
	*count = 0;
	bzero(&msg, sizeof (msg));

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
		return (EAGAIN);

	/*
	 * We check for a read error in snf_deque(). It already has to
	 * check for a successful SR_READ_DONE and return NULL in that
	 * case, so we might as well make the additional error check
	 * there.
	 */
	while ((mp = snf_deque(sr)) != NULL) {

		if (ISSIG(curthread, JUSTLOOKING)) {
			freeb(mp);
			error = EINTR;
			break;
		}
		iosize = MBLKL(mp);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			if (mp != NULL)
				freeb(mp);
			break;
		}
		ksize += iosize;
	}
	*count = ksize;

	mutex_enter(&sr->sr_lock);
	sr->sr_write_error = error;
	/* See the big comment above on why we cv_signal here. */
	cv_signal(&sr->sr_cv);

	/* Always wait for the reader to complete. */
	while (!(sr->sr_read_error & SR_READ_DONE)) {
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* If there is no write error, check for a read error. */
	if (error == 0)
		error = (sr->sr_read_error & ~SR_READ_DONE);

	if (error != 0) {
		mblk_t *next_mp;

		mp = sr->sr_mp_head;
		while (mp != NULL) {
			next_mp = mp->b_next;
			mp->b_next = NULL;
			freeb(mp);
			mp = next_mp;
		}
	}
	mutex_exit(&sr->sr_lock);
	kmem_free(sr, sizeof (snf_req_t));
	return (error);
}

/* Maximum number of pages allocated by vpm for sendfile at a time */
#define	SNF_VPMMAXPGS	(VPMMAXPGS/2)

/*
 * Maximum number of elements in the list returned by vpm, including
 * NULL for the last entry
 */
#define	SNF_MAXVMAPS	(SNF_VPMMAXPGS + 1)

typedef struct {
	unsigned int	snfv_ref;	/* mblks still referencing the maps */
	frtn_t		snfv_frtn;	/* free routine for esballoca() */
	vnode_t		*snfv_vp;	/* held vnode of the sent file */
	struct vmap	snfv_vml[SNF_MAXVMAPS];	/* vpm mapping list */
} snf_vmap_desbinfo;

typedef struct {
	frtn_t		snfi_frtn;	/* free routine for esballoca() */
	caddr_t		snfi_base;	/* segmap mapping base address */
	uint_t		snfi_mapoff;	/* data offset into the mapping */
	size_t		snfi_len;	/* length of the softlocked pages */
	vnode_t		*snfi_vp;	/* held vnode of the sent file */
} snf_smap_desbinfo;

/*
 * The callback function used for vpm-mapped mblks, called when the last
 * reference to the mblk is dropped. This normally occurs when TCP receives
 * the ack, but it can also be the driver due to lazy reclaim.
 */
void
snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
{
	ASSERT(snfv->snfv_ref != 0);
	if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
		vpm_unmap_pages(snfv->snfv_vml, S_READ);
		VN_RELE(snfv->snfv_vp);
		kmem_free(snfv, sizeof (snf_vmap_desbinfo));
	}
}

/*
 * The callback function used for segmap'ped mblks, called when the last
 * reference to the mblk is dropped. This normally occurs when TCP receives
 * the ack, but it can also be the driver due to lazy reclaim.
 */
void
snf_smap_desbfree(snf_smap_desbinfo *snfi)
{
	if (!IS_KPM_ADDR(snfi->snfi_base)) {
		/*
		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
		 * segmap_kpm as long as the latter never falls back to
		 * "use_segmap_range". (See segmap_getmapflt().)
		 *
		 * Using S_OTHER saves a redundant hat_setref() in
		 * segmap_unlock().
		 */
		(void) segmap_fault(kas.a_hat, segkmap,
		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
		    F_SOFTUNLOCK, S_OTHER);
	}
	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
	VN_RELE(snfi->snfi_vp);
	kmem_free(snfi, sizeof (*snfi));
}

/*
 * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
 * When segmap is used, the mblk contains a segmap slot of no more
 * than MAXBSIZE.
 *
 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
 * in each iteration and sent by socket_sendmblk until an error occurs or
 * the requested size has been transferred. An mblk is esballoca'ed from
 * each mapped page and a chain of these mblks is sent to the transport
 * layer. vpm will be called to unmap the pages when all mblks have been
 * freed by free_func.
 *
 * At the end of the whole sendfile() operation, we wait until the data from
 * the last mblk is ack'ed by the transport before returning, so that the
 * caller of sendfile() can safely modify the file content.
 */
int
snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
    ssize_t *count, boolean_t nowait)
{
	caddr_t base;
	int mapoff;
	vnode_t *vp;
	mblk_t *mp = NULL;
	int chain_size;
	int error;
	clock_t deadlk_wait;
	short fflag;
	int ksize;
	struct vattr va;
	boolean_t dowait = B_FALSE;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	ksize = 0;
	bzero(&msg, sizeof (msg));

	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}

		if (vpm_enable) {
			snf_vmap_desbinfo *snfv;
			mblk_t *nmp;
			int mblk_size;
			int maxsize;
			int i;

			mapoff = fileoff & PAGEOFFSET;
			maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);

			snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
			    KM_SLEEP);

			/*
			 * Get vpm mappings for maxsize with read access.
			 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later
			 * using an increasing wait. We might be here a
			 * long time.
			 *
			 * If delay_sig() returns EINTR, be sure to exit
			 * and pass it up to the caller.
			 */
			deadlk_wait = 0;
			while ((error = vpm_map_pages(fvp, fileoff,
			    (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
			    SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
				if ((error = delay_sig(deadlk_wait)) != 0) {
					break;
				}
			}
			if (error != 0) {
				kmem_free(snfv, sizeof (snf_vmap_desbinfo));
				error = (error == EINTR) ? EINTR : EIO;
				goto out;
			}
			snfv->snfv_frtn.free_func = snf_vmap_desbfree;
			snfv->snfv_frtn.free_arg = (caddr_t)snfv;

			/* Construct the mblk chain from the page mappings */
			chain_size = 0;
			for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
			    total_size > 0; i++) {
				ASSERT(chain_size < maxsize);
				mblk_size = MIN(snfv->snfv_vml[i].vs_len -
				    mapoff, total_size);
				nmp = esballoca(
				    (uchar_t *)snfv->snfv_vml[i].vs_addr +
				    mapoff, mblk_size, BPRI_HI,
				    &snfv->snfv_frtn);

				/*
				 * We return EAGAIN after unmapping the pages
				 * if we cannot allocate the head of the
				 * chain. Otherwise, we continue sending the
				 * mblks constructed so far.
				 */
				if (nmp == NULL) {
					if (i == 0) {
						vpm_unmap_pages(snfv->snfv_vml,
						    S_READ);
						kmem_free(snfv,
						    sizeof (snf_vmap_desbinfo));
						error = EAGAIN;
						goto out;
					}
					break;
				}
				/* Mark this dblk with the zero-copy flag */
				nmp->b_datap->db_struioflag |= STRUIO_ZC;
				nmp->b_wptr += mblk_size;
				chain_size += mblk_size;
				fileoff += mblk_size;
				total_size -= mblk_size;
				snfv->snfv_ref++;
				mapoff = 0;
				if (i > 0)
					linkb(mp, nmp);
				else
					mp = nmp;
			}
			VN_HOLD(fvp);
			snfv->snfv_vp = fvp;
		} else {
			/* vpm not supported; fall back to segmap. */
			snf_smap_desbinfo *snfi;

			mapoff = fileoff & MAXBOFFSET;
			chain_size = MAXBSIZE - mapoff;
			if (chain_size > total_size)
				chain_size = total_size;
			/*
			 * We don't forcefault because we'll call
			 * segmap_fault(F_SOFTLOCK) next.
			 *
			 * S_READ will get the ref bit set (by either
			 * segmap_getmapflt() or segmap_fault()) and the
			 * page shared-locked.
			 */
			base = segmap_getmapflt(segkmap, fvp, fileoff,
			    chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);

			snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
			    PAGESIZE) - (mapoff & PAGEMASK);
			/*
			 * We must call segmap_fault() even for segmap_kpm
			 * because that's how errors get returned.
			 * (segmap_getmapflt() never fails but segmap_fault()
			 * does.)
			 *
			 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later
			 * using an increasing wait. We might be here a
			 * long time.
			 *
			 * If delay_sig() returns EINTR, be sure to exit
			 * and pass it up to the caller.
			 */
			deadlk_wait = 0;
			while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
			    segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
			    mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
			    S_READ))) == EDEADLK) {
				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
				if ((error = delay_sig(deadlk_wait)) != 0) {
					break;
				}
			}
			if (error != 0) {
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				error = (error == EINTR) ? EINTR : EIO;
				goto out;
			}
			snfi->snfi_frtn.free_func = snf_smap_desbfree;
			snfi->snfi_frtn.free_arg = (caddr_t)snfi;
			snfi->snfi_base = base;
			snfi->snfi_mapoff = mapoff;
			mp = esballoca((uchar_t *)base + mapoff, chain_size,
			    BPRI_HI, &snfi->snfi_frtn);

			if (mp == NULL) {
				(void) segmap_fault(kas.a_hat, segkmap,
				    (caddr_t)(uintptr_t)(((uintptr_t)base +
				    mapoff) & PAGEMASK), snfi->snfi_len,
				    F_SOFTUNLOCK, S_OTHER);
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				error = EAGAIN;
				goto out;
			}
			VN_HOLD(fvp);
			snfi->snfi_vp = fvp;
			mp->b_wptr += chain_size;

			/* Mark this dblk with the zero-copy flag */
			mp->b_datap->db_struioflag |= STRUIO_ZC;
			fileoff += chain_size;
			total_size -= chain_size;
		}

		if (total_size == 0 && !nowait) {
			ASSERT(!dowait);
			dowait = B_TRUE;
			mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
		}
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
		if (error != 0) {
			/*
			 * mp contains the mblks that were not sent by
			 * socket_sendmblk(). Use its size to update *count.
			 */
			*count = ksize + (chain_size - msgdsize(mp));
			if (mp != NULL)
				freemsg(mp);
			return (error);
		}
		ksize += chain_size;
		if (total_size == 0)
			goto done;

		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			break;
		if (total_size + fileoff > va.va_size)
			total_size = va.va_size - fileoff;
	}
out:
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	if (dowait) {
		stdata_t *stp;

		stp = vp->v_stream;
		if (stp == NULL) {
			struct sonode *so;
			so = VTOSO(vp);
			error = so_zcopy_wait(so);
		} else {
			mutex_enter(&stp->sd_lock);
			while (!(stp->sd_flag & STZCNOTIFY)) {
				if (cv_wait_sig(&stp->sd_zcopy_wait,
				    &stp->sd_lock) == 0) {
					error = EINTR;
					break;
				}
			}
			stp->sd_flag &= ~STZCNOTIFY;
			mutex_exit(&stp->sd_lock);
		}
	}
	return (error);
}
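/*
 * Cached sendfile path: reads the file through the page cache with
 * VOP_READ() into freshly allocated mblks (one copy from the page
 * cache) and sends them down the socket.
 */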
int
snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
    uint_t maxpsz, ssize_t *count)
{
	struct vnode *vp;
	mblk_t *mp;
	int iosize;
	int extra = 0;
	int error;
	short fflag;
	int ksize;
	int ioflag;
	struct uio auio;
	struct iovec aiov;
	struct vattr va;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}
	bzero(&msg, sizeof (msg));
	fflag = fp->f_flag;
	ksize = 0;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	/* If read sync is not asked for, filter out the sync flags. */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}
		iosize = (int)MIN(maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

		mp->b_rptr += wroff;

		aiov.iov_base = (caddr_t)mp->b_rptr;
		aiov.iov_len = iosize;
		auio.uio_loffset = fileoff;
		auio.uio_resid = iosize;

		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
		iosize -= auio.uio_resid;

		if (error == EINTR && iosize != 0)
			error = 0;

		if (error != 0 || iosize == 0) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + iosize;

		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			*count = ksize;
			if (mp != NULL)
				freeb(mp);
			return (error);
		}
		ksize += iosize;
		size -= iosize;
		if (size == 0)
			goto done;

		fileoff += iosize;
		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			size = 0;
		else if (size + fileoff > va.va_size)
			size = va.va_size - fileoff;
	}
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	return (error);
}

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * Largefile support for 32-bit applications only.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	u_offset_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	if (sfv_len == 0)
		goto out;

	sfv_off = (u_offset_t)sfv->sfv_off;

	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * u_offset_t and share the snf_direct_io/snf_cache code between
	 * 32-bit and 64-bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
		    &count);
		goto out;
	}
	fvp = rfp->f_vnode;
	if (VOP_REALVP(fvp, &realvp, NULL) == 0)
		fvp = realvp;
	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath us.
	 */
	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	/*
	 * We enable zero-copy only if the transfer size is large enough:
	 * at least MAXBSIZE, and additionally either at least half the
	 * file, at least 16MB, or the NOWAIT flag is set. This prevents
	 * performance loss when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;
		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);
			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		} else {
			maxpsz = maxphys;
		}

		if (maxpsz == INFPSZ)
			maxpsz = maxphys;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
#endif

#ifdef _SYSCALL32_IMPL
/*
 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return an
 * ssize_t rather than ssize32_t; see the comments above read32 for details.
 */

ssize_t
recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
	caddr32_t name, caddr32_t namelenp)
{
	return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
}

ssize_t
send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
	caddr32_t name, socklen_t namelen)
{
	return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, namelen));
}
#endif	/* _SYSCALL32_IMPL */

/*
 * Function wrappers (mostly around the sonode switch) for
 * backward compatibility.
 */

int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	return (socket_accept(so, fflag, CRED(), nsop));
}

int
sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int backlog, int flags)
{
	int	error;

	error = socket_bind(so, name, namelen, flags, CRED());
	if (error == 0 && backlog != 0)
		return (socket_listen(so, backlog, CRED()));

	return (error);
}

int
solisten(struct sonode *so, int backlog)
{
	return (socket_listen(so, backlog, CRED()));
}

int
soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int fflag, int flags)
{
	return (socket_connect(so, name, namelen, fflag, flags, CRED()));
}

int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	return (socket_recvmsg(so, msg, uiop, CRED()));
}

int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	return (socket_sendmsg(so, msg, uiop, CRED()));
}

int
soshutdown(struct sonode *so, int how)
{
	return (socket_shutdown(so, how, CRED()));
}

int
sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
    socklen_t *optlenp, int flags)
{
	return (socket_getsockopt(so, level, option_name, optval, optlenp,
	    flags, CRED()));
}

int
sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
    t_uscalar_t optlen)
{
	return (socket_setsockopt(so, level, option_name, optval, optlen,
	    CRED()));
}

/*
 * Because this is a backward-compatibility interface, it only needs to
 * handle the creation of TPI sockfs sockets.
 */
struct sonode *
socreate(struct sockparams *sp, int family, int type, int protocol, int version,
    int *errorp)
{
	struct sonode *so;

	ASSERT(sp != NULL);

	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
	    version, SOCKET_SLEEP, errorp, CRED());
	if (so == NULL) {
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
		} else {
			socket_destroy(so);
			so = NULL;
		}
	}
	return (so);
}