/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
		ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
		int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
		boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offsets for 32-bit applications only, running on either a
 * 64-bit or a 32-bit kernel. For 32-bit apps, we can't transfer more
 * than 2GB of data.
 */
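/*
 * Process up to copy_cnt entries of a ksendfilevec64 array.  Each entry is
 * either copied from user memory (SFV_FD_SELF) or read from the descriptor
 * in sfv_fd and written to "fp".  *fileoff and *count are updated as data
 * is transferred, so an error return still reflects any partial progress.
 */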
85int
86sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
87    int copy_cnt, ssize32_t *count)
88{
89	struct vnode *vp;
90	ushort_t fflag;
91	int ioflag;
92	size32_t cnt;
93	ssize32_t sfv_len;
94	ssize32_t tmpcount;
95	u_offset_t sfv_off;
96	struct uio auio;
97	struct iovec aiov;
98	int i, error;
99
100	fflag = fp->f_flag;
101	vp = fp->f_vnode;
102	for (i = 0; i < copy_cnt; i++) {
103
104		if (ISSIG(curthread, JUSTLOOKING))
105			return (EINTR);
106
		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
111		sfv_len = (ssize32_t)sfv->sfv_len;
112
113		if (sfv_len == 0) {
114			sfv++;
115			continue;
116		}
117
118		if (sfv_len < 0)
119			return (EINVAL);
120
121		if (vp->v_type == VREG) {
122			if (*fileoff >= curproc->p_fsz_ctl) {
123				mutex_enter(&curproc->p_lock);
124				(void) rctl_action(
125				    rctlproc_legacy[RLIMIT_FSIZE],
126				    curproc->p_rctls, curproc, RCA_SAFE);
127				mutex_exit(&curproc->p_lock);
128				return (EFBIG);
129			}
130
131			if (*fileoff >= OFFSET_MAX(fp))
132				return (EFBIG);
133
134			if (*fileoff + sfv_len > OFFSET_MAX(fp))
135				return (EINVAL);
136		}
137
138		tmpcount = *count + sfv_len;
139		if (tmpcount < 0)
140			return (EINVAL);
141
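		/*
		 * For SFV_FD_SELF, sfv_off holds a user virtual address;
		 * otherwise it is an offset into the file named by sfv_fd.
		 */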
142		sfv_off = sfv->sfv_off;
143
144		auio.uio_extflg = UIO_COPY_DEFAULT;
145		if (sfv->sfv_fd == SFV_FD_SELF) {
146			aiov.iov_len = sfv_len;
147			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
148			auio.uio_loffset = *fileoff;
149			auio.uio_iovcnt = 1;
150			auio.uio_resid = sfv_len;
151			auio.uio_iov = &aiov;
152			auio.uio_segflg = UIO_USERSPACE;
153			auio.uio_llimit = curproc->p_fsz_ctl;
154			auio.uio_fmode = fflag;
155			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
156			while (sfv_len > 0) {
157				error = VOP_WRITE(vp, &auio, ioflag,
158				    fp->f_cred, NULL);
159				cnt = sfv_len - auio.uio_resid;
160				sfv_len -= cnt;
161				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
162				if (vp->v_type == VREG)
163					*fileoff += cnt;
164				*count += cnt;
165				if (error != 0)
166					return (error);
167			}
168		} else {
169			file_t	*ffp;
170			vnode_t	*readvp;
171			size_t	size;
172			caddr_t	ptr;
173
174			if ((ffp = getf(sfv->sfv_fd)) == NULL)
175				return (EBADF);
176
177			if ((ffp->f_flag & FREAD) == 0) {
178				releasef(sfv->sfv_fd);
179				return (EBADF);
180			}
181
182			readvp = ffp->f_vnode;
183			if (readvp->v_type != VREG) {
184				releasef(sfv->sfv_fd);
185				return (EINVAL);
186			}
187
			/*
			 * No point in reading from and writing to the same
			 * vp, as long as both are regular files. readvp is
			 * not locked, but since we got it from an open file
			 * the contents will remain valid for the duration of
			 * the access.
			 */
194			if (vn_compare(vp, readvp)) {
195				releasef(sfv->sfv_fd);
196				return (EINVAL);
197			}
198
			/*
			 * Optimize the case of sending a regular file
			 * over a socket.
			 */
203			if (vp->v_type == VSOCK) {
204				error = sosendfile64(fp, ffp, sfv,
205				    (ssize32_t *)&cnt);
206				*count += cnt;
207				if (error)
208					return (error);
209				sfv++;
210				continue;
211			}
212
			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be. The two rwlocks
			 * are taken in vnode address order below to avoid
			 * deadlock.
			 */
217			if (readvp < vp) {
218				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
219				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
220				    NULL);
221				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
222			} else {
223				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
224				    NULL);
225			}
226
227			/*
228			 * Same checks as in pread64.
229			 */
230			if (sfv_off > MAXOFFSET_T) {
231				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
232				releasef(sfv->sfv_fd);
233				return (EINVAL);
234			}
235
236			if (sfv_off + sfv_len > MAXOFFSET_T)
237				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
238
239			/* Find the native blocksize to transfer data */
240			size = MIN(vp->v_vfsp->vfs_bsize,
241			    readvp->v_vfsp->vfs_bsize);
242			size = sfv_len < size ? sfv_len : size;
243			ptr = kmem_alloc(size, KM_NOSLEEP);
244			if (ptr == NULL) {
245				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
246				releasef(sfv->sfv_fd);
247				return (ENOMEM);
248			}
249
250			while (sfv_len > 0) {
251				size_t	iov_len;
252
253				iov_len = MIN(size, sfv_len);
254				aiov.iov_base = ptr;
255				aiov.iov_len = iov_len;
256				auio.uio_loffset = sfv_off;
257				auio.uio_iov = &aiov;
258				auio.uio_iovcnt = 1;
259				auio.uio_resid = iov_len;
260				auio.uio_segflg = UIO_SYSSPACE;
261				auio.uio_llimit = MAXOFFSET_T;
262				auio.uio_fmode = ffp->f_flag;
263				ioflag = auio.uio_fmode &
264				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
265
266				/*
267				 * If read sync is not asked for,
268				 * filter sync flags
269				 */
270				if ((ioflag & FRSYNC) == 0)
271					ioflag &= ~(FSYNC|FDSYNC);
272				error = VOP_READ(readvp, &auio, ioflag,
273				    fp->f_cred, NULL);
274				if (error) {
275					kmem_free(ptr, size);
276					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
277					    NULL);
278					releasef(sfv->sfv_fd);
279					return (error);
280				}
281
				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
287				cnt = iov_len - auio.uio_resid;
288				if (cnt == 0) {
289					/*
290					 * If we were reading a pipe (currently
291					 * not implemented), we may now lose
292					 * data.
293					 */
294					kmem_free(ptr, size);
295					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
296					    NULL);
297					releasef(sfv->sfv_fd);
298					return (EINVAL);
299				}
300				sfv_len -= cnt;
301				sfv_off += cnt;
302
303				aiov.iov_base = ptr;
304				aiov.iov_len = cnt;
305				auio.uio_loffset = *fileoff;
306				auio.uio_iov = &aiov;
307				auio.uio_iovcnt = 1;
308				auio.uio_resid = cnt;
309				auio.uio_segflg = UIO_SYSSPACE;
310				auio.uio_llimit = curproc->p_fsz_ctl;
311				auio.uio_fmode = fflag;
312				ioflag = auio.uio_fmode &
313				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
314				error = VOP_WRITE(vp, &auio, ioflag,
315				    fp->f_cred, NULL);
316
317				/*
318				 * Check how much data was written. Increment
319				 * the 'len' and decrement the 'off' if all
320				 * the data was not written.
321				 */
322				cnt -= auio.uio_resid;
323				sfv_len += auio.uio_resid;
324				sfv_off -= auio.uio_resid;
325				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
326				if (vp->v_type == VREG)
327					*fileoff += cnt;
328				*count += cnt;
329				if (error != 0) {
330					kmem_free(ptr, size);
331					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
332					    NULL);
333					releasef(sfv->sfv_fd);
334					return (error);
335				}
336			}
337			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
338			releasef(sfv->sfv_fd);
339			kmem_free(ptr, size);
340		}
341		sfv++;
342	}
343	return (0);
344}
345
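/*
 * Copy in the ksendfilevec64 array in chunks of SEND_MAX_CHUNK entries and
 * feed them to sendvec_chunk64() while holding the destination vnode's
 * write lock.  On return the transfer count is copied out to "xferred" and,
 * for regular files, added to the file offset.
 */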
346ssize32_t
347sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
348    size32_t *xferred, int fildes)
349{
350	u_offset_t		fileoff;
351	int			copy_cnt;
352	const struct ksendfilevec64 *copy_vec;
353	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
354	struct vnode *vp;
355	int error;
356	ssize32_t count = 0;
357
358	vp = fp->f_vnode;
359	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
360
361	copy_vec = vec;
362	fileoff = fp->f_offset;
363
364	do {
365		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
366		if (copyin(copy_vec, sfv, copy_cnt *
367		    sizeof (struct ksendfilevec64))) {
368			error = EFAULT;
369			break;
370		}
371
372		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
373		if (error != 0)
374			break;
375
376		copy_vec += copy_cnt;
377		sfvcnt -= copy_cnt;
378	} while (sfvcnt > 0);
379
380	if (vp->v_type == VREG)
381		fp->f_offset += count;
382
383	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
384	if (copyout(&count, xferred, sizeof (count)))
385		error = EFAULT;
386	releasef(fildes);
387	if (error != 0)
388		return (set_errno(error));
389	return (count);
390}
391#endif
392
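/*
 * Send a small amount of data (at most a few times maxblk) to a socket.
 * The data from all vector entries is gathered into a single chain of
 * mblks, each carrying at most maxblk bytes of payload plus wroff bytes of
 * headroom and tail_len bytes of tailroom, and the whole chain is handed to
 * the transport with one socket_sendmblk() call to keep latency low.
 */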
393int
394sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
395    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
396{
397	struct vnode *vp;
398	struct uio auio;
399	struct iovec aiov;
400	ushort_t fflag;
401	int ioflag;
402	int i, error;
403	size_t cnt;
404	ssize_t sfv_len;
405	u_offset_t sfv_off;
406#ifdef _SYSCALL32_IMPL
407	model_t model = get_udatamodel();
408	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
409	    MAXOFF32_T : MAXOFFSET_T;
410#else
411	const u_offset_t maxoff = MAXOFF32_T;
412#endif
413	mblk_t *dmp = NULL;
414	int wroff;
415	int buf_left = 0;
416	size_t	iov_len;
417	mblk_t  *head, *tmp;
418	size_t  size = total_size;
419	size_t  extra;
420	int tail_len;
421	struct nmsghdr msg;
422
423	fflag = fp->f_flag;
424	vp = fp->f_vnode;
425
426	ASSERT(vp->v_type == VSOCK);
427	ASSERT(maxblk > 0);
428
429	/* If nothing to send, return */
430	if (total_size == 0)
431		return (0);
432
433	if (vp->v_stream != NULL) {
434		wroff = (int)vp->v_stream->sd_wroff;
435		tail_len = (int)vp->v_stream->sd_tail;
436	} else {
437		struct sonode *so;
438
439		so = VTOSO(vp);
440		wroff = so->so_proto_props.sopp_wroff;
441		tail_len = so->so_proto_props.sopp_tail;
442	}
443
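	/* Reserve room in each mblk for the transport's header and trailer. */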
444	extra = wroff + tail_len;
445
446	buf_left = MIN(total_size, maxblk);
447	head = dmp = allocb(buf_left + extra, BPRI_HI);
448	if (head == NULL)
449		return (ENOMEM);
450	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
451	bzero(&msg, sizeof (msg));
452
453	auio.uio_extflg = UIO_COPY_DEFAULT;
454	for (i = 0; i < copy_cnt; i++) {
455		if (ISSIG(curthread, JUSTLOOKING)) {
456			freemsg(head);
457			return (EINTR);
458		}
459
		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
464		sfv_len = (ssize_t)sfv->sfv_len;
465
466		if (sfv_len == 0) {
467			sfv++;
468			continue;
469		}
470
471		/* Check for overflow */
472#ifdef _SYSCALL32_IMPL
473		if (model == DATAMODEL_ILP32) {
474			if (((ssize32_t)(*count + sfv_len)) < 0) {
475				freemsg(head);
476				return (EINVAL);
477			}
478		} else
479#endif
480		if ((*count + sfv_len) < 0) {
481			freemsg(head);
482			return (EINVAL);
483		}
484
485		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
486
487		if (sfv->sfv_fd == SFV_FD_SELF) {
488			while (sfv_len > 0) {
489				if (buf_left == 0) {
490					tmp = dmp;
491					buf_left = MIN(total_size, maxblk);
492					iov_len = MIN(buf_left, sfv_len);
493					dmp = allocb(buf_left + extra, BPRI_HI);
494					if (dmp == NULL) {
495						freemsg(head);
496						return (ENOMEM);
497					}
498					dmp->b_wptr = dmp->b_rptr =
499					    dmp->b_rptr + wroff;
500					tmp->b_cont = dmp;
501				} else {
502					iov_len = MIN(buf_left, sfv_len);
503				}
504
505				aiov.iov_len = iov_len;
506				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
507				auio.uio_loffset = *fileoff;
508				auio.uio_iovcnt = 1;
509				auio.uio_resid = iov_len;
510				auio.uio_iov = &aiov;
511				auio.uio_segflg = UIO_USERSPACE;
512				auio.uio_llimit = curproc->p_fsz_ctl;
513				auio.uio_fmode = fflag;
514
515				buf_left -= iov_len;
516				total_size -= iov_len;
517				sfv_len -= iov_len;
518				sfv_off += iov_len;
519
520				error = uiomove((caddr_t)dmp->b_wptr,
521				    iov_len, UIO_WRITE, &auio);
522				if (error != 0) {
523					freemsg(head);
524					return (error);
525				}
526				dmp->b_wptr += iov_len;
527			}
528		} else {
529			file_t	*ffp;
530			vnode_t	*readvp;
531
532			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
533				freemsg(head);
534				return (EBADF);
535			}
536
537			if ((ffp->f_flag & FREAD) == 0) {
538				releasef(sfv->sfv_fd);
539				freemsg(head);
540				return (EACCES);
541			}
542
543			readvp = ffp->f_vnode;
544			if (readvp->v_type != VREG) {
545				releasef(sfv->sfv_fd);
546				freemsg(head);
547				return (EINVAL);
548			}
549
			/*
			 * No point in reading from and writing to the same
			 * vp, as long as both are regular files. readvp is
			 * not locked, but since we got it from an open file
			 * the contents will remain valid for the duration of
			 * the access.
			 */
556
557			if (vn_compare(vp, readvp)) {
558				releasef(sfv->sfv_fd);
559				freemsg(head);
560				return (EINVAL);
561			}
562
563			/*
564			 * Note: we assume readvp != vp. "vp" is already
565			 * locked, and "readvp" must not be.
566			 */
567
568			if (readvp < vp) {
569				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
570				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
571				    NULL);
572				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
573			} else {
574				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
575				    NULL);
576			}
577
578			/* Same checks as in pread */
579			if (sfv_off > maxoff) {
580				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
581				releasef(sfv->sfv_fd);
582				freemsg(head);
583				return (EINVAL);
584			}
585			if (sfv_off + sfv_len > maxoff) {
586				total_size -= (sfv_off + sfv_len - maxoff);
587				sfv_len = (ssize_t)((offset_t)maxoff -
588				    sfv_off);
589			}
590
591			while (sfv_len > 0) {
592				if (buf_left == 0) {
593					tmp = dmp;
594					buf_left = MIN(total_size, maxblk);
595					iov_len = MIN(buf_left, sfv_len);
596					dmp = allocb(buf_left + extra, BPRI_HI);
597					if (dmp == NULL) {
598						VOP_RWUNLOCK(readvp,
599						    V_WRITELOCK_FALSE, NULL);
600						releasef(sfv->sfv_fd);
601						freemsg(head);
602						return (ENOMEM);
603					}
604					dmp->b_wptr = dmp->b_rptr =
605					    dmp->b_rptr + wroff;
606					tmp->b_cont = dmp;
607				} else {
608					iov_len = MIN(buf_left, sfv_len);
609				}
610				aiov.iov_base = (caddr_t)dmp->b_wptr;
611				aiov.iov_len = iov_len;
612				auio.uio_loffset = sfv_off;
613				auio.uio_iov = &aiov;
614				auio.uio_iovcnt = 1;
615				auio.uio_resid = iov_len;
616				auio.uio_segflg = UIO_SYSSPACE;
617				auio.uio_llimit = MAXOFFSET_T;
618				auio.uio_fmode = ffp->f_flag;
619				ioflag = auio.uio_fmode &
620				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
621
622				/*
623				 * If read sync is not asked for,
624				 * filter sync flags
625				 */
626				if ((ioflag & FRSYNC) == 0)
627					ioflag &= ~(FSYNC|FDSYNC);
628				error = VOP_READ(readvp, &auio, ioflag,
629				    fp->f_cred, NULL);
630				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
636					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
637					    NULL);
638					releasef(sfv->sfv_fd);
639					freemsg(head);
640					return (error);
641				}
642
643				/*
644				 * Check how much data was really read.
645				 * Decrement the 'len' and increment the
646				 * 'off' appropriately.
647				 */
648				cnt = iov_len - auio.uio_resid;
649				if (cnt == 0) {
650					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
651					    NULL);
652					releasef(sfv->sfv_fd);
653					freemsg(head);
654					return (EINVAL);
655				}
656				sfv_len -= cnt;
657				sfv_off += cnt;
658				total_size -= cnt;
659				buf_left -= cnt;
660
661				dmp->b_wptr += cnt;
662			}
663			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
664			releasef(sfv->sfv_fd);
665		}
666		sfv++;
667	}
668
669	ASSERT(total_size == 0);
670	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
671	if (error != 0) {
672		if (head != NULL)
673			freemsg(head);
674		return (error);
675	}
676	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
677	*count += size;
678
679	return (0);
680}
681
682
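/*
 * General sendfilev() worker.  SFV_FD_SELF entries are copied straight from
 * user memory; file-backed entries are read in native-blocksize chunks and
 * then either sent down the socket as mblks or written to the destination
 * file with VOP_WRITE().  When it is safe to do so, file data headed for a
 * socket is sent through the zero-copy snf_segmap() path instead of being
 * copied through a kernel buffer.
 */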
683int
684sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
685    int copy_cnt, ssize_t *count)
686{
687	struct vnode *vp;
688	struct uio auio;
689	struct iovec aiov;
690	ushort_t fflag;
691	int ioflag;
692	int i, error;
693	size_t cnt;
694	ssize_t sfv_len;
695	u_offset_t sfv_off;
696#ifdef _SYSCALL32_IMPL
697	model_t model = get_udatamodel();
698	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
699	    MAXOFF32_T : MAXOFFSET_T;
700#else
701	const u_offset_t maxoff = MAXOFF32_T;
702#endif
703	mblk_t	*dmp = NULL;
704	char	*buf = NULL;
705	size_t  extra = 0;
706	int maxblk, wroff, tail_len;
707	struct sonode *so;
708	stdata_t *stp;
709	struct nmsghdr msg;
710
711	maxblk = 0;
712	wroff = 0;
713	fflag = fp->f_flag;
714	vp = fp->f_vnode;
715	so = NULL;
716	stp = NULL;
717
718	if (vp->v_type == VSOCK) {
719		so = VTOSO(vp);
720		if (vp->v_stream != NULL) {
721			stp = vp->v_stream;
722			wroff = (int)stp->sd_wroff;
723			tail_len = (int)stp->sd_tail;
724			maxblk = (int)stp->sd_maxblk;
725		} else {
726			stp = NULL;
727			wroff = so->so_proto_props.sopp_wroff;
728			tail_len = so->so_proto_props.sopp_tail;
729			maxblk = so->so_proto_props.sopp_maxblk;
730		}
731		extra = wroff + tail_len;
732	}
733
734	bzero(&msg, sizeof (msg));
735	auio.uio_extflg = UIO_COPY_DEFAULT;
736	for (i = 0; i < copy_cnt; i++) {
737		if (ISSIG(curthread, JUSTLOOKING))
738			return (EINTR);
739
		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
744		sfv_len = (ssize_t)sfv->sfv_len;
745
746		if (sfv_len == 0) {
747			sfv++;
748			continue;
749		}
750
751		if (vp->v_type == VREG) {
752			if (*fileoff >= curproc->p_fsz_ctl) {
753				mutex_enter(&curproc->p_lock);
754				(void) rctl_action(
755				    rctlproc_legacy[RLIMIT_FSIZE],
756				    curproc->p_rctls, curproc, RCA_SAFE);
757				mutex_exit(&curproc->p_lock);
758
759				return (EFBIG);
760			}
761
762			if (*fileoff >= maxoff)
763				return (EFBIG);
764
765			if (*fileoff + sfv_len > maxoff)
766				return (EINVAL);
767		}
768
769		/* Check for overflow */
770#ifdef _SYSCALL32_IMPL
771		if (model == DATAMODEL_ILP32) {
772			if (((ssize32_t)(*count + sfv_len)) < 0)
773				return (EINVAL);
774		} else
775#endif
776		if ((*count + sfv_len) < 0)
777			return (EINVAL);
778
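		/*
		 * For SFV_FD_SELF, sfv_off holds a user virtual address;
		 * otherwise it is an offset into the file named by sfv_fd.
		 */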
779		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
780
781		if (sfv->sfv_fd == SFV_FD_SELF) {
782			if (vp->v_type == VSOCK) {
783				while (sfv_len > 0) {
784					size_t iov_len;
785
786					iov_len = sfv_len;
787					/*
788					 * Socket filters can limit the mblk
789					 * size, so limit reads to maxblk if
790					 * there are filters present.
791					 */
792					if (so->so_filter_active > 0 &&
793					    maxblk != INFPSZ)
794						iov_len = MIN(iov_len, maxblk);
795
796					aiov.iov_len = iov_len;
797					aiov.iov_base =
798					    (caddr_t)(uintptr_t)sfv_off;
799
800					auio.uio_iov = &aiov;
801					auio.uio_iovcnt = 1;
802					auio.uio_loffset = *fileoff;
803					auio.uio_segflg = UIO_USERSPACE;
804					auio.uio_fmode = fflag;
805					auio.uio_llimit = curproc->p_fsz_ctl;
806					auio.uio_resid = iov_len;
807
808					dmp = allocb(iov_len + extra, BPRI_HI);
809					if (dmp == NULL)
810						return (ENOMEM);
811					dmp->b_wptr = dmp->b_rptr =
812					    dmp->b_rptr + wroff;
813					error = uiomove((caddr_t)dmp->b_wptr,
814					    iov_len, UIO_WRITE, &auio);
815					if (error != 0) {
816						freeb(dmp);
817						return (error);
818					}
819					dmp->b_wptr += iov_len;
820					error = socket_sendmblk(VTOSO(vp),
821					    &msg, fflag, CRED(), &dmp);
822
823					if (error != 0) {
824						if (dmp != NULL)
825							freeb(dmp);
826						return (error);
827					}
828					ttolwp(curthread)->lwp_ru.ioch +=
829					    (ulong_t)iov_len;
830					*count += iov_len;
831					sfv_len -= iov_len;
832					sfv_off += iov_len;
833				}
834			} else {
835				aiov.iov_len = sfv_len;
836				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
837
838				auio.uio_iov = &aiov;
839				auio.uio_iovcnt = 1;
840				auio.uio_loffset = *fileoff;
841				auio.uio_segflg = UIO_USERSPACE;
842				auio.uio_fmode = fflag;
843				auio.uio_llimit = curproc->p_fsz_ctl;
844				auio.uio_resid = sfv_len;
845
846				ioflag = auio.uio_fmode &
847				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
848				while (sfv_len > 0) {
849					error = VOP_WRITE(vp, &auio, ioflag,
850					    fp->f_cred, NULL);
851					cnt = sfv_len - auio.uio_resid;
852					sfv_len -= cnt;
853					ttolwp(curthread)->lwp_ru.ioch +=
854					    (ulong_t)cnt;
855					*fileoff += cnt;
856					*count += cnt;
857					if (error != 0)
858						return (error);
859				}
860			}
861		} else {
862			int segmapit = 0;
863			file_t	*ffp;
864			vnode_t	*readvp;
865			struct vnode *realvp;
866			size_t	size;
867			caddr_t	ptr;
868
869			if ((ffp = getf(sfv->sfv_fd)) == NULL)
870				return (EBADF);
871
872			if ((ffp->f_flag & FREAD) == 0) {
873				releasef(sfv->sfv_fd);
874				return (EBADF);
875			}
876
877			readvp = ffp->f_vnode;
878			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
879				readvp = realvp;
880			if (readvp->v_type != VREG) {
881				releasef(sfv->sfv_fd);
882				return (EINVAL);
883			}
884
			/*
			 * No point in reading from and writing to the same
			 * vp, as long as both are regular files. readvp is
			 * not locked, but since we got it from an open file
			 * the contents will remain valid for the duration of
			 * the access.
			 */
891			if (vn_compare(vp, readvp)) {
892				releasef(sfv->sfv_fd);
893				return (EINVAL);
894			}
895
896			/*
897			 * Note: we assume readvp != vp. "vp" is already
898			 * locked, and "readvp" must not be.
899			 */
900			if (readvp < vp) {
901				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
902				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
903				    NULL);
904				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
905			} else {
906				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
907				    NULL);
908			}
909
910			/* Same checks as in pread */
911			if (sfv_off > maxoff) {
912				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
913				releasef(sfv->sfv_fd);
914				return (EINVAL);
915			}
916			if (sfv_off + sfv_len > maxoff) {
917				sfv_len = (ssize_t)((offset_t)maxoff -
918				    sfv_off);
919			}
920			/* Find the native blocksize to transfer data */
921			size = MIN(vp->v_vfsp->vfs_bsize,
922			    readvp->v_vfsp->vfs_bsize);
923			size = sfv_len < size ? sfv_len : size;
924
925			if (vp->v_type != VSOCK) {
926				segmapit = 0;
927				buf = kmem_alloc(size, KM_NOSLEEP);
928				if (buf == NULL) {
929					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
930					    NULL);
931					releasef(sfv->sfv_fd);
932					return (ENOMEM);
933				}
934			} else {
935				uint_t	copyflag;
936
937				copyflag = stp != NULL ? stp->sd_copyflag :
938				    so->so_proto_props.sopp_zcopyflag;
939
940				/*
941				 * Socket filters can limit the mblk size,
942				 * so limit reads to maxblk if there are
943				 * filters present.
944				 */
945				if (so->so_filter_active > 0 &&
946				    maxblk != INFPSZ)
947					size = MIN(size, maxblk);
948
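				/*
				 * Use the zero-copy segmap path only when
				 * the source file has no frlocks and is
				 * mappable, and the transport either is
				 * known to be zero-copy safe or accepts
				 * SO_SND_COPYAVOID.
				 */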
949				if (vn_has_flocks(readvp) ||
950				    readvp->v_flag & VNOMAP ||
951				    copyflag & STZCVMUNSAFE) {
952					segmapit = 0;
953				} else if (copyflag & STZCVMSAFE) {
954					segmapit = 1;
955				} else {
956					int on = 1;
957					if (socket_setsockopt(VTOSO(vp),
958					    SOL_SOCKET, SO_SND_COPYAVOID,
959					    &on, sizeof (on), CRED()) == 0)
960						segmapit = 1;
961				}
962			}
963
964			if (segmapit) {
965				struct vattr va;
966				boolean_t nowait;
967
968				va.va_mask = AT_SIZE;
969				error = VOP_GETATTR(readvp, &va, 0, kcred,
970				    NULL);
971				if (error != 0 || sfv_off >= va.va_size) {
972					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
973					    NULL);
974					releasef(sfv->sfv_fd);
975					return (error);
976				}
977				/* Read as much as possible. */
978				if (sfv_off + sfv_len > va.va_size)
979					sfv_len = va.va_size - sfv_off;
980
981				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
982				error = snf_segmap(fp, readvp, sfv_off,
983				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
984				    nowait);
985				releasef(sfv->sfv_fd);
986				*count += cnt;
987				if (error)
988					return (error);
989				sfv++;
990				continue;
991			}
992
993			while (sfv_len > 0) {
994				size_t	iov_len;
995
996				iov_len = MIN(size, sfv_len);
997
998				if (vp->v_type == VSOCK) {
999					dmp = allocb(iov_len + extra, BPRI_HI);
1000					if (dmp == NULL) {
1001						VOP_RWUNLOCK(readvp,
1002						    V_WRITELOCK_FALSE, NULL);
1003						releasef(sfv->sfv_fd);
1004						return (ENOMEM);
1005					}
1006					dmp->b_wptr = dmp->b_rptr =
1007					    dmp->b_rptr + wroff;
1008					ptr = (caddr_t)dmp->b_rptr;
1009				} else {
1010					ptr = buf;
1011				}
1012
1013				aiov.iov_base = ptr;
1014				aiov.iov_len = iov_len;
1015				auio.uio_loffset = sfv_off;
1016				auio.uio_iov = &aiov;
1017				auio.uio_iovcnt = 1;
1018				auio.uio_resid = iov_len;
1019				auio.uio_segflg = UIO_SYSSPACE;
1020				auio.uio_llimit = MAXOFFSET_T;
1021				auio.uio_fmode = ffp->f_flag;
1022				ioflag = auio.uio_fmode &
1023				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1024
1025				/*
1026				 * If read sync is not asked for,
1027				 * filter sync flags
1028				 */
1029				if ((ioflag & FRSYNC) == 0)
1030					ioflag &= ~(FSYNC|FDSYNC);
1031				error = VOP_READ(readvp, &auio, ioflag,
1032				    fp->f_cred, NULL);
1033				if (error != 0) {
1034					/*
1035					 * If we were reading a pipe (currently
1036					 * not implemented), we may now lose
1037					 * data.
1038					 */
1039					if (vp->v_type == VSOCK)
1040						freeb(dmp);
1041					else
1042						kmem_free(buf, size);
1043					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
1044					    NULL);
1045					releasef(sfv->sfv_fd);
1046					return (error);
1047				}
1048
1049				/*
1050				 * Check how much data was really read.
1051				 * Decrement the 'len' and increment the
1052				 * 'off' appropriately.
1053				 */
1054				cnt = iov_len - auio.uio_resid;
1055				if (cnt == 0) {
1056					if (vp->v_type == VSOCK)
1057						freeb(dmp);
1058					else
1059						kmem_free(buf, size);
1060					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
1061					    NULL);
1062					releasef(sfv->sfv_fd);
1063					return (EINVAL);
1064				}
1065				sfv_len -= cnt;
1066				sfv_off += cnt;
1067
1068				if (vp->v_type == VSOCK) {
1069					dmp->b_wptr = dmp->b_rptr + cnt;
1070
1071					error = socket_sendmblk(VTOSO(vp),
1072					    &msg, fflag, CRED(), &dmp);
1073
1074					if (error != 0) {
1075						if (dmp != NULL)
1076							freeb(dmp);
1077						VOP_RWUNLOCK(readvp,
1078						    V_WRITELOCK_FALSE, NULL);
1079						releasef(sfv->sfv_fd);
1080						return (error);
1081					}
1082
1083					ttolwp(curthread)->lwp_ru.ioch +=
1084					    (ulong_t)cnt;
1085					*count += cnt;
1086				} else {
1087
1088					aiov.iov_base = ptr;
1089					aiov.iov_len = cnt;
1090					auio.uio_loffset = *fileoff;
1091					auio.uio_resid = cnt;
1092					auio.uio_iov = &aiov;
1093					auio.uio_iovcnt = 1;
1094					auio.uio_segflg = UIO_SYSSPACE;
1095					auio.uio_llimit = curproc->p_fsz_ctl;
1096					auio.uio_fmode = fflag;
1097					ioflag = auio.uio_fmode &
1098					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1099					error = VOP_WRITE(vp, &auio, ioflag,
1100					    fp->f_cred, NULL);
1101
1102					/*
1103					 * Check how much data was written.
1104					 * Increment the 'len' and decrement the
1105					 * 'off' if all the data was not
1106					 * written.
1107					 */
1108					cnt -= auio.uio_resid;
1109					sfv_len += auio.uio_resid;
1110					sfv_off -= auio.uio_resid;
1111					ttolwp(curthread)->lwp_ru.ioch +=
1112					    (ulong_t)cnt;
1113					*fileoff += cnt;
1114					*count += cnt;
1115					if (error != 0) {
1116						kmem_free(buf, size);
1117						VOP_RWUNLOCK(readvp,
1118						    V_WRITELOCK_FALSE, NULL);
1119						releasef(sfv->sfv_fd);
1120						return (error);
1121					}
1122				}
1123			}
1124			if (buf) {
1125				kmem_free(buf, size);
1126				buf = NULL;
1127			}
1128			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
1129			releasef(sfv->sfv_fd);
1130		}
1131		sfv++;
1132	}
1133	return (0);
1134}
1135
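/*
 * sendfilev() system call entry point.  Validates the destination
 * descriptor and opcode, copies in up to SEND_MAX_CHUNK sendfilevec
 * entries at a time (expanding 32-bit vectors as needed) and dispatches
 * each batch to nl7c_sendfilev(), sendvec_small_chunk() or sendvec_chunk().
 * The return value is the number of bytes transferred, or -1 with errno
 * set on failure.
 */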
1136ssize_t
1137sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1138    size_t *xferred)
1139{
1140	int error = 0;
1141	int first_vector_error = 0;
1142	file_t *fp;
1143	struct vnode *vp;
1144	struct sonode *so = NULL;
1145	u_offset_t fileoff;
1146	int copy_cnt;
1147	const struct sendfilevec *copy_vec;
1148	struct sendfilevec sfv[SEND_MAX_CHUNK];
1149	ssize_t count = 0;
1150#ifdef _SYSCALL32_IMPL
1151	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1152#endif
1153	ssize_t total_size;
1154	int i;
1155	boolean_t is_sock = B_FALSE;
1156	int maxblk = 0;
1157
1158	if (sfvcnt <= 0)
1159		return (set_errno(EINVAL));
1160
1161	if ((fp = getf(fildes)) == NULL)
1162		return (set_errno(EBADF));
1163
1164	if (((fp->f_flag) & FWRITE) == 0) {
1165		error = EBADF;
1166		goto err;
1167	}
1168
1169	fileoff = fp->f_offset;
1170	vp = fp->f_vnode;
1171
1172	switch (vp->v_type) {
1173	case VSOCK:
1174		so = VTOSO(vp);
1175		is_sock = B_TRUE;
1176		if (SOCK_IS_NONSTR(so)) {
1177			maxblk = so->so_proto_props.sopp_maxblk;
1178		} else {
1179			maxblk = (int)vp->v_stream->sd_maxblk;
1180		}
1181		break;
1182	case VREG:
1183		break;
1184	default:
1185		error = EINVAL;
1186		goto err;
1187	}
1188
1189	switch (opcode) {
1190	case SENDFILEV :
1191		break;
1192#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1193	case SENDFILEV64 :
1194		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1195		    (size32_t *)xferred, fildes));
1196#endif
1197	default :
1198		error = ENOSYS;
1199		break;
1200	}
1201
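	/* Hold the destination's write lock for the duration of the transfer. */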
1202	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1203	copy_vec = vec;
1204
1205	do {
1206		total_size = 0;
1207		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1208#ifdef _SYSCALL32_IMPL
1209		/* 32-bit callers need to have their iovec expanded. */
1210		if (get_udatamodel() == DATAMODEL_ILP32) {
1211			if (copyin(copy_vec, sfv32,
1212			    copy_cnt * sizeof (ksendfilevec32_t))) {
1213				error = EFAULT;
1214				break;
1215			}
1216
1217			for (i = 0; i < copy_cnt; i++) {
1218				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1219				sfv[i].sfv_off =
1220				    (off_t)(uint32_t)sfv32[i].sfv_off;
1221				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1222				total_size += sfv[i].sfv_len;
1223				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally, total_size needs to be checked
				 * after each element is added, to be sure
				 * that rogue values haven't overflowed the
				 * counter.
				 */
1231				if (((ssize32_t)sfv[i].sfv_len < 0) ||
1232				    ((ssize32_t)total_size < 0)) {
1233					/*
1234					 * Truncate the vector to send data
1235					 * described by elements before the
1236					 * error.
1237					 */
1238					copy_cnt = i;
1239					first_vector_error = EINVAL;
1240					/* total_size can't be trusted */
1241					if ((ssize32_t)total_size < 0)
1242						error = EINVAL;
1243					break;
1244				}
1245			}
1246			/* Nothing to do, process errors */
1247			if (copy_cnt == 0)
1248				break;
1249
1250		} else {
1251#endif
1252			if (copyin(copy_vec, sfv,
1253			    copy_cnt * sizeof (sendfilevec_t))) {
1254				error = EFAULT;
1255				break;
1256			}
1257
1258			for (i = 0; i < copy_cnt; i++) {
1259				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally, total_size needs to be checked
				 * after each element is added, to be sure
				 * that rogue values haven't overflowed the
				 * counter.
				 */
1267				if (((ssize_t)sfv[i].sfv_len < 0) ||
1268				    (total_size < 0)) {
1269					/*
1270					 * Truncate the vector to send data
1271					 * described by elements before the
1272					 * error.
1273					 */
1274					copy_cnt = i;
1275					first_vector_error = EINVAL;
1276					/* total_size can't be trusted */
1277					if (total_size < 0)
1278						error = EINVAL;
1279					break;
1280				}
1281			}
1282			/* Nothing to do, process errors */
1283			if (copy_cnt == 0)
1284				break;
1285#ifdef _SYSCALL32_IMPL
1286		}
1287#endif
1288
		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, use sendvec_small_chunk, which creates
		 * maxblk-sized mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' bytes
		 * of space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it is probably real file data that
		 * dominates. So it is better to use sendvec_chunk, because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side note: a write to a file has not been optimized.
		 * Future zero-copy code will plug into sendvec_chunk
		 * only, because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note: if the socket has NL7C enabled, call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
1315		if (is_sock) {
1316			if (!SOCK_IS_NONSTR(so) &&
1317			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
1318				error = nl7c_sendfilev(so, &fileoff,
1319				    sfv, copy_cnt, &count);
1320			} else if ((total_size <= (4 * maxblk)) &&
1321			    error == 0) {
1322				error = sendvec_small_chunk(fp,
1323				    &fileoff, sfv, copy_cnt,
1324				    total_size, maxblk, &count);
1325			} else {
1326				error = sendvec_chunk(fp, &fileoff,
1327				    sfv, copy_cnt, &count);
1328			}
1329		} else {
1330			ASSERT(vp->v_type == VREG);
1331			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1332			    &count);
1333		}
1334
1335
1336#ifdef _SYSCALL32_IMPL
1337		if (get_udatamodel() == DATAMODEL_ILP32) {
1338			copy_vec = (const struct sendfilevec *)
1339			    ((char *)copy_vec +
1340			    (copy_cnt * sizeof (ksendfilevec32_t)));
1341		} else
1342#endif
1343			copy_vec += copy_cnt;
1344		sfvcnt -= copy_cnt;
1345
1346	/* Process all vector members up to first error */
1347	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1348
1349	if (vp->v_type == VREG)
1350		fp->f_offset += count;
1351
1352	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1353
1354#ifdef _SYSCALL32_IMPL
1355	if (get_udatamodel() == DATAMODEL_ILP32) {
1356		ssize32_t count32 = (ssize32_t)count;
1357		if (copyout(&count32, xferred, sizeof (count32)))
1358			error = EFAULT;
1359		releasef(fildes);
1360		if (error != 0)
1361			return (set_errno(error));
1362		if (first_vector_error != 0)
1363			return (set_errno(first_vector_error));
1364		return (count32);
1365	}
1366#endif
1367	if (copyout(&count, xferred, sizeof (count)))
1368		error = EFAULT;
1369	releasef(fildes);
1370	if (error != 0)
1371		return (set_errno(error));
1372	if (first_vector_error != 0)
1373		return (set_errno(first_vector_error));
1374	return (count);
1375err:
1376	ASSERT(error != 0);
1377	releasef(fildes);
1378	return (set_errno(error));
1379}
1380