xref: /illumos-gate/usr/src/uts/common/syscall/sendfile.c (revision c28749e97052f09388969427adf7df641cdcdc22)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <netinet/in.h>
58 #include <sys/sendfile.h>
59 #include <sys/un.h>
60 #include <inet/nca/ncadoorhdr.h>
61 #include <inet/nca/ncaio.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69 
70 extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
71 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
72 		ssize32_t *);
73 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
74 		int);
75 
76 /*
77  * kstrwritemp() has very similar semantics as that of strwrite().
78  * The main difference is it obtains mblks from the caller and also
79  * does not do any copy as done in strwrite() from user buffers to
80  * kernel buffers.
81  *
82  * Currently, this routine is used by sendfile to send data allocated
83  * within the kernel without any copying. This interface does not use the
84  * synchronous stream interface as synch. stream interface implies
85  * copying.
86  */
87 int
88 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
89 {
90 	struct stdata *stp;
91 	struct queue *wqp;
92 	mblk_t *newmp;
93 	char waitflag;
94 	int tempmode;
95 	int error = 0;
96 	int done = 0;
97 	struct sonode *so;
98 	boolean_t direct;
99 
100 	ASSERT(vp->v_stream);
101 	stp = vp->v_stream;
102 
103 	so = VTOSO(vp);
104 	direct = (so->so_state & SS_DIRECT);
105 
106 	/*
107 	 * This is the sockfs direct fast path. canputnext() need
108 	 * not be accurate so we don't grab the sd_lock here. If
109 	 * we get flow-controlled, we grab sd_lock just before the
110 	 * do..while loop below to emulate what strwrite() does.
111 	 */
112 	wqp = stp->sd_wrq;
113 	if (canputnext(wqp) && direct &&
114 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
115 		return (sostream_direct(so, NULL, mp, CRED()));
116 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
117 		/* Fast check of flags before acquiring the lock */
118 		mutex_enter(&stp->sd_lock);
119 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
120 		mutex_exit(&stp->sd_lock);
121 		if (error != 0) {
122 			if (!(stp->sd_flag & STPLEX) &&
123 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
124 				tsignal(curthread, SIGPIPE);
125 				error = EPIPE;
126 			}
127 			return (error);
128 		}
129 	}
130 
131 	waitflag = WRITEWAIT;
132 	if (stp->sd_flag & OLDNDELAY)
133 		tempmode = fmode & ~FNDELAY;
134 	else
135 		tempmode = fmode;
136 
137 	mutex_enter(&stp->sd_lock);
138 	do {
139 		if (canputnext(wqp)) {
140 			mutex_exit(&stp->sd_lock);
141 			if (stp->sd_wputdatafunc != NULL) {
142 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
143 				    NULL, NULL, NULL);
144 				if (newmp == NULL) {
145 					/* The caller will free mp */
146 					return (ECOMM);
147 				}
148 				mp = newmp;
149 			}
150 			putnext(wqp, mp);
151 			return (0);
152 		}
153 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
154 		    &done);
155 	} while (error == 0 && !done);
156 
157 	mutex_exit(&stp->sd_lock);
158 	/*
159 	 * EAGAIN tells the application to try again. ENOMEM
160 	 * is returned only if the memory allocation size
161 	 * exceeds the physical limits of the system. ENOMEM
162 	 * can't be true here.
163 	 */
164 	if (error == ENOMEM)
165 		error = EAGAIN;
166 	return (error);
167 }
168 
169 #define	SEND_MAX_CHUNK	16
170 
171 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
172 /*
173  * 64 bit offsets for 32 bit applications only running either on
174  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
175  * more than 2GB of data.
176  */
177 int
178 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
179     int copy_cnt, ssize32_t *count)
180 {
181 	struct vnode *vp;
182 	ushort_t fflag;
183 	int ioflag;
184 	size32_t cnt;
185 	ssize32_t sfv_len;
186 	ssize32_t tmpcount;
187 	u_offset_t sfv_off;
188 	struct uio auio;
189 	struct iovec aiov;
190 	int i, error;
191 
192 	fflag = fp->f_flag;
193 	vp = fp->f_vnode;
194 	for (i = 0; i < copy_cnt; i++) {
195 
196 		if (ISSIG(curthread, JUSTLOOKING))
197 			return (EINTR);
198 
199 		/*
200 		 * Do similar checks as "write" as we are writing
201 		 * sfv_len bytes into "vp".
202 		 */
203 		sfv_len = (ssize32_t)sfv->sfv_len;
204 
205 		if (sfv_len == 0)
206 			continue;
207 
208 		if (sfv_len < 0)
209 			return (EINVAL);
210 
211 		if (vp->v_type == VREG) {
212 			if (*fileoff >= curproc->p_fsz_ctl) {
213 				mutex_enter(&curproc->p_lock);
214 				(void) rctl_action(
215 				    rctlproc_legacy[RLIMIT_FSIZE],
216 				    curproc->p_rctls, curproc, RCA_SAFE);
217 				mutex_exit(&curproc->p_lock);
218 				return (EFBIG);
219 			}
220 
221 			if (*fileoff >= OFFSET_MAX(fp))
222 				return (EFBIG);
223 
224 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
225 				return (EINVAL);
226 		}
227 
228 		tmpcount = *count + sfv_len;
229 		if (tmpcount < 0)
230 			return (EINVAL);
231 
232 		sfv_off = sfv->sfv_off;
233 
234 		auio.uio_extflg = UIO_COPY_DEFAULT;
235 		if (sfv->sfv_fd == SFV_FD_SELF) {
236 			aiov.iov_len = sfv_len;
237 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
238 			auio.uio_loffset = *fileoff;
239 			auio.uio_iovcnt = 1;
240 			auio.uio_resid = sfv_len;
241 			auio.uio_iov = &aiov;
242 			auio.uio_segflg = UIO_USERSPACE;
243 			auio.uio_llimit = curproc->p_fsz_ctl;
244 			auio.uio_fmode = fflag;
245 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
246 			while (sfv_len > 0) {
247 				error = VOP_WRITE(vp, &auio, ioflag,
248 				    fp->f_cred, NULL);
249 				cnt = sfv_len - auio.uio_resid;
250 				sfv_len -= cnt;
251 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
252 				if (vp->v_type == VREG)
253 					*fileoff += cnt;
254 				*count += cnt;
255 				if (error != 0)
256 					return (error);
257 			}
258 		} else {
259 			file_t	*ffp;
260 			vnode_t	*readvp;
261 			int	readflg = 0;
262 			size_t	size;
263 			caddr_t	ptr;
264 
265 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
266 				return (EBADF);
267 
268 			if ((ffp->f_flag & FREAD) == 0) {
269 				releasef(sfv->sfv_fd);
270 				return (EBADF);
271 			}
272 
273 			readvp = ffp->f_vnode;
274 			if (readvp->v_type != VREG) {
275 				releasef(sfv->sfv_fd);
276 				return (EINVAL);
277 			}
278 
279 			/*
280 			 * No point reading and writing to same vp,
281 			 * as long as both are regular files. readvp is not
282 			 * locked; but since we got it from an open file the
283 			 * contents will be valid during the time of access.
284 			 */
285 			if (VN_CMP(vp, readvp)) {
286 				releasef(sfv->sfv_fd);
287 				return (EINVAL);
288 			}
289 
290 			/*
291 			 * Note: we assume readvp != vp. "vp" is already
292 			 * locked, and "readvp" must not be.
293 			 */
294 			(void) VOP_RWLOCK(readvp, readflg, NULL);
295 
296 			/*
297 			 * Same checks as in pread64.
298 			 */
299 			if (sfv_off > MAXOFFSET_T) {
300 				VOP_RWUNLOCK(readvp, readflg, NULL);
301 				releasef(sfv->sfv_fd);
302 				return (EINVAL);
303 			}
304 
305 			if (sfv_off + sfv_len > MAXOFFSET_T)
306 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
307 
308 			/* Find the native blocksize to transfer data */
309 			size = MIN(vp->v_vfsp->vfs_bsize,
310 			    readvp->v_vfsp->vfs_bsize);
311 			size = sfv_len < size ? sfv_len : size;
312 			ptr = kmem_alloc(size, KM_SLEEP);
313 
314 			while (sfv_len > 0) {
315 				size_t	iov_len;
316 
317 				iov_len = MIN(size, sfv_len);
318 				aiov.iov_base = ptr;
319 				aiov.iov_len = iov_len;
320 				auio.uio_loffset = sfv_off;
321 				auio.uio_iov = &aiov;
322 				auio.uio_iovcnt = 1;
323 				auio.uio_resid = iov_len;
324 				auio.uio_segflg = UIO_SYSSPACE;
325 				auio.uio_llimit = MAXOFFSET_T;
326 				auio.uio_fmode = ffp->f_flag;
327 				ioflag = auio.uio_fmode &
328 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
329 
330 				/*
331 				 * If read sync is not asked for,
332 				 * filter sync flags
333 				 */
334 				if ((ioflag & FRSYNC) == 0)
335 					ioflag &= ~(FSYNC|FDSYNC);
336 				error = VOP_READ(readvp, &auio, ioflag,
337 				    fp->f_cred, NULL);
338 				if (error) {
339 					kmem_free(ptr, size);
340 					VOP_RWUNLOCK(readvp, readflg, NULL);
341 					releasef(sfv->sfv_fd);
342 					return (error);
343 				}
344 
345 				/*
346 				 * Check how must data was really read.
347 				 * Decrement the 'len' and increment the
348 				 * 'off' appropriately.
349 				 */
350 				cnt = iov_len - auio.uio_resid;
351 				if (cnt == 0) {
352 					/*
353 					 * If we were reading a pipe (currently
354 					 * not implemented), we may now lose
355 					 * data.
356 					 */
357 					kmem_free(ptr, size);
358 					VOP_RWUNLOCK(readvp, readflg, NULL);
359 					releasef(sfv->sfv_fd);
360 					return (EINVAL);
361 				}
362 				sfv_len -= cnt;
363 				sfv_off += cnt;
364 
365 				aiov.iov_base = ptr;
366 				aiov.iov_len = cnt;
367 				auio.uio_loffset = *fileoff;
368 				auio.uio_resid = cnt;
369 				auio.uio_segflg = UIO_SYSSPACE;
370 				auio.uio_llimit = curproc->p_fsz_ctl;
371 				auio.uio_fmode = fflag;
372 				ioflag = auio.uio_fmode &
373 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
374 				error = VOP_WRITE(vp, &auio, ioflag,
375 				    fp->f_cred, NULL);
376 
377 				/*
378 				 * Check how much data was written. Increment
379 				 * the 'len' and decrement the 'off' if all
380 				 * the data was not written.
381 				 */
382 				cnt -= auio.uio_resid;
383 				sfv_len += auio.uio_resid;
384 				sfv_off -= auio.uio_resid;
385 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
386 				if (vp->v_type == VREG)
387 					*fileoff += cnt;
388 				*count += cnt;
389 				if (error != 0) {
390 					kmem_free(ptr, size);
391 					VOP_RWUNLOCK(readvp, readflg, NULL);
392 					releasef(sfv->sfv_fd);
393 					return (error);
394 				}
395 			}
396 			VOP_RWUNLOCK(readvp, readflg, NULL);
397 			releasef(sfv->sfv_fd);
398 			kmem_free(ptr, size);
399 		}
400 		sfv++;
401 	}
402 	return (0);
403 }
404 
405 ssize32_t
406 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
407 	size32_t *xferred, int fildes)
408 {
409 	int			rwflag;
410 	u_offset_t		fileoff;
411 	int			copy_cnt;
412 	const struct ksendfilevec64 *copy_vec;
413 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
414 	struct vnode *vp;
415 	int error;
416 	ssize32_t count = 0;
417 	int osfvcnt;
418 
419 	rwflag = 1;
420 	vp = fp->f_vnode;
421 	(void) VOP_RWLOCK(vp, rwflag, NULL);
422 
423 	copy_vec = vec;
424 	fileoff = fp->f_offset;
425 	osfvcnt = sfvcnt;
426 
427 	do {
428 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
429 		if (copyin(copy_vec, sfv, copy_cnt *
430 		    sizeof (struct ksendfilevec64))) {
431 			error = EFAULT;
432 			break;
433 		}
434 
435 		/*
436 		 * Optimize the single regular file over
437 		 * the socket case.
438 		 */
439 		if (vp->v_type == VSOCK && osfvcnt == 1 &&
440 		    sfv->sfv_fd != SFV_FD_SELF) {
441 			file_t *rfp;
442 			vnode_t *rvp;
443 
444 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
445 				error = EBADF;
446 				break;
447 			}
448 			if ((rfp->f_flag & FREAD) == 0) {
449 				releasef(sfv->sfv_fd);
450 				error = EBADF;
451 				break;
452 			}
453 			rvp = rfp->f_vnode;
454 			if (rvp->v_type == VREG) {
455 				error = sosendfile64(fp, rfp, sfv, &count);
456 				break;
457 			}
458 			releasef(sfv->sfv_fd);
459 		}
460 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
461 		if (error != 0)
462 			break;
463 
464 		copy_vec += copy_cnt;
465 		sfvcnt -= copy_cnt;
466 	} while (sfvcnt > 0);
467 
468 	if (vp->v_type == VREG)
469 		fp->f_offset += count;
470 
471 	VOP_RWUNLOCK(vp, rwflag, NULL);
472 	if (copyout(&count, xferred, sizeof (count)))
473 		error = EFAULT;
474 	releasef(fildes);
475 	if (error != 0)
476 		return (set_errno(error));
477 	return (count);
478 }
479 #endif
480 
481 int
482 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
483     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
484 {
485 	struct vnode *vp;
486 	struct uio auio;
487 	struct iovec aiov;
488 	ushort_t fflag;
489 	int ioflag;
490 	int i, error;
491 	size_t cnt;
492 	ssize_t sfv_len;
493 	u_offset_t sfv_off;
494 #ifdef _SYSCALL32_IMPL
495 	model_t model = get_udatamodel();
496 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
497 		MAXOFF32_T : MAXOFFSET_T;
498 #else
499 	const u_offset_t maxoff = MAXOFF32_T;
500 #endif
501 	mblk_t *dmp = NULL;
502 	int wroff;
503 	int buf_left = 0;
504 	size_t	iov_len;
505 	mblk_t  *head, *tmp;
506 	size_t  size = total_size;
507 	size_t  extra;
508 	int tail_len;
509 
510 	fflag = fp->f_flag;
511 	vp = fp->f_vnode;
512 
513 	ASSERT(vp->v_type == VSOCK);
514 	ASSERT(maxblk > 0);
515 
516 	wroff = (int)vp->v_stream->sd_wroff;
517 	tail_len = (int)vp->v_stream->sd_tail;
518 	extra = wroff + tail_len;
519 
520 	buf_left = MIN(total_size, maxblk);
521 	head = dmp = allocb(buf_left + extra, BPRI_HI);
522 	if (head == NULL)
523 		return (ENOMEM);
524 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
525 
526 	auio.uio_extflg = UIO_COPY_DEFAULT;
527 	for (i = 0; i < copy_cnt; i++) {
528 		if (ISSIG(curthread, JUSTLOOKING))
529 			return (EINTR);
530 
531 		/*
532 		 * Do similar checks as "write" as we are writing
533 		 * sfv_len bytes into "vp".
534 		 */
535 		sfv_len = (ssize_t)sfv->sfv_len;
536 
537 		if (sfv_len == 0) {
538 			sfv++;
539 			continue;
540 		}
541 
542 		/* Make sure sfv_len is not negative */
543 #ifdef _SYSCALL32_IMPL
544 		if (model == DATAMODEL_ILP32) {
545 			if ((ssize32_t)sfv_len < 0)
546 				return (EINVAL);
547 		} else
548 #endif
549 		if (sfv_len < 0)
550 			return (EINVAL);
551 
552 		/* Check for overflow */
553 #ifdef _SYSCALL32_IMPL
554 		if (model == DATAMODEL_ILP32) {
555 			if (((ssize32_t)(*count + sfv_len)) < 0)
556 				return (EINVAL);
557 		} else
558 #endif
559 		if ((*count + sfv_len) < 0)
560 			return (EINVAL);
561 
562 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
563 
564 		if (sfv->sfv_fd == SFV_FD_SELF) {
565 			while (sfv_len > 0) {
566 				if (buf_left == 0) {
567 					tmp = dmp;
568 					buf_left = MIN(total_size, maxblk);
569 					iov_len = MIN(buf_left, sfv_len);
570 					dmp = allocb(buf_left + extra, BPRI_HI);
571 					if (dmp == NULL) {
572 						freemsg(head);
573 						return (ENOMEM);
574 					}
575 					dmp->b_wptr = dmp->b_rptr =
576 					    dmp->b_rptr + wroff;
577 					tmp->b_cont = dmp;
578 				} else {
579 					iov_len = MIN(buf_left, sfv_len);
580 				}
581 
582 				aiov.iov_len = iov_len;
583 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
584 				auio.uio_loffset = *fileoff;
585 				auio.uio_iovcnt = 1;
586 				auio.uio_resid = iov_len;
587 				auio.uio_iov = &aiov;
588 				auio.uio_segflg = UIO_USERSPACE;
589 				auio.uio_llimit = curproc->p_fsz_ctl;
590 				auio.uio_fmode = fflag;
591 
592 				buf_left -= iov_len;
593 				total_size -= iov_len;
594 				sfv_len -= iov_len;
595 				sfv_off += iov_len;
596 
597 				error = uiomove((caddr_t)dmp->b_wptr,
598 				    iov_len, UIO_WRITE, &auio);
599 				if (error != 0) {
600 					freemsg(head);
601 					return (error);
602 				}
603 				dmp->b_wptr += iov_len;
604 			}
605 		} else {
606 			file_t	*ffp;
607 			vnode_t	*readvp;
608 			int	readflg = 0;
609 
610 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
611 				freemsg(head);
612 				return (EBADF);
613 			}
614 
615 			if ((ffp->f_flag & FREAD) == 0) {
616 				releasef(sfv->sfv_fd);
617 				freemsg(head);
618 				return (EACCES);
619 			}
620 
621 			readvp = ffp->f_vnode;
622 			if (readvp->v_type != VREG) {
623 				releasef(sfv->sfv_fd);
624 				freemsg(head);
625 				return (EINVAL);
626 			}
627 
628 			/*
629 			 * No point reading and writing to same vp,
630 			 * as long as both are regular files. readvp is not
631 			 * locked; but since we got it from an open file the
632 			 * contents will be valid during the time of access.
633 			 */
634 
635 			if (VN_CMP(vp, readvp)) {
636 				releasef(sfv->sfv_fd);
637 				freemsg(head);
638 				return (EINVAL);
639 			}
640 
641 			/*
642 			 * Note: we assume readvp != vp. "vp" is already
643 			 * locked, and "readvp" must not be.
644 			 */
645 
646 			(void) VOP_RWLOCK(readvp, readflg, NULL);
647 
648 			/* Same checks as in pread */
649 			if (sfv_off > maxoff) {
650 				VOP_RWUNLOCK(readvp, readflg, NULL);
651 				releasef(sfv->sfv_fd);
652 				freemsg(head);
653 				return (EINVAL);
654 			}
655 			if (sfv_off + sfv_len > maxoff) {
656 				sfv_len = (ssize_t)((offset_t)maxoff -
657 				    sfv_off);
658 			}
659 
660 			while (sfv_len > 0) {
661 				if (buf_left == 0) {
662 					tmp = dmp;
663 					buf_left = MIN(total_size, maxblk);
664 					iov_len = MIN(buf_left, sfv_len);
665 					dmp = allocb(buf_left + extra, BPRI_HI);
666 					if (dmp == NULL) {
667 						VOP_RWUNLOCK(readvp, readflg,
668 									NULL);
669 						releasef(sfv->sfv_fd);
670 						freemsg(head);
671 						return (ENOMEM);
672 					}
673 					dmp->b_wptr = dmp->b_rptr =
674 					    dmp->b_rptr + wroff;
675 					tmp->b_cont = dmp;
676 				} else {
677 					iov_len = MIN(buf_left, sfv_len);
678 				}
679 				aiov.iov_base = (caddr_t)dmp->b_wptr;
680 				aiov.iov_len = iov_len;
681 				auio.uio_loffset = sfv_off;
682 				auio.uio_iov = &aiov;
683 				auio.uio_iovcnt = 1;
684 				auio.uio_resid = iov_len;
685 				auio.uio_segflg = UIO_SYSSPACE;
686 				auio.uio_llimit = MAXOFFSET_T;
687 				auio.uio_fmode = ffp->f_flag;
688 				ioflag = auio.uio_fmode &
689 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
690 
691 				/*
692 				 * If read sync is not asked for,
693 				 * filter sync flags
694 				 */
695 				if ((ioflag & FRSYNC) == 0)
696 					ioflag &= ~(FSYNC|FDSYNC);
697 				error = VOP_READ(readvp, &auio, ioflag,
698 				    fp->f_cred, NULL);
699 				if (error != 0) {
700 					/*
701 					 * If we were reading a pipe (currently
702 					 * not implemented), we may now loose
703 					 * data.
704 					 */
705 					VOP_RWUNLOCK(readvp, readflg, NULL);
706 					releasef(sfv->sfv_fd);
707 					freemsg(head);
708 					return (error);
709 				}
710 
711 				/*
712 				 * Check how much data was really read.
713 				 * Decrement the 'len' and increment the
714 				 * 'off' appropriately.
715 				 */
716 				cnt = iov_len - auio.uio_resid;
717 				if (cnt == 0) {
718 					VOP_RWUNLOCK(readvp, readflg, NULL);
719 					releasef(sfv->sfv_fd);
720 					freemsg(head);
721 					return (EINVAL);
722 				}
723 				sfv_len -= cnt;
724 				sfv_off += cnt;
725 				total_size -= cnt;
726 				buf_left -= cnt;
727 
728 				dmp->b_wptr += cnt;
729 			}
730 			VOP_RWUNLOCK(readvp, readflg, NULL);
731 			releasef(sfv->sfv_fd);
732 		}
733 		sfv++;
734 	}
735 
736 	ASSERT(total_size == 0);
737 	error = kstrwritemp(vp, head, fflag);
738 	if (error != 0) {
739 		freemsg(head);
740 		return (error);
741 	}
742 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
743 	*count += size;
744 
745 	return (0);
746 }
747 
748 
749 int
750 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
751     int copy_cnt, ssize_t *count)
752 {
753 	struct vnode *vp;
754 	struct uio auio;
755 	struct iovec aiov;
756 	ushort_t fflag;
757 	int ioflag;
758 	int i, error;
759 	size_t cnt;
760 	ssize_t sfv_len;
761 	u_offset_t sfv_off;
762 #ifdef _SYSCALL32_IMPL
763 	model_t model = get_udatamodel();
764 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
765 		MAXOFF32_T : MAXOFFSET_T;
766 #else
767 	const u_offset_t maxoff = MAXOFF32_T;
768 #endif
769 	mblk_t	*dmp = NULL;
770 	char	*buf = NULL;
771 	size_t  extra;
772 	int maxblk, wroff, tail_len;
773 	struct sonode *so;
774 	stdata_t *stp;
775 
776 	fflag = fp->f_flag;
777 	vp = fp->f_vnode;
778 
779 	if (vp->v_type == VSOCK) {
780 		so = VTOSO(vp);
781 		stp = vp->v_stream;
782 		wroff = (int)stp->sd_wroff;
783 		tail_len = (int)stp->sd_tail;
784 		maxblk = (int)stp->sd_maxblk;
785 		extra = wroff + tail_len;
786 	}
787 
788 	auio.uio_extflg = UIO_COPY_DEFAULT;
789 	for (i = 0; i < copy_cnt; i++) {
790 		if (ISSIG(curthread, JUSTLOOKING))
791 			return (EINTR);
792 
793 		/*
794 		 * Do similar checks as "write" as we are writing
795 		 * sfv_len bytes into "vp".
796 		 */
797 		sfv_len = (ssize_t)sfv->sfv_len;
798 
799 		if (sfv_len == 0) {
800 			sfv++;
801 			continue;
802 		}
803 
804 		/* Make sure sfv_len is not negative */
805 #ifdef _SYSCALL32_IMPL
806 		if (model == DATAMODEL_ILP32) {
807 			if ((ssize32_t)sfv_len < 0)
808 				return (EINVAL);
809 		} else
810 #endif
811 		if (sfv_len < 0)
812 			return (EINVAL);
813 
814 		if (vp->v_type == VREG) {
815 			if (*fileoff >= curproc->p_fsz_ctl) {
816 				mutex_enter(&curproc->p_lock);
817 				(void) rctl_action(
818 				    rctlproc_legacy[RLIMIT_FSIZE],
819 				    curproc->p_rctls, curproc, RCA_SAFE);
820 				mutex_exit(&curproc->p_lock);
821 
822 				return (EFBIG);
823 			}
824 
825 			if (*fileoff >= maxoff)
826 				return (EFBIG);
827 
828 			if (*fileoff + sfv_len > maxoff)
829 				return (EINVAL);
830 		}
831 
832 		/* Check for overflow */
833 #ifdef _SYSCALL32_IMPL
834 		if (model == DATAMODEL_ILP32) {
835 			if (((ssize32_t)(*count + sfv_len)) < 0)
836 				return (EINVAL);
837 		} else
838 #endif
839 		if ((*count + sfv_len) < 0)
840 			return (EINVAL);
841 
842 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
843 
844 		if (sfv->sfv_fd == SFV_FD_SELF) {
845 			aiov.iov_len = sfv_len;
846 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
847 			auio.uio_loffset = *fileoff;
848 			auio.uio_iovcnt = 1;
849 			auio.uio_resid = sfv_len;
850 			auio.uio_iov = &aiov;
851 			auio.uio_segflg = UIO_USERSPACE;
852 			auio.uio_llimit = curproc->p_fsz_ctl;
853 			auio.uio_fmode = fflag;
854 
855 			if (vp->v_type == VSOCK) {
856 
857 				/*
858 				 * Optimize for the socket case
859 				 */
860 
861 				dmp = allocb(sfv_len + extra, BPRI_HI);
862 				if (dmp == NULL)
863 					return (ENOMEM);
864 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
865 				error = uiomove((caddr_t)dmp->b_wptr,
866 				    sfv_len, UIO_WRITE, &auio);
867 				if (error != 0) {
868 					freeb(dmp);
869 					return (error);
870 				}
871 				dmp->b_wptr += sfv_len;
872 				error = kstrwritemp(vp, dmp, fflag);
873 				if (error != 0) {
874 					freeb(dmp);
875 					return (error);
876 				}
877 				ttolwp(curthread)->lwp_ru.ioch +=
878 				    (ulong_t)sfv_len;
879 				*count += sfv_len;
880 			} else {
881 				ioflag = auio.uio_fmode &
882 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
883 				while (sfv_len > 0) {
884 					error = VOP_WRITE(vp, &auio, ioflag,
885 					    fp->f_cred, NULL);
886 					cnt = sfv_len - auio.uio_resid;
887 					sfv_len -= cnt;
888 					ttolwp(curthread)->lwp_ru.ioch +=
889 					    (ulong_t)cnt;
890 					*fileoff += cnt;
891 					*count += cnt;
892 					if (error != 0)
893 						return (error);
894 				}
895 			}
896 		} else {
897 			file_t	*ffp;
898 			vnode_t	*readvp;
899 			int	readflg = 0;
900 			size_t	size;
901 			caddr_t	ptr;
902 
903 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
904 				return (EBADF);
905 
906 			if ((ffp->f_flag & FREAD) == 0) {
907 				releasef(sfv->sfv_fd);
908 				return (EBADF);
909 			}
910 
911 			readvp = ffp->f_vnode;
912 			if (readvp->v_type != VREG) {
913 				releasef(sfv->sfv_fd);
914 				return (EINVAL);
915 			}
916 
917 			/*
918 			 * No point reading and writing to same vp,
919 			 * as long as both are regular files. readvp is not
920 			 * locked; but since we got it from an open file the
921 			 * contents will be valid during the time of access.
922 			 */
923 			if (VN_CMP(vp, readvp)) {
924 				releasef(sfv->sfv_fd);
925 				return (EINVAL);
926 			}
927 
928 			/*
929 			 * Note: we assume readvp != vp. "vp" is already
930 			 * locked, and "readvp" must not be.
931 			 */
932 			(void) VOP_RWLOCK(readvp, readflg, NULL);
933 
934 			/* Same checks as in pread */
935 			if (sfv_off > maxoff) {
936 				VOP_RWUNLOCK(readvp, readflg, NULL);
937 				releasef(sfv->sfv_fd);
938 				return (EINVAL);
939 			}
940 			if (sfv_off + sfv_len > maxoff) {
941 				sfv_len = (ssize_t)((offset_t)maxoff -
942 				    sfv_off);
943 			}
944 			/* Find the native blocksize to transfer data */
945 			size = MIN(vp->v_vfsp->vfs_bsize,
946 			    readvp->v_vfsp->vfs_bsize);
947 			size = sfv_len < size ? sfv_len : size;
948 
949 			if (vp->v_type != VSOCK) {
950 				buf = kmem_alloc(size, KM_NOSLEEP);
951 				if (buf == NULL) {
952 					VOP_RWUNLOCK(readvp, readflg, NULL);
953 					releasef(sfv->sfv_fd);
954 					return (ENOMEM);
955 				}
956 			} else {
957 				/*
958 				 * For sockets acting as an SSL proxy, we
959 				 * need to adjust the size to the maximum
960 				 * SSL record size set in the stream head.
961 				 */
962 				if (so->so_kssl_ctx != NULL)
963 					size = MIN(size, maxblk);
964 			}
965 
966 			while (sfv_len > 0) {
967 				size_t	iov_len;
968 
969 				iov_len = MIN(size, sfv_len);
970 
971 				if (vp->v_type == VSOCK) {
972 					dmp = allocb(iov_len + extra, BPRI_HI);
973 					if (dmp == NULL) {
974 						VOP_RWUNLOCK(readvp, readflg,
975 						    NULL);
976 						releasef(sfv->sfv_fd);
977 						return (ENOMEM);
978 					}
979 					dmp->b_wptr = dmp->b_rptr =
980 					    dmp->b_rptr + wroff;
981 					ptr = (caddr_t)dmp->b_rptr;
982 				} else {
983 					ptr = buf;
984 				}
985 
986 				aiov.iov_base = ptr;
987 				aiov.iov_len = iov_len;
988 				auio.uio_loffset = sfv_off;
989 				auio.uio_iov = &aiov;
990 				auio.uio_iovcnt = 1;
991 				auio.uio_resid = iov_len;
992 				auio.uio_segflg = UIO_SYSSPACE;
993 				auio.uio_llimit = MAXOFFSET_T;
994 				auio.uio_fmode = ffp->f_flag;
995 				ioflag = auio.uio_fmode &
996 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
997 
998 				/*
999 				 * If read sync is not asked for,
1000 				 * filter sync flags
1001 				 */
1002 				if ((ioflag & FRSYNC) == 0)
1003 					ioflag &= ~(FSYNC|FDSYNC);
1004 				error = VOP_READ(readvp, &auio, ioflag,
1005 				    fp->f_cred, NULL);
1006 				if (error != 0) {
1007 					/*
1008 					 * If we were reading a pipe (currently
1009 					 * not implemented), we may now lose
1010 					 * data.
1011 					 */
1012 					if (vp->v_type == VSOCK)
1013 						freeb(dmp);
1014 					else
1015 						kmem_free(buf, size);
1016 					VOP_RWUNLOCK(readvp, readflg, NULL);
1017 					releasef(sfv->sfv_fd);
1018 					return (error);
1019 				}
1020 
1021 				/*
1022 				 * Check how much data was really read.
1023 				 * Decrement the 'len' and increment the
1024 				 * 'off' appropriately.
1025 				 */
1026 				cnt = iov_len - auio.uio_resid;
1027 				if (cnt == 0) {
1028 					if (vp->v_type == VSOCK)
1029 						freeb(dmp);
1030 					else
1031 						kmem_free(buf, size);
1032 					VOP_RWUNLOCK(readvp, readflg, NULL);
1033 					releasef(sfv->sfv_fd);
1034 					return (EINVAL);
1035 				}
1036 				sfv_len -= cnt;
1037 				sfv_off += cnt;
1038 
1039 				if (vp->v_type == VSOCK) {
1040 					dmp->b_wptr = dmp->b_rptr + cnt;
1041 
1042 					error = kstrwritemp(vp, dmp, fflag);
1043 					if (error != 0) {
1044 						freeb(dmp);
1045 						VOP_RWUNLOCK(readvp, readflg,
1046 									NULL);
1047 						releasef(sfv->sfv_fd);
1048 						return (error);
1049 					}
1050 
1051 					ttolwp(curthread)->lwp_ru.ioch +=
1052 					    (ulong_t)cnt;
1053 					*count += cnt;
1054 				} else {
1055 
1056 					aiov.iov_base = ptr;
1057 					aiov.iov_len = cnt;
1058 					auio.uio_loffset = *fileoff;
1059 					auio.uio_resid = cnt;
1060 					auio.uio_segflg = UIO_SYSSPACE;
1061 					auio.uio_llimit = curproc->p_fsz_ctl;
1062 					auio.uio_fmode = fflag;
1063 					ioflag = auio.uio_fmode &
1064 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1065 					error = VOP_WRITE(vp, &auio, ioflag,
1066 					    fp->f_cred, NULL);
1067 
1068 					/*
1069 					 * Check how much data was written.
1070 					 * Increment the 'len' and decrement the
1071 					 * 'off' if all the data was not
1072 					 * written.
1073 					 */
1074 					cnt -= auio.uio_resid;
1075 					sfv_len += auio.uio_resid;
1076 					sfv_off -= auio.uio_resid;
1077 					ttolwp(curthread)->lwp_ru.ioch +=
1078 					    (ulong_t)cnt;
1079 					*fileoff += cnt;
1080 					*count += cnt;
1081 					if (error != 0) {
1082 						VOP_RWUNLOCK(readvp, readflg,
1083 									NULL);
1084 						releasef(sfv->sfv_fd);
1085 						return (error);
1086 					}
1087 				}
1088 			}
1089 			if (buf) {
1090 				kmem_free(buf, size);
1091 				buf = NULL;
1092 			}
1093 			VOP_RWUNLOCK(readvp, readflg, NULL);
1094 			releasef(sfv->sfv_fd);
1095 		}
1096 		sfv++;
1097 	}
1098 	return (0);
1099 }
1100 
1101 ssize_t
1102 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1103     size_t *xferred)
1104 {
1105 	int error;
1106 	file_t *fp;
1107 	struct vnode *vp;
1108 	struct sonode *so;
1109 	u_offset_t fileoff;
1110 	int copy_cnt;
1111 	const struct sendfilevec *copy_vec;
1112 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1113 	ssize_t count = 0;
1114 #ifdef _SYSCALL32_IMPL
1115 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1116 #endif
1117 	ssize_t total_size = 0;
1118 	int i;
1119 	boolean_t is_sock = B_FALSE;
1120 	int maxblk = 0;
1121 
1122 	if (sfvcnt <= 0)
1123 		return (set_errno(EINVAL));
1124 
1125 	if ((fp = getf(fildes)) == NULL)
1126 		return (set_errno(EBADF));
1127 
1128 	if (((fp->f_flag) & FWRITE) == 0) {
1129 		error = EBADF;
1130 		goto err;
1131 	}
1132 
1133 	fileoff = fp->f_offset;
1134 	vp = fp->f_vnode;
1135 
1136 	switch (vp->v_type) {
1137 	case VSOCK:
1138 		so = VTOSO(vp);
1139 		/* sendfile not supported for SCTP */
1140 		if (so->so_protocol == IPPROTO_SCTP) {
1141 			error = EPROTONOSUPPORT;
1142 			goto err;
1143 		}
1144 		is_sock = B_TRUE;
1145 		switch (so->so_family) {
1146 		case AF_NCA:
1147 		case AF_INET:
1148 		case AF_INET6:
1149 			/*
1150 			 * Make similar checks done in SOP_WRITE().
1151 			 */
1152 			if (so->so_state & SS_CANTSENDMORE) {
1153 				tsignal(curthread, SIGPIPE);
1154 				error = EPIPE;
1155 				goto err;
1156 			}
1157 			if (so->so_type != SOCK_STREAM) {
1158 				error = EOPNOTSUPP;
1159 				goto err;
1160 			}
1161 
1162 			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
1163 			    (SS_ISCONNECTED|SS_ISBOUND)) {
1164 				error = ENOTCONN;
1165 				goto err;
1166 			}
1167 
1168 			if ((so->so_state & SS_DIRECT) &&
1169 			    (so->so_priv != NULL) &&
1170 			    (so->so_kssl_ctx == NULL)) {
1171 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
1172 			} else {
1173 				maxblk = (int)vp->v_stream->sd_maxblk;
1174 			}
1175 			break;
1176 		default:
1177 			error = EAFNOSUPPORT;
1178 			goto err;
1179 		}
1180 		break;
1181 	case VREG:
1182 		break;
1183 	default:
1184 		error = EINVAL;
1185 		goto err;
1186 	}
1187 
1188 	switch (opcode) {
1189 	case SENDFILEV :
1190 		break;
1191 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1192 	case SENDFILEV64 :
1193 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1194 		    (size32_t *)xferred, fildes));
1195 #endif
1196 	default :
1197 		error = ENOSYS;
1198 		break;
1199 	}
1200 
1201 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1202 	copy_vec = vec;
1203 
1204 	do {
1205 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1206 #ifdef _SYSCALL32_IMPL
1207 		/* 32-bit callers need to have their iovec expanded. */
1208 		if (get_udatamodel() == DATAMODEL_ILP32) {
1209 			if (copyin(copy_vec, sfv32,
1210 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1211 				error = EFAULT;
1212 				break;
1213 			}
1214 
1215 			for (i = 0; i < copy_cnt; i++) {
1216 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1217 				sfv[i].sfv_off =
1218 					(off_t)(uint32_t)sfv32[i].sfv_off;
1219 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1220 				total_size += sfv[i].sfv_len;
1221 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1222 			}
1223 		} else {
1224 #endif
1225 			if (copyin(copy_vec, sfv,
1226 			    copy_cnt * sizeof (sendfilevec_t))) {
1227 				error = EFAULT;
1228 				break;
1229 			}
1230 
1231 			for (i = 0; i < copy_cnt; i++) {
1232 				total_size += sfv[i].sfv_len;
1233 			}
1234 #ifdef _SYSCALL32_IMPL
1235 		}
1236 #endif
1237 
1238 		/*
1239 		 * The task between deciding to use sendvec_small_chunk
1240 		 * and sendvec_chunk is dependant on multiple things:
1241 		 *
1242 		 * i) latency is important for smaller files. So if the
1243 		 * data is smaller than 'tcp_slow_start_initial' times
1244 		 * maxblk, then use sendvec_small_chunk which creates
1245 		 * maxblk size mblks and chains then together and sends
1246 		 * them to TCP in one shot. It also leaves 'wroff' size
1247 		 * space for the headers in each mblk.
1248 		 *
1249 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1250 		 * time maxblk, its probably real file data which is
1251 		 * dominating. So its better to use sendvec_chunk because
1252 		 * performance goes to dog if we don't do pagesize reads.
1253 		 * sendvec_chunk will do pagesize reads and write them
1254 		 * in pagesize mblks to TCP.
1255 		 *
1256 		 * Side Notes: A write to file has not been optimized.
1257 		 * Future zero copy code will plugin into sendvec_chunk
1258 		 * only because doing zero copy for files smaller then
1259 		 * pagesize is useless.
1260 		 *
1261 		 * Note, if socket has NL7C enabled then call NL7C's
1262 		 * senfilev() function to give NL7C a chance to copy
1263 		 * the vec for caching, then continue processing as
1264 		 * normal.
1265 		 */
1266 		if (is_sock) {
1267 			switch (so->so_family) {
1268 			case AF_INET:
1269 			case AF_INET6:
1270 				if (so->so_nl7c_flags != 0) {
1271 					nl7c_sendfilev(so, fileoff,
1272 					    sfv, copy_cnt);
1273 				}
1274 				if (total_size <= (4 * maxblk))
1275 					error = sendvec_small_chunk(fp,
1276 					    &fileoff, sfv, copy_cnt,
1277 					    total_size, maxblk, &count);
1278 				else
1279 					error = sendvec_chunk(fp, &fileoff,
1280 					    sfv, copy_cnt, &count);
1281 				break;
1282 			case AF_NCA:
1283 				error = nca_sendfilev(fp, sfv, copy_cnt,
1284 				    &count);
1285 				break;
1286 			}
1287 		} else {
1288 			ASSERT(vp->v_type == VREG);
1289 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1290 			    &count);
1291 		}
1292 
1293 
1294 #ifdef _SYSCALL32_IMPL
1295 	if (get_udatamodel() == DATAMODEL_ILP32)
1296 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1297 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1298 	else
1299 #endif
1300 		copy_vec += copy_cnt;
1301 		sfvcnt -= copy_cnt;
1302 	} while (sfvcnt > 0);
1303 
1304 	if (vp->v_type == VREG)
1305 		fp->f_offset += count;
1306 
1307 
1308 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1309 
1310 #ifdef _SYSCALL32_IMPL
1311 	if (get_udatamodel() == DATAMODEL_ILP32) {
1312 		ssize32_t count32 = (ssize32_t)count;
1313 		if (copyout(&count32, xferred, sizeof (count32)))
1314 			error = EFAULT;
1315 		releasef(fildes);
1316 		if (error != 0)
1317 			return (set_errno(error));
1318 		return (count32);
1319 	}
1320 #endif
1321 	if (copyout(&count, xferred, sizeof (count)))
1322 		error = EFAULT;
1323 	releasef(fildes);
1324 	if (error != 0)
1325 		return (set_errno(error));
1326 	return (count);
1327 err:
1328 	ASSERT(error != 0);
1329 	releasef(fildes);
1330 	return (set_errno(error));
1331 }
1332