xref: /illumos-gate/usr/src/uts/common/syscall/sendfile.c (revision 1574ae68edf276820812b92d6c9f22e8f4049860)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <netinet/in.h>
58 #include <sys/sendfile.h>
59 #include <sys/un.h>
60 #include <inet/nca/ncadoorhdr.h>
61 #include <inet/nca/ncaio.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69 
/*
 * Helpers implemented outside this file.
 *
 * nca_sendfilev() is called by sendfilev() for AF_NCA sockets.
 * sosendfile64() is called by sendvec64() when a single regular file is
 * sent over a socket.
 * nl7c_sendfilev() is called by sendfilev() for AF_INET/AF_INET6 sockets
 * whose so_nl7c_flags is non-zero.
 */
extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
		ssize32_t *);
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
		int);

/*
 * Maximum number of vector entries copied in from the caller and processed
 * per pass; also the size of the on-stack vector arrays.
 */
#define	SEND_MAX_CHUNK	16
77 
78 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit file offsets for 32-bit applications, whether they run on a
 * 64-bit kernel or a 32-bit kernel. A 32-bit application cannot
 * transfer more than 2GB of data in a single call.
 */
84 int
85 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
86     int copy_cnt, ssize32_t *count)
87 {
88 	struct vnode *vp;
89 	ushort_t fflag;
90 	int ioflag;
91 	size32_t cnt;
92 	ssize32_t sfv_len;
93 	ssize32_t tmpcount;
94 	u_offset_t sfv_off;
95 	struct uio auio;
96 	struct iovec aiov;
97 	int i, error;
98 
99 	fflag = fp->f_flag;
100 	vp = fp->f_vnode;
101 	for (i = 0; i < copy_cnt; i++) {
102 
103 		if (ISSIG(curthread, JUSTLOOKING))
104 			return (EINTR);
105 
106 		/*
107 		 * Do similar checks as "write" as we are writing
108 		 * sfv_len bytes into "vp".
109 		 */
110 		sfv_len = (ssize32_t)sfv->sfv_len;
111 
112 		if (sfv_len == 0)
113 			continue;
114 
115 		if (sfv_len < 0)
116 			return (EINVAL);
117 
118 		if (vp->v_type == VREG) {
119 			if (*fileoff >= curproc->p_fsz_ctl) {
120 				mutex_enter(&curproc->p_lock);
121 				(void) rctl_action(
122 				    rctlproc_legacy[RLIMIT_FSIZE],
123 				    curproc->p_rctls, curproc, RCA_SAFE);
124 				mutex_exit(&curproc->p_lock);
125 				return (EFBIG);
126 			}
127 
128 			if (*fileoff >= OFFSET_MAX(fp))
129 				return (EFBIG);
130 
131 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
132 				return (EINVAL);
133 		}
134 
135 		tmpcount = *count + sfv_len;
136 		if (tmpcount < 0)
137 			return (EINVAL);
138 
139 		sfv_off = sfv->sfv_off;
140 
141 		auio.uio_extflg = UIO_COPY_DEFAULT;
142 		if (sfv->sfv_fd == SFV_FD_SELF) {
143 			aiov.iov_len = sfv_len;
144 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
145 			auio.uio_loffset = *fileoff;
146 			auio.uio_iovcnt = 1;
147 			auio.uio_resid = sfv_len;
148 			auio.uio_iov = &aiov;
149 			auio.uio_segflg = UIO_USERSPACE;
150 			auio.uio_llimit = curproc->p_fsz_ctl;
151 			auio.uio_fmode = fflag;
152 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
153 			while (sfv_len > 0) {
154 				error = VOP_WRITE(vp, &auio, ioflag,
155 				    fp->f_cred, NULL);
156 				cnt = sfv_len - auio.uio_resid;
157 				sfv_len -= cnt;
158 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
159 				if (vp->v_type == VREG)
160 					*fileoff += cnt;
161 				*count += cnt;
162 				if (error != 0)
163 					return (error);
164 			}
165 		} else {
166 			file_t	*ffp;
167 			vnode_t	*readvp;
168 			int	readflg = 0;
169 			size_t	size;
170 			caddr_t	ptr;
171 
172 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
173 				return (EBADF);
174 
175 			if ((ffp->f_flag & FREAD) == 0) {
176 				releasef(sfv->sfv_fd);
177 				return (EBADF);
178 			}
179 
180 			readvp = ffp->f_vnode;
181 			if (readvp->v_type != VREG) {
182 				releasef(sfv->sfv_fd);
183 				return (EINVAL);
184 			}
185 
186 			/*
187 			 * No point reading and writing to same vp,
188 			 * as long as both are regular files. readvp is not
189 			 * locked; but since we got it from an open file the
190 			 * contents will be valid during the time of access.
191 			 */
192 			if (VN_CMP(vp, readvp)) {
193 				releasef(sfv->sfv_fd);
194 				return (EINVAL);
195 			}
196 
197 			/*
198 			 * Note: we assume readvp != vp. "vp" is already
199 			 * locked, and "readvp" must not be.
200 			 */
201 			(void) VOP_RWLOCK(readvp, readflg, NULL);
202 
203 			/*
204 			 * Same checks as in pread64.
205 			 */
206 			if (sfv_off > MAXOFFSET_T) {
207 				VOP_RWUNLOCK(readvp, readflg, NULL);
208 				releasef(sfv->sfv_fd);
209 				return (EINVAL);
210 			}
211 
212 			if (sfv_off + sfv_len > MAXOFFSET_T)
213 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
214 
215 			/* Find the native blocksize to transfer data */
216 			size = MIN(vp->v_vfsp->vfs_bsize,
217 			    readvp->v_vfsp->vfs_bsize);
218 			size = sfv_len < size ? sfv_len : size;
219 			ptr = kmem_alloc(size, KM_SLEEP);
220 
221 			while (sfv_len > 0) {
222 				size_t	iov_len;
223 
224 				iov_len = MIN(size, sfv_len);
225 				aiov.iov_base = ptr;
226 				aiov.iov_len = iov_len;
227 				auio.uio_loffset = sfv_off;
228 				auio.uio_iov = &aiov;
229 				auio.uio_iovcnt = 1;
230 				auio.uio_resid = iov_len;
231 				auio.uio_segflg = UIO_SYSSPACE;
232 				auio.uio_llimit = MAXOFFSET_T;
233 				auio.uio_fmode = ffp->f_flag;
234 				ioflag = auio.uio_fmode &
235 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
236 
237 				/*
238 				 * If read sync is not asked for,
239 				 * filter sync flags
240 				 */
241 				if ((ioflag & FRSYNC) == 0)
242 					ioflag &= ~(FSYNC|FDSYNC);
243 				error = VOP_READ(readvp, &auio, ioflag,
244 				    fp->f_cred, NULL);
245 				if (error) {
246 					kmem_free(ptr, size);
247 					VOP_RWUNLOCK(readvp, readflg, NULL);
248 					releasef(sfv->sfv_fd);
249 					return (error);
250 				}
251 
252 				/*
253 				 * Check how must data was really read.
254 				 * Decrement the 'len' and increment the
255 				 * 'off' appropriately.
256 				 */
257 				cnt = iov_len - auio.uio_resid;
258 				if (cnt == 0) {
259 					/*
260 					 * If we were reading a pipe (currently
261 					 * not implemented), we may now lose
262 					 * data.
263 					 */
264 					kmem_free(ptr, size);
265 					VOP_RWUNLOCK(readvp, readflg, NULL);
266 					releasef(sfv->sfv_fd);
267 					return (EINVAL);
268 				}
269 				sfv_len -= cnt;
270 				sfv_off += cnt;
271 
272 				aiov.iov_base = ptr;
273 				aiov.iov_len = cnt;
274 				auio.uio_loffset = *fileoff;
275 				auio.uio_resid = cnt;
276 				auio.uio_segflg = UIO_SYSSPACE;
277 				auio.uio_llimit = curproc->p_fsz_ctl;
278 				auio.uio_fmode = fflag;
279 				ioflag = auio.uio_fmode &
280 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
281 				error = VOP_WRITE(vp, &auio, ioflag,
282 				    fp->f_cred, NULL);
283 
284 				/*
285 				 * Check how much data was written. Increment
286 				 * the 'len' and decrement the 'off' if all
287 				 * the data was not written.
288 				 */
289 				cnt -= auio.uio_resid;
290 				sfv_len += auio.uio_resid;
291 				sfv_off -= auio.uio_resid;
292 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
293 				if (vp->v_type == VREG)
294 					*fileoff += cnt;
295 				*count += cnt;
296 				if (error != 0) {
297 					kmem_free(ptr, size);
298 					VOP_RWUNLOCK(readvp, readflg, NULL);
299 					releasef(sfv->sfv_fd);
300 					return (error);
301 				}
302 			}
303 			VOP_RWUNLOCK(readvp, readflg, NULL);
304 			releasef(sfv->sfv_fd);
305 			kmem_free(ptr, size);
306 		}
307 		sfv++;
308 	}
309 	return (0);
310 }
311 
312 ssize32_t
313 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
314 	size32_t *xferred, int fildes)
315 {
316 	int			rwflag;
317 	u_offset_t		fileoff;
318 	int			copy_cnt;
319 	const struct ksendfilevec64 *copy_vec;
320 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
321 	struct vnode *vp;
322 	int error;
323 	ssize32_t count = 0;
324 	int osfvcnt;
325 
326 	rwflag = 1;
327 	vp = fp->f_vnode;
328 	(void) VOP_RWLOCK(vp, rwflag, NULL);
329 
330 	copy_vec = vec;
331 	fileoff = fp->f_offset;
332 	osfvcnt = sfvcnt;
333 
334 	do {
335 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
336 		if (copyin(copy_vec, sfv, copy_cnt *
337 		    sizeof (struct ksendfilevec64))) {
338 			error = EFAULT;
339 			break;
340 		}
341 
342 		/*
343 		 * Optimize the single regular file over
344 		 * the socket case.
345 		 */
346 		if (vp->v_type == VSOCK && osfvcnt == 1 &&
347 		    sfv->sfv_fd != SFV_FD_SELF) {
348 			file_t *rfp;
349 			vnode_t *rvp;
350 
351 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
352 				error = EBADF;
353 				break;
354 			}
355 			if ((rfp->f_flag & FREAD) == 0) {
356 				releasef(sfv->sfv_fd);
357 				error = EBADF;
358 				break;
359 			}
360 			rvp = rfp->f_vnode;
361 			if (rvp->v_type == VREG) {
362 				error = sosendfile64(fp, rfp, sfv, &count);
363 				break;
364 			}
365 			releasef(sfv->sfv_fd);
366 		}
367 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
368 		if (error != 0)
369 			break;
370 
371 		copy_vec += copy_cnt;
372 		sfvcnt -= copy_cnt;
373 	} while (sfvcnt > 0);
374 
375 	if (vp->v_type == VREG)
376 		fp->f_offset += count;
377 
378 	VOP_RWUNLOCK(vp, rwflag, NULL);
379 	if (copyout(&count, xferred, sizeof (count)))
380 		error = EFAULT;
381 	releasef(fildes);
382 	if (error != 0)
383 		return (set_errno(error));
384 	return (count);
385 }
386 #endif
387 
388 int
389 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
390     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
391 {
392 	struct vnode *vp;
393 	struct uio auio;
394 	struct iovec aiov;
395 	ushort_t fflag;
396 	int ioflag;
397 	int i, error;
398 	size_t cnt;
399 	ssize_t sfv_len;
400 	u_offset_t sfv_off;
401 #ifdef _SYSCALL32_IMPL
402 	model_t model = get_udatamodel();
403 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
404 		MAXOFF32_T : MAXOFFSET_T;
405 #else
406 	const u_offset_t maxoff = MAXOFF32_T;
407 #endif
408 	mblk_t *dmp = NULL;
409 	int wroff;
410 	int buf_left = 0;
411 	size_t	iov_len;
412 	mblk_t  *head, *tmp;
413 	size_t  size = total_size;
414 
415 	fflag = fp->f_flag;
416 	vp = fp->f_vnode;
417 
418 	ASSERT(vp->v_type == VSOCK);
419 	ASSERT(maxblk > 0);
420 
421 	wroff = (int)vp->v_stream->sd_wroff;
422 	buf_left = MIN(total_size, maxblk);
423 	head = dmp = allocb(buf_left + wroff, BPRI_HI);
424 	if (head == NULL)
425 		return (ENOMEM);
426 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
427 
428 	auio.uio_extflg = UIO_COPY_DEFAULT;
429 	for (i = 0; i < copy_cnt; i++) {
430 		if (ISSIG(curthread, JUSTLOOKING))
431 			return (EINTR);
432 
433 		/*
434 		 * Do similar checks as "write" as we are writing
435 		 * sfv_len bytes into "vp".
436 		 */
437 		sfv_len = (ssize_t)sfv->sfv_len;
438 
439 		if (sfv_len == 0) {
440 			sfv++;
441 			continue;
442 		}
443 
444 		/* Make sure sfv_len is not negative */
445 #ifdef _SYSCALL32_IMPL
446 		if (model == DATAMODEL_ILP32) {
447 			if ((ssize32_t)sfv_len < 0)
448 				return (EINVAL);
449 		} else
450 #endif
451 		if (sfv_len < 0)
452 			return (EINVAL);
453 
454 		/* Check for overflow */
455 #ifdef _SYSCALL32_IMPL
456 		if (model == DATAMODEL_ILP32) {
457 			if (((ssize32_t)(*count + sfv_len)) < 0)
458 				return (EINVAL);
459 		} else
460 #endif
461 		if ((*count + sfv_len) < 0)
462 			return (EINVAL);
463 
464 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
465 
466 		if (sfv->sfv_fd == SFV_FD_SELF) {
467 			while (sfv_len > 0) {
468 				if (buf_left == 0) {
469 					tmp = dmp;
470 					buf_left = MIN(total_size, maxblk);
471 					iov_len = MIN(buf_left, sfv_len);
472 					dmp = allocb(buf_left + wroff, BPRI_HI);
473 					if (dmp == NULL) {
474 						freemsg(head);
475 						return (ENOMEM);
476 					}
477 					dmp->b_wptr = dmp->b_rptr =
478 					    dmp->b_rptr + wroff;
479 					tmp->b_cont = dmp;
480 				} else {
481 					iov_len = MIN(buf_left, sfv_len);
482 				}
483 
484 				aiov.iov_len = iov_len;
485 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
486 				auio.uio_loffset = *fileoff;
487 				auio.uio_iovcnt = 1;
488 				auio.uio_resid = iov_len;
489 				auio.uio_iov = &aiov;
490 				auio.uio_segflg = UIO_USERSPACE;
491 				auio.uio_llimit = curproc->p_fsz_ctl;
492 				auio.uio_fmode = fflag;
493 
494 				buf_left -= iov_len;
495 				total_size -= iov_len;
496 				sfv_len -= iov_len;
497 				sfv_off += iov_len;
498 
499 				error = uiomove((caddr_t)dmp->b_wptr,
500 				    iov_len, UIO_WRITE, &auio);
501 				if (error != 0) {
502 					freemsg(head);
503 					return (error);
504 				}
505 				dmp->b_wptr += iov_len;
506 			}
507 		} else {
508 			file_t	*ffp;
509 			vnode_t	*readvp;
510 			int	readflg = 0;
511 
512 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
513 				freemsg(head);
514 				return (EBADF);
515 			}
516 
517 			if ((ffp->f_flag & FREAD) == 0) {
518 				releasef(sfv->sfv_fd);
519 				freemsg(head);
520 				return (EACCES);
521 			}
522 
523 			readvp = ffp->f_vnode;
524 			if (readvp->v_type != VREG) {
525 				releasef(sfv->sfv_fd);
526 				freemsg(head);
527 				return (EINVAL);
528 			}
529 
530 			/*
531 			 * No point reading and writing to same vp,
532 			 * as long as both are regular files. readvp is not
533 			 * locked; but since we got it from an open file the
534 			 * contents will be valid during the time of access.
535 			 */
536 
537 			if (VN_CMP(vp, readvp)) {
538 				releasef(sfv->sfv_fd);
539 				freemsg(head);
540 				return (EINVAL);
541 			}
542 
543 			/*
544 			 * Note: we assume readvp != vp. "vp" is already
545 			 * locked, and "readvp" must not be.
546 			 */
547 
548 			(void) VOP_RWLOCK(readvp, readflg, NULL);
549 
550 			/* Same checks as in pread */
551 			if (sfv_off > maxoff) {
552 				VOP_RWUNLOCK(readvp, readflg, NULL);
553 				releasef(sfv->sfv_fd);
554 				freemsg(head);
555 				return (EINVAL);
556 			}
557 			if (sfv_off + sfv_len > maxoff) {
558 				sfv_len = (ssize_t)((offset_t)maxoff -
559 				    sfv_off);
560 			}
561 
562 			while (sfv_len > 0) {
563 				if (buf_left == 0) {
564 					tmp = dmp;
565 					buf_left = MIN(total_size, maxblk);
566 					iov_len = MIN(buf_left, sfv_len);
567 					dmp = allocb(buf_left + wroff, BPRI_HI);
568 					if (dmp == NULL) {
569 						VOP_RWUNLOCK(readvp, readflg,
570 									NULL);
571 						releasef(sfv->sfv_fd);
572 						freemsg(head);
573 						return (ENOMEM);
574 					}
575 					dmp->b_wptr = dmp->b_rptr =
576 					    dmp->b_rptr + wroff;
577 					tmp->b_cont = dmp;
578 				} else {
579 					iov_len = MIN(buf_left, sfv_len);
580 				}
581 				aiov.iov_base = (caddr_t)dmp->b_wptr;
582 				aiov.iov_len = iov_len;
583 				auio.uio_loffset = sfv_off;
584 				auio.uio_iov = &aiov;
585 				auio.uio_iovcnt = 1;
586 				auio.uio_resid = iov_len;
587 				auio.uio_segflg = UIO_SYSSPACE;
588 				auio.uio_llimit = MAXOFFSET_T;
589 				auio.uio_fmode = ffp->f_flag;
590 				ioflag = auio.uio_fmode &
591 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
592 
593 				/*
594 				 * If read sync is not asked for,
595 				 * filter sync flags
596 				 */
597 				if ((ioflag & FRSYNC) == 0)
598 					ioflag &= ~(FSYNC|FDSYNC);
599 				error = VOP_READ(readvp, &auio, ioflag,
600 				    fp->f_cred, NULL);
601 				if (error != 0) {
602 					/*
603 					 * If we were reading a pipe (currently
604 					 * not implemented), we may now loose
605 					 * data.
606 					 */
607 					VOP_RWUNLOCK(readvp, readflg, NULL);
608 					releasef(sfv->sfv_fd);
609 					freemsg(head);
610 					return (error);
611 				}
612 
613 				/*
614 				 * Check how much data was really read.
615 				 * Decrement the 'len' and increment the
616 				 * 'off' appropriately.
617 				 */
618 				cnt = iov_len - auio.uio_resid;
619 				if (cnt == 0) {
620 					VOP_RWUNLOCK(readvp, readflg, NULL);
621 					releasef(sfv->sfv_fd);
622 					freemsg(head);
623 					return (EINVAL);
624 				}
625 				sfv_len -= cnt;
626 				sfv_off += cnt;
627 				total_size -= cnt;
628 				buf_left -= cnt;
629 
630 				dmp->b_wptr += cnt;
631 			}
632 			VOP_RWUNLOCK(readvp, readflg, NULL);
633 			releasef(sfv->sfv_fd);
634 		}
635 		sfv++;
636 	}
637 
638 	ASSERT(total_size == 0);
639 	error = kstrwritemp(vp, head, fflag);
640 	if (error != 0) {
641 		freemsg(head);
642 		return (error);
643 	}
644 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
645 	*count += size;
646 
647 	return (0);
648 }
649 
650 
651 int
652 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
653     int copy_cnt, ssize_t *count)
654 {
655 	struct vnode *vp;
656 	struct uio auio;
657 	struct iovec aiov;
658 	ushort_t fflag;
659 	int ioflag;
660 	int i, error;
661 	size_t cnt;
662 	ssize_t sfv_len;
663 	u_offset_t sfv_off;
664 #ifdef _SYSCALL32_IMPL
665 	model_t model = get_udatamodel();
666 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
667 		MAXOFF32_T : MAXOFFSET_T;
668 #else
669 	const u_offset_t maxoff = MAXOFF32_T;
670 #endif
671 	mblk_t	*dmp = NULL;
672 	char	*buf = NULL;
673 
674 	fflag = fp->f_flag;
675 	vp = fp->f_vnode;
676 
677 	auio.uio_extflg = UIO_COPY_DEFAULT;
678 	for (i = 0; i < copy_cnt; i++) {
679 		if (ISSIG(curthread, JUSTLOOKING))
680 			return (EINTR);
681 
682 		/*
683 		 * Do similar checks as "write" as we are writing
684 		 * sfv_len bytes into "vp".
685 		 */
686 		sfv_len = (ssize_t)sfv->sfv_len;
687 
688 		if (sfv_len == 0) {
689 			sfv++;
690 			continue;
691 		}
692 
693 		/* Make sure sfv_len is not negative */
694 #ifdef _SYSCALL32_IMPL
695 		if (model == DATAMODEL_ILP32) {
696 			if ((ssize32_t)sfv_len < 0)
697 				return (EINVAL);
698 		} else
699 #endif
700 		if (sfv_len < 0)
701 			return (EINVAL);
702 
703 		if (vp->v_type == VREG) {
704 			if (*fileoff >= curproc->p_fsz_ctl) {
705 				mutex_enter(&curproc->p_lock);
706 				(void) rctl_action(
707 				    rctlproc_legacy[RLIMIT_FSIZE],
708 				    curproc->p_rctls, curproc, RCA_SAFE);
709 				mutex_exit(&curproc->p_lock);
710 
711 				return (EFBIG);
712 			}
713 
714 			if (*fileoff >= maxoff)
715 				return (EFBIG);
716 
717 			if (*fileoff + sfv_len > maxoff)
718 				return (EINVAL);
719 		}
720 
721 		/* Check for overflow */
722 #ifdef _SYSCALL32_IMPL
723 		if (model == DATAMODEL_ILP32) {
724 			if (((ssize32_t)(*count + sfv_len)) < 0)
725 				return (EINVAL);
726 		} else
727 #endif
728 		if ((*count + sfv_len) < 0)
729 			return (EINVAL);
730 
731 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
732 
733 		if (sfv->sfv_fd == SFV_FD_SELF) {
734 			aiov.iov_len = sfv_len;
735 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
736 			auio.uio_loffset = *fileoff;
737 			auio.uio_iovcnt = 1;
738 			auio.uio_resid = sfv_len;
739 			auio.uio_iov = &aiov;
740 			auio.uio_segflg = UIO_USERSPACE;
741 			auio.uio_llimit = curproc->p_fsz_ctl;
742 			auio.uio_fmode = fflag;
743 
744 			if (vp->v_type == VSOCK) {
745 
746 				/*
747 				 * Optimize for the socket case
748 				 */
749 				int wroff = (int)vp->v_stream->sd_wroff;
750 
751 				dmp = allocb(sfv_len + wroff, BPRI_HI);
752 				if (dmp == NULL)
753 					return (ENOMEM);
754 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
755 				error = uiomove((caddr_t)dmp->b_wptr,
756 				    sfv_len, UIO_WRITE, &auio);
757 				if (error != 0) {
758 					freeb(dmp);
759 					return (error);
760 				}
761 				dmp->b_wptr += sfv_len;
762 				error = kstrwritemp(vp, dmp, fflag);
763 				if (error != 0) {
764 					freeb(dmp);
765 					return (error);
766 				}
767 				ttolwp(curthread)->lwp_ru.ioch +=
768 				    (ulong_t)sfv_len;
769 				*count += sfv_len;
770 			} else {
771 				ioflag = auio.uio_fmode &
772 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
773 				while (sfv_len > 0) {
774 					error = VOP_WRITE(vp, &auio, ioflag,
775 					    fp->f_cred, NULL);
776 					cnt = sfv_len - auio.uio_resid;
777 					sfv_len -= cnt;
778 					ttolwp(curthread)->lwp_ru.ioch +=
779 					    (ulong_t)cnt;
780 					*fileoff += cnt;
781 					*count += cnt;
782 					if (error != 0)
783 						return (error);
784 				}
785 			}
786 		} else {
787 			file_t	*ffp;
788 			vnode_t	*readvp;
789 			int	readflg = 0;
790 			size_t	size;
791 			caddr_t	ptr;
792 
793 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
794 				return (EBADF);
795 
796 			if ((ffp->f_flag & FREAD) == 0) {
797 				releasef(sfv->sfv_fd);
798 				return (EBADF);
799 			}
800 
801 			readvp = ffp->f_vnode;
802 			if (readvp->v_type != VREG) {
803 				releasef(sfv->sfv_fd);
804 				return (EINVAL);
805 			}
806 
807 			/*
808 			 * No point reading and writing to same vp,
809 			 * as long as both are regular files. readvp is not
810 			 * locked; but since we got it from an open file the
811 			 * contents will be valid during the time of access.
812 			 */
813 			if (VN_CMP(vp, readvp)) {
814 				releasef(sfv->sfv_fd);
815 				return (EINVAL);
816 			}
817 
818 			/*
819 			 * Note: we assume readvp != vp. "vp" is already
820 			 * locked, and "readvp" must not be.
821 			 */
822 			(void) VOP_RWLOCK(readvp, readflg, NULL);
823 
824 			/* Same checks as in pread */
825 			if (sfv_off > maxoff) {
826 				VOP_RWUNLOCK(readvp, readflg, NULL);
827 				releasef(sfv->sfv_fd);
828 				return (EINVAL);
829 			}
830 			if (sfv_off + sfv_len > maxoff) {
831 				sfv_len = (ssize_t)((offset_t)maxoff -
832 				    sfv_off);
833 			}
834 			/* Find the native blocksize to transfer data */
835 			size = MIN(vp->v_vfsp->vfs_bsize,
836 			    readvp->v_vfsp->vfs_bsize);
837 			size = sfv_len < size ? sfv_len : size;
838 
839 			if (vp->v_type != VSOCK) {
840 				buf = kmem_alloc(size, KM_NOSLEEP);
841 				if (buf == NULL) {
842 					VOP_RWUNLOCK(readvp, readflg, NULL);
843 					releasef(sfv->sfv_fd);
844 					return (ENOMEM);
845 				}
846 			}
847 
848 			while (sfv_len > 0) {
849 				size_t	iov_len;
850 
851 				iov_len = MIN(size, sfv_len);
852 
853 				if (vp->v_type == VSOCK) {
854 					dmp = allocb(iov_len, BPRI_HI);
855 					if (dmp == NULL) {
856 						VOP_RWUNLOCK(readvp, readflg,
857 						    NULL);
858 						releasef(sfv->sfv_fd);
859 						return (ENOMEM);
860 					}
861 					ptr = (caddr_t)dmp->b_rptr;
862 				} else {
863 					ptr = buf;
864 				}
865 
866 				aiov.iov_base = ptr;
867 				aiov.iov_len = iov_len;
868 				auio.uio_loffset = sfv_off;
869 				auio.uio_iov = &aiov;
870 				auio.uio_iovcnt = 1;
871 				auio.uio_resid = iov_len;
872 				auio.uio_segflg = UIO_SYSSPACE;
873 				auio.uio_llimit = MAXOFFSET_T;
874 				auio.uio_fmode = ffp->f_flag;
875 				ioflag = auio.uio_fmode &
876 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
877 
878 				/*
879 				 * If read sync is not asked for,
880 				 * filter sync flags
881 				 */
882 				if ((ioflag & FRSYNC) == 0)
883 					ioflag &= ~(FSYNC|FDSYNC);
884 				error = VOP_READ(readvp, &auio, ioflag,
885 				    fp->f_cred, NULL);
886 				if (error != 0) {
887 					/*
888 					 * If we were reading a pipe (currently
889 					 * not implemented), we may now lose
890 					 * data.
891 					 */
892 					if (vp->v_type == VSOCK)
893 						freeb(dmp);
894 					else
895 						kmem_free(buf, size);
896 					VOP_RWUNLOCK(readvp, readflg, NULL);
897 					releasef(sfv->sfv_fd);
898 					return (error);
899 				}
900 
901 				/*
902 				 * Check how much data was really read.
903 				 * Decrement the 'len' and increment the
904 				 * 'off' appropriately.
905 				 */
906 				cnt = iov_len - auio.uio_resid;
907 				if (cnt == 0) {
908 					if (vp->v_type == VSOCK)
909 						freeb(dmp);
910 					else
911 						kmem_free(buf, size);
912 					VOP_RWUNLOCK(readvp, readflg, NULL);
913 					releasef(sfv->sfv_fd);
914 					return (EINVAL);
915 				}
916 				sfv_len -= cnt;
917 				sfv_off += cnt;
918 
919 				if (vp->v_type == VSOCK) {
920 					dmp->b_wptr = dmp->b_rptr + cnt;
921 
922 					error = kstrwritemp(vp, dmp, fflag);
923 					if (error != 0) {
924 						freeb(dmp);
925 						VOP_RWUNLOCK(readvp, readflg,
926 									NULL);
927 						releasef(sfv->sfv_fd);
928 						return (error);
929 					}
930 
931 					ttolwp(curthread)->lwp_ru.ioch +=
932 					    (ulong_t)cnt;
933 					*count += cnt;
934 				} else {
935 
936 					aiov.iov_base = ptr;
937 					aiov.iov_len = cnt;
938 					auio.uio_loffset = *fileoff;
939 					auio.uio_resid = cnt;
940 					auio.uio_segflg = UIO_SYSSPACE;
941 					auio.uio_llimit = curproc->p_fsz_ctl;
942 					auio.uio_fmode = fflag;
943 					ioflag = auio.uio_fmode &
944 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
945 					error = VOP_WRITE(vp, &auio, ioflag,
946 					    fp->f_cred, NULL);
947 
948 					/*
949 					 * Check how much data was written.
950 					 * Increment the 'len' and decrement the
951 					 * 'off' if all the data was not
952 					 * written.
953 					 */
954 					cnt -= auio.uio_resid;
955 					sfv_len += auio.uio_resid;
956 					sfv_off -= auio.uio_resid;
957 					ttolwp(curthread)->lwp_ru.ioch +=
958 					    (ulong_t)cnt;
959 					*fileoff += cnt;
960 					*count += cnt;
961 					if (error != 0) {
962 						VOP_RWUNLOCK(readvp, readflg,
963 									NULL);
964 						releasef(sfv->sfv_fd);
965 						return (error);
966 					}
967 				}
968 			}
969 			if (buf) {
970 				kmem_free(buf, size);
971 				buf = NULL;
972 			}
973 			VOP_RWUNLOCK(readvp, readflg, NULL);
974 			releasef(sfv->sfv_fd);
975 		}
976 		sfv++;
977 	}
978 	return (0);
979 }
980 
981 ssize_t
982 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
983     size_t *xferred)
984 {
985 	int error;
986 	file_t *fp;
987 	struct vnode *vp;
988 	struct sonode *so;
989 	u_offset_t fileoff;
990 	int copy_cnt;
991 	const struct sendfilevec *copy_vec;
992 	struct sendfilevec sfv[SEND_MAX_CHUNK];
993 	ssize_t count = 0;
994 #ifdef _SYSCALL32_IMPL
995 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
996 #endif
997 	ssize_t total_size = 0;
998 	int i;
999 	boolean_t is_sock = B_FALSE;
1000 	int maxblk = 0;
1001 
1002 	if (sfvcnt <= 0)
1003 		return (set_errno(EINVAL));
1004 
1005 	if ((fp = getf(fildes)) == NULL)
1006 		return (set_errno(EBADF));
1007 
1008 	if (((fp->f_flag) & FWRITE) == 0) {
1009 		error = EBADF;
1010 		goto err;
1011 	}
1012 
1013 	fileoff = fp->f_offset;
1014 	vp = fp->f_vnode;
1015 
1016 	switch (vp->v_type) {
1017 	case VSOCK:
1018 		so = VTOSO(vp);
1019 		/* sendfile not supported for SCTP */
1020 		if (so->so_protocol == IPPROTO_SCTP) {
1021 			error = EPROTONOSUPPORT;
1022 			goto err;
1023 		}
1024 		is_sock = B_TRUE;
1025 		switch (so->so_family) {
1026 		case AF_NCA:
1027 		case AF_INET:
1028 		case AF_INET6:
1029 			/*
1030 			 * Make similar checks done in SOP_WRITE().
1031 			 */
1032 			if (so->so_state & SS_CANTSENDMORE) {
1033 				tsignal(curthread, SIGPIPE);
1034 				error = EPIPE;
1035 				goto err;
1036 			}
1037 			if (so->so_type != SOCK_STREAM) {
1038 				error = EOPNOTSUPP;
1039 				goto err;
1040 			}
1041 
1042 			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
1043 			    (SS_ISCONNECTED|SS_ISBOUND)) {
1044 				error = ENOTCONN;
1045 				goto err;
1046 			}
1047 
1048 			if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
1049 			    (so->so_priv != NULL)) {
1050 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
1051 			} else {
1052 				maxblk = (int)vp->v_stream->sd_maxblk;
1053 			}
1054 			break;
1055 		default:
1056 			error = EAFNOSUPPORT;
1057 			goto err;
1058 		}
1059 		break;
1060 	case VREG:
1061 		break;
1062 	default:
1063 		error = EINVAL;
1064 		goto err;
1065 	}
1066 
1067 	switch (opcode) {
1068 	case SENDFILEV :
1069 		break;
1070 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1071 	case SENDFILEV64 :
1072 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1073 		    (size32_t *)xferred, fildes));
1074 #endif
1075 	default :
1076 		error = ENOSYS;
1077 		break;
1078 	}
1079 
1080 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1081 	copy_vec = vec;
1082 
1083 	do {
1084 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1085 #ifdef _SYSCALL32_IMPL
1086 		/* 32-bit callers need to have their iovec expanded. */
1087 		if (get_udatamodel() == DATAMODEL_ILP32) {
1088 			if (copyin(copy_vec, sfv32,
1089 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1090 				error = EFAULT;
1091 				break;
1092 			}
1093 
1094 			for (i = 0; i < copy_cnt; i++) {
1095 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1096 				sfv[i].sfv_off =
1097 					(off_t)(uint32_t)sfv32[i].sfv_off;
1098 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1099 				total_size += sfv[i].sfv_len;
1100 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1101 			}
1102 		} else {
1103 #endif
1104 			if (copyin(copy_vec, sfv,
1105 			    copy_cnt * sizeof (sendfilevec_t))) {
1106 				error = EFAULT;
1107 				break;
1108 			}
1109 
1110 			for (i = 0; i < copy_cnt; i++) {
1111 				total_size += sfv[i].sfv_len;
1112 			}
1113 #ifdef _SYSCALL32_IMPL
1114 		}
1115 #endif
1116 
1117 		/*
1118 		 * The task between deciding to use sendvec_small_chunk
1119 		 * and sendvec_chunk is dependant on multiple things:
1120 		 *
1121 		 * i) latency is important for smaller files. So if the
1122 		 * data is smaller than 'tcp_slow_start_initial' times
1123 		 * maxblk, then use sendvec_small_chunk which creates
1124 		 * maxblk size mblks and chains then together and sends
1125 		 * them to TCP in one shot. It also leaves 'wroff' size
1126 		 * space for the headers in each mblk.
1127 		 *
1128 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1129 		 * time maxblk, its probably real file data which is
1130 		 * dominating. So its better to use sendvec_chunk because
1131 		 * performance goes to dog if we don't do pagesize reads.
1132 		 * sendvec_chunk will do pagesize reads and write them
1133 		 * in pagesize mblks to TCP.
1134 		 *
1135 		 * Side Notes: A write to file has not been optimized.
1136 		 * Future zero copy code will plugin into sendvec_chunk
1137 		 * only because doing zero copy for files smaller then
1138 		 * pagesize is useless.
1139 		 *
1140 		 * Note, if socket has NL7C enabled then call NL7C's
1141 		 * senfilev() function to give NL7C a chance to copy
1142 		 * the vec for caching, then continue processing as
1143 		 * normal.
1144 		 */
1145 		if (is_sock) {
1146 			switch (so->so_family) {
1147 			case AF_INET:
1148 			case AF_INET6:
1149 				if (so->so_nl7c_flags != 0) {
1150 					nl7c_sendfilev(so, fileoff,
1151 					    sfv, copy_cnt);
1152 				}
1153 				if (total_size <= (4 * maxblk))
1154 					error = sendvec_small_chunk(fp,
1155 					    &fileoff, sfv, copy_cnt,
1156 					    total_size, maxblk, &count);
1157 				else
1158 					error = sendvec_chunk(fp, &fileoff,
1159 					    sfv, copy_cnt, &count);
1160 				break;
1161 			case AF_NCA:
1162 				error = nca_sendfilev(fp, sfv, copy_cnt,
1163 				    &count);
1164 				break;
1165 			}
1166 		} else {
1167 			ASSERT(vp->v_type == VREG);
1168 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1169 			    &count);
1170 		}
1171 
1172 
1173 #ifdef _SYSCALL32_IMPL
1174 	if (get_udatamodel() == DATAMODEL_ILP32)
1175 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1176 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1177 	else
1178 #endif
1179 		copy_vec += copy_cnt;
1180 		sfvcnt -= copy_cnt;
1181 	} while (sfvcnt > 0);
1182 
1183 	if (vp->v_type == VREG)
1184 		fp->f_offset += count;
1185 
1186 
1187 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1188 
1189 #ifdef _SYSCALL32_IMPL
1190 	if (get_udatamodel() == DATAMODEL_ILP32) {
1191 		ssize32_t count32 = (ssize32_t)count;
1192 		if (copyout(&count32, xferred, sizeof (count32)))
1193 			error = EFAULT;
1194 		releasef(fildes);
1195 		if (error != 0)
1196 			return (set_errno(error));
1197 		return (count32);
1198 	}
1199 #endif
1200 	if (copyout(&count, xferred, sizeof (count)))
1201 		error = EFAULT;
1202 	releasef(fildes);
1203 	if (error != 0)
1204 		return (set_errno(error));
1205 	return (count);
1206 err:
1207 	ASSERT(error != 0);
1208 	releasef(fildes);
1209 	return (set_errno(error));
1210 }
1211