xref: /illumos-gate/usr/src/uts/common/syscall/sendfile.c (revision 0f1702c5201310f0529cd5abb77652e5e9b241b6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/sunddi.h>
48 #include <sys/esunddi.h>
49 #include <sys/flock.h>
50 #include <sys/modctl.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vmsystm.h>
53 
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <fs/sockfs/sockcommon.h>
57 #include <fs/sockfs/socktpi.h>
58 
59 #include <netinet/in.h>
60 #include <sys/sendfile.h>
61 #include <sys/un.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69 
70 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
71 		ssize32_t *);
72 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
73 		int, ssize_t *);
74 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
75 		boolean_t);
76 extern sotpi_info_t *sotpi_sototpi(struct sonode *);
77 
78 #define	readflg	(V_WRITELOCK_FALSE)
79 #define	rwflag	(V_WRITELOCK_TRUE)
80 
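/*
 * User-supplied sendfilevec arrays are copied in and processed in
 * chunks of at most SEND_MAX_CHUNK entries at a time.
 */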
81 #define	SEND_MAX_CHUNK	16
82 
83 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
84 /*
85  * 64 bit offsets for 32 bit applications, running on either a
86  * 64 bit or a 32 bit kernel. For 32 bit apps, we can't transfer
87  * more than 2GB of data.
88  */
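/*
 * sendvec_chunk64() transfers the data described by up to copy_cnt
 * ksendfilevec64 entries to the vnode underlying "fp", which the caller
 * has already write-locked.  SFV_FD_SELF entries are written straight
 * from the caller's address space; other entries are read from the named
 * file descriptor into a temporary kernel buffer and then written out.
 * *fileoff and *count are advanced as data is transferred.
 */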
89 int
90 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
91     int copy_cnt, ssize32_t *count)
92 {
93 	struct vnode *vp;
94 	ushort_t fflag;
95 	int ioflag;
96 	size32_t cnt;
97 	ssize32_t sfv_len;
98 	ssize32_t tmpcount;
99 	u_offset_t sfv_off;
100 	struct uio auio;
101 	struct iovec aiov;
102 	int i, error;
103 
104 	fflag = fp->f_flag;
105 	vp = fp->f_vnode;
106 	for (i = 0; i < copy_cnt; i++) {
107 
108 		if (ISSIG(curthread, JUSTLOOKING))
109 			return (EINTR);
110 
111 		/*
112 		 * Do the same checks as "write" does, since we are writing
113 		 * sfv_len bytes into "vp".
114 		 */
115 		sfv_len = (ssize32_t)sfv->sfv_len;
116 
117 		if (sfv_len == 0) {
118 			sfv++;
119 			continue;
120 		}
121 
122 		if (sfv_len < 0)
123 			return (EINVAL);
124 
125 		if (vp->v_type == VREG) {
126 			if (*fileoff >= curproc->p_fsz_ctl) {
127 				mutex_enter(&curproc->p_lock);
128 				(void) rctl_action(
129 				    rctlproc_legacy[RLIMIT_FSIZE],
130 				    curproc->p_rctls, curproc, RCA_SAFE);
131 				mutex_exit(&curproc->p_lock);
132 				return (EFBIG);
133 			}
134 
135 			if (*fileoff >= OFFSET_MAX(fp))
136 				return (EFBIG);
137 
138 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
139 				return (EINVAL);
140 		}
141 
142 		tmpcount = *count + sfv_len;
143 		if (tmpcount < 0)
144 			return (EINVAL);
145 
146 		sfv_off = sfv->sfv_off;
147 
148 		auio.uio_extflg = UIO_COPY_DEFAULT;
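		/*
		 * SFV_FD_SELF means the data for this entry lives in the
		 * caller's address space at sfv_off rather than in a file.
		 */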
149 		if (sfv->sfv_fd == SFV_FD_SELF) {
150 			aiov.iov_len = sfv_len;
151 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
152 			auio.uio_loffset = *fileoff;
153 			auio.uio_iovcnt = 1;
154 			auio.uio_resid = sfv_len;
155 			auio.uio_iov = &aiov;
156 			auio.uio_segflg = UIO_USERSPACE;
157 			auio.uio_llimit = curproc->p_fsz_ctl;
158 			auio.uio_fmode = fflag;
159 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
160 			while (sfv_len > 0) {
161 				error = VOP_WRITE(vp, &auio, ioflag,
162 				    fp->f_cred, NULL);
163 				cnt = sfv_len - auio.uio_resid;
164 				sfv_len -= cnt;
165 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
166 				if (vp->v_type == VREG)
167 					*fileoff += cnt;
168 				*count += cnt;
169 				if (error != 0)
170 					return (error);
171 			}
172 		} else {
173 			file_t	*ffp;
174 			vnode_t	*readvp;
175 			size_t	size;
176 			caddr_t	ptr;
177 
178 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
179 				return (EBADF);
180 
181 			if ((ffp->f_flag & FREAD) == 0) {
182 				releasef(sfv->sfv_fd);
183 				return (EBADF);
184 			}
185 
186 			readvp = ffp->f_vnode;
187 			if (readvp->v_type != VREG) {
188 				releasef(sfv->sfv_fd);
189 				return (EINVAL);
190 			}
191 
192 			/*
193 			 * There is no point in reading from and writing to
194 			 * the same vp when both are regular files. readvp is
195 			 * not locked, but since we got it from an open file
196 			 * the contents will stay valid while we access them.
197 			 */
198 			if (vn_compare(vp, readvp)) {
199 				releasef(sfv->sfv_fd);
200 				return (EINVAL);
201 			}
202 
203 			/*
204 			 * Note: we assume readvp != vp. "vp" is already
205 			 * locked, and "readvp" must not be.
206 			 */
207 			(void) VOP_RWLOCK(readvp, readflg, NULL);
208 
209 			/*
210 			 * Same checks as in pread64.
211 			 */
212 			if (sfv_off > MAXOFFSET_T) {
213 				VOP_RWUNLOCK(readvp, readflg, NULL);
214 				releasef(sfv->sfv_fd);
215 				return (EINVAL);
216 			}
217 
218 			if (sfv_off + sfv_len > MAXOFFSET_T)
219 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
220 
221 			/* Find the native blocksize to transfer data */
222 			size = MIN(vp->v_vfsp->vfs_bsize,
223 			    readvp->v_vfsp->vfs_bsize);
224 			size = sfv_len < size ? sfv_len : size;
225 			ptr = kmem_alloc(size, KM_SLEEP);
226 
227 			while (sfv_len > 0) {
228 				size_t	iov_len;
229 
230 				iov_len = MIN(size, sfv_len);
231 				aiov.iov_base = ptr;
232 				aiov.iov_len = iov_len;
233 				auio.uio_loffset = sfv_off;
234 				auio.uio_iov = &aiov;
235 				auio.uio_iovcnt = 1;
236 				auio.uio_resid = iov_len;
237 				auio.uio_segflg = UIO_SYSSPACE;
238 				auio.uio_llimit = MAXOFFSET_T;
239 				auio.uio_fmode = ffp->f_flag;
240 				ioflag = auio.uio_fmode &
241 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
242 
243 				/*
244 				 * If read sync is not asked for,
245 				 * filter sync flags
246 				 */
247 				if ((ioflag & FRSYNC) == 0)
248 					ioflag &= ~(FSYNC|FDSYNC);
249 				error = VOP_READ(readvp, &auio, ioflag,
250 				    fp->f_cred, NULL);
251 				if (error) {
252 					kmem_free(ptr, size);
253 					VOP_RWUNLOCK(readvp, readflg, NULL);
254 					releasef(sfv->sfv_fd);
255 					return (error);
256 				}
257 
258 				/*
259 				 * Check how much data was really read.
260 				 * Decrement the 'len' and increment the
261 				 * 'off' appropriately.
262 				 */
263 				cnt = iov_len - auio.uio_resid;
264 				if (cnt == 0) {
265 					/*
266 					 * If we were reading a pipe (currently
267 					 * not implemented), we may now lose
268 					 * data.
269 					 */
270 					kmem_free(ptr, size);
271 					VOP_RWUNLOCK(readvp, readflg, NULL);
272 					releasef(sfv->sfv_fd);
273 					return (EINVAL);
274 				}
275 				sfv_len -= cnt;
276 				sfv_off += cnt;
277 
278 				aiov.iov_base = ptr;
279 				aiov.iov_len = cnt;
280 				auio.uio_loffset = *fileoff;
281 				auio.uio_iov = &aiov;
282 				auio.uio_iovcnt = 1;
283 				auio.uio_resid = cnt;
284 				auio.uio_segflg = UIO_SYSSPACE;
285 				auio.uio_llimit = curproc->p_fsz_ctl;
286 				auio.uio_fmode = fflag;
287 				ioflag = auio.uio_fmode &
288 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
289 				error = VOP_WRITE(vp, &auio, ioflag,
290 				    fp->f_cred, NULL);
291 
292 				/*
293 				 * Check how much data was written. Increment
294 				 * the 'len' and decrement the 'off' if all
295 				 * the data was not written.
296 				 */
297 				cnt -= auio.uio_resid;
298 				sfv_len += auio.uio_resid;
299 				sfv_off -= auio.uio_resid;
300 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
301 				if (vp->v_type == VREG)
302 					*fileoff += cnt;
303 				*count += cnt;
304 				if (error != 0) {
305 					kmem_free(ptr, size);
306 					VOP_RWUNLOCK(readvp, readflg, NULL);
307 					releasef(sfv->sfv_fd);
308 					return (error);
309 				}
310 			}
311 			VOP_RWUNLOCK(readvp, readflg, NULL);
312 			releasef(sfv->sfv_fd);
313 			kmem_free(ptr, size);
314 		}
315 		sfv++;
316 	}
317 	return (0);
318 }
319 
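/*
 * sendvec64() is the SENDFILEV64 entry point for 32-bit callers.  It
 * write-locks the target vnode, copies the vector in from userland in
 * SEND_MAX_CHUNK-sized chunks, hands the regular-file-over-socket case
 * to sosendfile64(), and passes everything else to sendvec_chunk64().
 */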
320 ssize32_t
321 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
322 	size32_t *xferred, int fildes)
323 {
324 	u_offset_t		fileoff;
325 	int			copy_cnt;
326 	const struct ksendfilevec64 *copy_vec;
327 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
328 	struct vnode *vp;
329 	int error;
330 	ssize32_t count = 0;
331 
332 	vp = fp->f_vnode;
333 	(void) VOP_RWLOCK(vp, rwflag, NULL);
334 
335 	copy_vec = vec;
336 	fileoff = fp->f_offset;
337 
338 	do {
339 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
340 		if (copyin(copy_vec, sfv, copy_cnt *
341 		    sizeof (struct ksendfilevec64))) {
342 			error = EFAULT;
343 			break;
344 		}
345 
346 		/*
347 		 * Optimize the case of a regular file being
348 		 * sent over a socket.
349 		 */
350 		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
351 			file_t *rfp;
352 			vnode_t *rvp;
353 
354 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
355 				error = EBADF;
356 				break;
357 			}
358 			if ((rfp->f_flag & FREAD) == 0) {
359 				releasef(sfv->sfv_fd);
360 				error = EBADF;
361 				break;
362 			}
363 			rvp = rfp->f_vnode;
364 			if (rvp->v_type == VREG) {
365 				error = sosendfile64(fp, rfp, sfv, &count);
366 				if (error)
367 					break;
368 				copy_vec++;
369 				sfvcnt--;
370 				continue;
371 			}
372 			releasef(sfv->sfv_fd);
373 		}
374 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
375 		if (error != 0)
376 			break;
377 
378 		copy_vec += copy_cnt;
379 		sfvcnt -= copy_cnt;
380 	} while (sfvcnt > 0);
381 
382 	if (vp->v_type == VREG)
383 		fp->f_offset += count;
384 
385 	VOP_RWUNLOCK(vp, rwflag, NULL);
386 	if (copyout(&count, xferred, sizeof (count)))
387 		error = EFAULT;
388 	releasef(fildes);
389 	if (error != 0)
390 		return (set_errno(error));
391 	return (count);
392 }
393 #endif
394 
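/*
 * sendvec_small_chunk() handles small transfers to a socket.  The entire
 * request is gathered into a chain of mblks of at most maxblk bytes each,
 * with wroff bytes of headroom and tail_len bytes of trailer space, and
 * the chain is handed to the transport with a single socket_sendmblk()
 * call to keep latency down.
 */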
395 int
396 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
397     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
398 {
399 	struct vnode *vp;
400 	struct uio auio;
401 	struct iovec aiov;
402 	ushort_t fflag;
403 	int ioflag;
404 	int i, error;
405 	size_t cnt;
406 	ssize_t sfv_len;
407 	u_offset_t sfv_off;
408 #ifdef _SYSCALL32_IMPL
409 	model_t model = get_udatamodel();
410 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
411 	    MAXOFF32_T : MAXOFFSET_T;
412 #else
413 	const u_offset_t maxoff = MAXOFF32_T;
414 #endif
415 	mblk_t *dmp = NULL;
416 	int wroff;
417 	int buf_left = 0;
418 	size_t	iov_len;
419 	mblk_t  *head, *tmp;
420 	size_t  size = total_size;
421 	size_t  extra;
422 	int tail_len;
423 	struct nmsghdr msg;
424 
425 	fflag = fp->f_flag;
426 	vp = fp->f_vnode;
427 
428 	ASSERT(vp->v_type == VSOCK);
429 	ASSERT(maxblk > 0);
430 
431 	/* If nothing to send, return */
432 	if (total_size == 0)
433 		return (0);
434 
435 	if (vp->v_stream != NULL) {
436 		wroff = (int)vp->v_stream->sd_wroff;
437 		tail_len = (int)vp->v_stream->sd_tail;
438 	} else {
439 		struct sonode *so;
440 
441 		so = VTOSO(vp);
442 		wroff = so->so_proto_props.sopp_wroff;
443 		tail_len = so->so_proto_props.sopp_tail;
444 	}
445 
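	/* Leave room in each mblk for the transport's header and trailer. */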
446 	extra = wroff + tail_len;
447 
448 	buf_left = MIN(total_size, maxblk);
449 	head = dmp = allocb(buf_left + extra, BPRI_HI);
450 	if (head == NULL)
451 		return (ENOMEM);
452 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
453 	bzero(&msg, sizeof (msg));
454 
455 	auio.uio_extflg = UIO_COPY_DEFAULT;
456 	for (i = 0; i < copy_cnt; i++) {
457 		if (ISSIG(curthread, JUSTLOOKING)) {
458 			freemsg(head);
459 			return (EINTR);
460 		}
461 
462 		/*
463 		 * Do the same checks as "write" does, since we are writing
464 		 * sfv_len bytes into "vp".
465 		 */
466 		sfv_len = (ssize_t)sfv->sfv_len;
467 
468 		if (sfv_len == 0) {
469 			sfv++;
470 			continue;
471 		}
472 
473 		/* Check for overflow */
474 #ifdef _SYSCALL32_IMPL
475 		if (model == DATAMODEL_ILP32) {
476 			if (((ssize32_t)(*count + sfv_len)) < 0) {
477 				freemsg(head);
478 				return (EINVAL);
479 			}
480 		} else
481 #endif
482 		if ((*count + sfv_len) < 0) {
483 			freemsg(head);
484 			return (EINVAL);
485 		}
486 
487 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
488 
489 		if (sfv->sfv_fd == SFV_FD_SELF) {
490 			while (sfv_len > 0) {
491 				if (buf_left == 0) {
492 					tmp = dmp;
493 					buf_left = MIN(total_size, maxblk);
494 					iov_len = MIN(buf_left, sfv_len);
495 					dmp = allocb(buf_left + extra, BPRI_HI);
496 					if (dmp == NULL) {
497 						freemsg(head);
498 						return (ENOMEM);
499 					}
500 					dmp->b_wptr = dmp->b_rptr =
501 					    dmp->b_rptr + wroff;
502 					tmp->b_cont = dmp;
503 				} else {
504 					iov_len = MIN(buf_left, sfv_len);
505 				}
506 
507 				aiov.iov_len = iov_len;
508 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
509 				auio.uio_loffset = *fileoff;
510 				auio.uio_iovcnt = 1;
511 				auio.uio_resid = iov_len;
512 				auio.uio_iov = &aiov;
513 				auio.uio_segflg = UIO_USERSPACE;
514 				auio.uio_llimit = curproc->p_fsz_ctl;
515 				auio.uio_fmode = fflag;
516 
517 				buf_left -= iov_len;
518 				total_size -= iov_len;
519 				sfv_len -= iov_len;
520 				sfv_off += iov_len;
521 
522 				error = uiomove((caddr_t)dmp->b_wptr,
523 				    iov_len, UIO_WRITE, &auio);
524 				if (error != 0) {
525 					freemsg(head);
526 					return (error);
527 				}
528 				dmp->b_wptr += iov_len;
529 			}
530 		} else {
531 			file_t	*ffp;
532 			vnode_t	*readvp;
533 
534 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
535 				freemsg(head);
536 				return (EBADF);
537 			}
538 
539 			if ((ffp->f_flag & FREAD) == 0) {
540 				releasef(sfv->sfv_fd);
541 				freemsg(head);
542 				return (EACCES);
543 			}
544 
545 			readvp = ffp->f_vnode;
546 			if (readvp->v_type != VREG) {
547 				releasef(sfv->sfv_fd);
548 				freemsg(head);
549 				return (EINVAL);
550 			}
551 
552 			/*
553 			 * There is no point in reading from and writing to
554 			 * the same vp when both are regular files. readvp is
555 			 * not locked, but since we got it from an open file
556 			 * the contents will stay valid while we access them.
557 			 */
558 
559 			if (vn_compare(vp, readvp)) {
560 				releasef(sfv->sfv_fd);
561 				freemsg(head);
562 				return (EINVAL);
563 			}
564 
565 			/*
566 			 * Note: we assume readvp != vp. "vp" is already
567 			 * locked, and "readvp" must not be.
568 			 */
569 
570 			(void) VOP_RWLOCK(readvp, readflg, NULL);
571 
572 			/* Same checks as in pread */
573 			if (sfv_off > maxoff) {
574 				VOP_RWUNLOCK(readvp, readflg, NULL);
575 				releasef(sfv->sfv_fd);
576 				freemsg(head);
577 				return (EINVAL);
578 			}
579 			if (sfv_off + sfv_len > maxoff) {
580 				total_size -= (sfv_off + sfv_len - maxoff);
581 				sfv_len = (ssize_t)((offset_t)maxoff -
582 				    sfv_off);
583 			}
584 
585 			while (sfv_len > 0) {
586 				if (buf_left == 0) {
587 					tmp = dmp;
588 					buf_left = MIN(total_size, maxblk);
589 					iov_len = MIN(buf_left, sfv_len);
590 					dmp = allocb(buf_left + extra, BPRI_HI);
591 					if (dmp == NULL) {
592 						VOP_RWUNLOCK(readvp, readflg,
593 						    NULL);
594 						releasef(sfv->sfv_fd);
595 						freemsg(head);
596 						return (ENOMEM);
597 					}
598 					dmp->b_wptr = dmp->b_rptr =
599 					    dmp->b_rptr + wroff;
600 					tmp->b_cont = dmp;
601 				} else {
602 					iov_len = MIN(buf_left, sfv_len);
603 				}
604 				aiov.iov_base = (caddr_t)dmp->b_wptr;
605 				aiov.iov_len = iov_len;
606 				auio.uio_loffset = sfv_off;
607 				auio.uio_iov = &aiov;
608 				auio.uio_iovcnt = 1;
609 				auio.uio_resid = iov_len;
610 				auio.uio_segflg = UIO_SYSSPACE;
611 				auio.uio_llimit = MAXOFFSET_T;
612 				auio.uio_fmode = ffp->f_flag;
613 				ioflag = auio.uio_fmode &
614 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
615 
616 				/*
617 				 * If read sync is not asked for,
618 				 * filter sync flags
619 				 */
620 				if ((ioflag & FRSYNC) == 0)
621 					ioflag &= ~(FSYNC|FDSYNC);
622 				error = VOP_READ(readvp, &auio, ioflag,
623 				    fp->f_cred, NULL);
624 				if (error != 0) {
625 					/*
626 					 * If we were reading a pipe (currently
627 					 * not implemented), we may now lose
628 					 * data.
629 					 */
630 					VOP_RWUNLOCK(readvp, readflg, NULL);
631 					releasef(sfv->sfv_fd);
632 					freemsg(head);
633 					return (error);
634 				}
635 
636 				/*
637 				 * Check how much data was really read.
638 				 * Decrement the 'len' and increment the
639 				 * 'off' appropriately.
640 				 */
641 				cnt = iov_len - auio.uio_resid;
642 				if (cnt == 0) {
643 					VOP_RWUNLOCK(readvp, readflg, NULL);
644 					releasef(sfv->sfv_fd);
645 					freemsg(head);
646 					return (EINVAL);
647 				}
648 				sfv_len -= cnt;
649 				sfv_off += cnt;
650 				total_size -= cnt;
651 				buf_left -= cnt;
652 
653 				dmp->b_wptr += cnt;
654 			}
655 			VOP_RWUNLOCK(readvp, readflg, NULL);
656 			releasef(sfv->sfv_fd);
657 		}
658 		sfv++;
659 	}
660 
661 	ASSERT(total_size == 0);
662 	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
663 	if (error != 0) {
664 		if (head != NULL)
665 			freemsg(head);
666 		return (error);
667 	}
668 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
669 	*count += size;
670 
671 	return (0);
672 }
673 
674 
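/*
 * sendvec_chunk() handles writes to regular files and larger socket
 * transfers.  File data is read in chunks of the underlying filesystem
 * block size and either written out with VOP_WRITE() or sent down the
 * socket one mblk at a time; when the socket supports zero-copy and the
 * source file permits it, snf_segmap() maps the file pages directly
 * instead of copying them.
 */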
675 int
676 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
677     int copy_cnt, ssize_t *count)
678 {
679 	struct vnode *vp;
680 	struct uio auio;
681 	struct iovec aiov;
682 	ushort_t fflag;
683 	int ioflag;
684 	int i, error;
685 	size_t cnt;
686 	ssize_t sfv_len;
687 	u_offset_t sfv_off;
688 #ifdef _SYSCALL32_IMPL
689 	model_t model = get_udatamodel();
690 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
691 	    MAXOFF32_T : MAXOFFSET_T;
692 #else
693 	const u_offset_t maxoff = MAXOFF32_T;
694 #endif
695 	mblk_t	*dmp = NULL;
696 	char	*buf = NULL;
697 	size_t  extra;
698 	int maxblk, wroff, tail_len;
699 	struct sonode *so;
700 	stdata_t *stp;
701 	struct nmsghdr msg;
702 
703 	fflag = fp->f_flag;
704 	vp = fp->f_vnode;
705 
706 	if (vp->v_type == VSOCK) {
707 		so = VTOSO(vp);
708 		if (vp->v_stream != NULL) {
709 			stp = vp->v_stream;
710 			wroff = (int)stp->sd_wroff;
711 			tail_len = (int)stp->sd_tail;
712 			maxblk = (int)stp->sd_maxblk;
713 		} else {
714 			stp = NULL;
715 			wroff = so->so_proto_props.sopp_wroff;
716 			tail_len = so->so_proto_props.sopp_tail;
717 			maxblk = so->so_proto_props.sopp_maxblk;
718 		}
719 		extra = wroff + tail_len;
720 	}
721 
722 	bzero(&msg, sizeof (msg));
723 	auio.uio_extflg = UIO_COPY_DEFAULT;
724 	for (i = 0; i < copy_cnt; i++) {
725 		if (ISSIG(curthread, JUSTLOOKING))
726 			return (EINTR);
727 
728 		/*
729 		 * Do the same checks as "write" does, since we are writing
730 		 * sfv_len bytes into "vp".
731 		 */
732 		sfv_len = (ssize_t)sfv->sfv_len;
733 
734 		if (sfv_len == 0) {
735 			sfv++;
736 			continue;
737 		}
738 
739 		if (vp->v_type == VREG) {
740 			if (*fileoff >= curproc->p_fsz_ctl) {
741 				mutex_enter(&curproc->p_lock);
742 				(void) rctl_action(
743 				    rctlproc_legacy[RLIMIT_FSIZE],
744 				    curproc->p_rctls, curproc, RCA_SAFE);
745 				mutex_exit(&curproc->p_lock);
746 
747 				return (EFBIG);
748 			}
749 
750 			if (*fileoff >= maxoff)
751 				return (EFBIG);
752 
753 			if (*fileoff + sfv_len > maxoff)
754 				return (EINVAL);
755 		}
756 
757 		/* Check for overflow */
758 #ifdef _SYSCALL32_IMPL
759 		if (model == DATAMODEL_ILP32) {
760 			if (((ssize32_t)(*count + sfv_len)) < 0)
761 				return (EINVAL);
762 		} else
763 #endif
764 		if ((*count + sfv_len) < 0)
765 			return (EINVAL);
766 
767 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
768 
769 		if (sfv->sfv_fd == SFV_FD_SELF) {
770 			if (vp->v_type == VSOCK) {
771 				while (sfv_len > 0) {
772 					size_t iov_len;
773 
774 					iov_len = sfv_len;
775 					if (!SOCK_IS_NONSTR(so) &&
776 					    SOTOTPI(so)->sti_kssl_ctx != NULL)
777 						iov_len = MIN(iov_len, maxblk);
778 
779 					aiov.iov_len = iov_len;
780 					aiov.iov_base =
781 					    (caddr_t)(uintptr_t)sfv_off;
782 
783 					auio.uio_iov = &aiov;
784 					auio.uio_iovcnt = 1;
785 					auio.uio_loffset = *fileoff;
786 					auio.uio_segflg = UIO_USERSPACE;
787 					auio.uio_fmode = fflag;
788 					auio.uio_llimit = curproc->p_fsz_ctl;
789 					auio.uio_resid = iov_len;
790 
791 					dmp = allocb(iov_len + extra, BPRI_HI);
792 					if (dmp == NULL)
793 						return (ENOMEM);
794 					dmp->b_wptr = dmp->b_rptr =
795 					    dmp->b_rptr + wroff;
796 					error = uiomove((caddr_t)dmp->b_wptr,
797 					    iov_len, UIO_WRITE, &auio);
798 					if (error != 0) {
799 						freeb(dmp);
800 						return (error);
801 					}
802 					dmp->b_wptr += iov_len;
803 					error = socket_sendmblk(VTOSO(vp),
804 					    &msg, fflag, CRED(), &dmp);
805 
806 					if (error != 0) {
807 						if (dmp != NULL)
808 							freeb(dmp);
809 						return (error);
810 					}
811 					ttolwp(curthread)->lwp_ru.ioch +=
812 					    (ulong_t)iov_len;
813 					*count += iov_len;
814 					sfv_len -= iov_len;
815 					sfv_off += iov_len;
816 				}
817 			} else {
818 				ttolwp(curthread)->lwp_ru.ioch +=
819 				    (ulong_t)sfv_len;
820 				*count += sfv_len;
821 				aiov.iov_len = sfv_len;
822 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
823 
824 				auio.uio_iov = &aiov;
825 				auio.uio_iovcnt = 1;
826 				auio.uio_loffset = *fileoff;
827 				auio.uio_segflg = UIO_USERSPACE;
828 				auio.uio_fmode = fflag;
829 				auio.uio_llimit = curproc->p_fsz_ctl;
830 				auio.uio_resid = sfv_len;
831 
832 				ioflag = auio.uio_fmode &
833 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
834 				while (sfv_len > 0) {
835 					error = VOP_WRITE(vp, &auio, ioflag,
836 					    fp->f_cred, NULL);
837 					cnt = sfv_len - auio.uio_resid;
838 					sfv_len -= cnt;
839 					ttolwp(curthread)->lwp_ru.ioch +=
840 					    (ulong_t)cnt;
841 					*fileoff += cnt;
842 					*count += cnt;
843 					if (error != 0)
844 						return (error);
845 				}
846 			}
847 		} else {
848 			int segmapit = 0;
849 			file_t	*ffp;
850 			vnode_t	*readvp;
851 			struct vnode *realvp;
852 			size_t	size;
853 			caddr_t	ptr;
854 
855 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
856 				return (EBADF);
857 
858 			if ((ffp->f_flag & FREAD) == 0) {
859 				releasef(sfv->sfv_fd);
860 				return (EBADF);
861 			}
862 
863 			readvp = ffp->f_vnode;
864 			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
865 				readvp = realvp;
866 			if (readvp->v_type != VREG) {
867 				releasef(sfv->sfv_fd);
868 				return (EINVAL);
869 			}
870 
871 			/*
872 			 * There is no point in reading from and writing to
873 			 * the same vp when both are regular files. readvp is
874 			 * not locked, but since we got it from an open file
875 			 * the contents will stay valid while we access them.
876 			 */
877 			if (vn_compare(vp, readvp)) {
878 				releasef(sfv->sfv_fd);
879 				return (EINVAL);
880 			}
881 
882 			/*
883 			 * Note: we assume readvp != vp. "vp" is already
884 			 * locked, and "readvp" must not be.
885 			 */
886 			(void) VOP_RWLOCK(readvp, readflg, NULL);
887 
888 			/* Same checks as in pread */
889 			if (sfv_off > maxoff) {
890 				VOP_RWUNLOCK(readvp, readflg, NULL);
891 				releasef(sfv->sfv_fd);
892 				return (EINVAL);
893 			}
894 			if (sfv_off + sfv_len > maxoff) {
895 				sfv_len = (ssize_t)((offset_t)maxoff -
896 				    sfv_off);
897 			}
898 			/* Find the native blocksize to transfer data */
899 			size = MIN(vp->v_vfsp->vfs_bsize,
900 			    readvp->v_vfsp->vfs_bsize);
901 			size = sfv_len < size ? sfv_len : size;
902 
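			/*
			 * A non-socket target always copies through a kernel
			 * buffer.  For sockets we prefer zero-copy: if the
			 * checks below allow it, snf_segmap() is used to map
			 * the source file's pages instead of copying them.
			 */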
903 			if (vp->v_type != VSOCK) {
904 				segmapit = 0;
905 				buf = kmem_alloc(size, KM_NOSLEEP);
906 				if (buf == NULL) {
907 					VOP_RWUNLOCK(readvp, readflg, NULL);
908 					releasef(sfv->sfv_fd);
909 					return (ENOMEM);
910 				}
911 			} else {
912 				uint_t	copyflag;
913 
914 				copyflag = stp != NULL ? stp->sd_copyflag :
915 				    so->so_proto_props.sopp_zcopyflag;
916 				/*
917 				 * For sockets acting as an SSL proxy, we
918 				 * need to adjust the size to the maximum
919 				 * SSL record size set in the stream head.
920 				 */
921 				if (!SOCK_IS_NONSTR(so) &&
922 				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
923 					size = MIN(size, maxblk);
924 
925 				if (vn_has_flocks(readvp) ||
926 				    readvp->v_flag & VNOMAP ||
927 				    copyflag & STZCVMUNSAFE) {
928 					segmapit = 0;
929 				} else if (copyflag & STZCVMSAFE) {
930 					segmapit = 1;
931 				} else {
932 					int on = 1;
933 					if (socket_setsockopt(VTOSO(vp),
934 					    SOL_SOCKET, SO_SND_COPYAVOID,
935 					    &on, sizeof (on), CRED()) == 0)
936 						segmapit = 1;
937 				}
938 			}
939 
940 			if (segmapit) {
941 				boolean_t nowait;
942 
943 				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
944 				error = snf_segmap(fp, readvp, sfv_off,
945 				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
946 				    nowait);
947 				releasef(sfv->sfv_fd);
948 				*count += cnt;
949 				if (error)
950 					return (error);
951 				sfv++;
952 				continue;
953 			}
954 
955 			while (sfv_len > 0) {
956 				size_t	iov_len;
957 
958 				iov_len = MIN(size, sfv_len);
959 
960 				if (vp->v_type == VSOCK) {
961 					dmp = allocb(iov_len + extra, BPRI_HI);
962 					if (dmp == NULL) {
963 						VOP_RWUNLOCK(readvp, readflg,
964 						    NULL);
965 						releasef(sfv->sfv_fd);
966 						return (ENOMEM);
967 					}
968 					dmp->b_wptr = dmp->b_rptr =
969 					    dmp->b_rptr + wroff;
970 					ptr = (caddr_t)dmp->b_rptr;
971 				} else {
972 					ptr = buf;
973 				}
974 
975 				aiov.iov_base = ptr;
976 				aiov.iov_len = iov_len;
977 				auio.uio_loffset = sfv_off;
978 				auio.uio_iov = &aiov;
979 				auio.uio_iovcnt = 1;
980 				auio.uio_resid = iov_len;
981 				auio.uio_segflg = UIO_SYSSPACE;
982 				auio.uio_llimit = MAXOFFSET_T;
983 				auio.uio_fmode = ffp->f_flag;
984 				ioflag = auio.uio_fmode &
985 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
986 
987 				/*
988 				 * If read sync is not asked for,
989 				 * filter sync flags
990 				 */
991 				if ((ioflag & FRSYNC) == 0)
992 					ioflag &= ~(FSYNC|FDSYNC);
993 				error = VOP_READ(readvp, &auio, ioflag,
994 				    fp->f_cred, NULL);
995 				if (error != 0) {
996 					/*
997 					 * If we were reading a pipe (currently
998 					 * not implemented), we may now lose
999 					 * data.
1000 					 */
1001 					if (vp->v_type == VSOCK)
1002 						freeb(dmp);
1003 					else
1004 						kmem_free(buf, size);
1005 					VOP_RWUNLOCK(readvp, readflg, NULL);
1006 					releasef(sfv->sfv_fd);
1007 					return (error);
1008 				}
1009 
1010 				/*
1011 				 * Check how much data was really read.
1012 				 * Decrement the 'len' and increment the
1013 				 * 'off' appropriately.
1014 				 */
1015 				cnt = iov_len - auio.uio_resid;
1016 				if (cnt == 0) {
1017 					if (vp->v_type == VSOCK)
1018 						freeb(dmp);
1019 					else
1020 						kmem_free(buf, size);
1021 					VOP_RWUNLOCK(readvp, readflg, NULL);
1022 					releasef(sfv->sfv_fd);
1023 					return (EINVAL);
1024 				}
1025 				sfv_len -= cnt;
1026 				sfv_off += cnt;
1027 
1028 				if (vp->v_type == VSOCK) {
1029 					dmp->b_wptr = dmp->b_rptr + cnt;
1030 
1031 					error = socket_sendmblk(VTOSO(vp),
1032 					    &msg, fflag, CRED(), &dmp);
1033 
1034 					if (error != 0) {
1035 						if (dmp != NULL)
1036 							freeb(dmp);
1037 						VOP_RWUNLOCK(readvp, readflg,
1038 						    NULL);
1039 						releasef(sfv->sfv_fd);
1040 						return (error);
1041 					}
1042 
1043 					ttolwp(curthread)->lwp_ru.ioch +=
1044 					    (ulong_t)cnt;
1045 					*count += cnt;
1046 				} else {
1047 
1048 					aiov.iov_base = ptr;
1049 					aiov.iov_len = cnt;
1050 					auio.uio_loffset = *fileoff;
1051 					auio.uio_resid = cnt;
1052 					auio.uio_iov = &aiov;
1053 					auio.uio_iovcnt = 1;
1054 					auio.uio_segflg = UIO_SYSSPACE;
1055 					auio.uio_llimit = curproc->p_fsz_ctl;
1056 					auio.uio_fmode = fflag;
1057 					ioflag = auio.uio_fmode &
1058 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1059 					error = VOP_WRITE(vp, &auio, ioflag,
1060 					    fp->f_cred, NULL);
1061 
1062 					/*
1063 					 * Check how much data was written.
1064 					 * Increment the 'len' and decrement the
1065 					 * 'off' if all the data was not
1066 					 * written.
1067 					 */
1068 					cnt -= auio.uio_resid;
1069 					sfv_len += auio.uio_resid;
1070 					sfv_off -= auio.uio_resid;
1071 					ttolwp(curthread)->lwp_ru.ioch +=
1072 					    (ulong_t)cnt;
1073 					*fileoff += cnt;
1074 					*count += cnt;
1075 					if (error != 0) {
1076 						kmem_free(buf, size);
1077 						VOP_RWUNLOCK(readvp, readflg,
1078 						    NULL);
1079 						releasef(sfv->sfv_fd);
1080 						return (error);
1081 					}
1082 				}
1083 			}
1084 			if (buf) {
1085 				kmem_free(buf, size);
1086 				buf = NULL;
1087 			}
1088 			VOP_RWUNLOCK(readvp, readflg, NULL);
1089 			releasef(sfv->sfv_fd);
1090 		}
1091 		sfv++;
1092 	}
1093 	return (0);
1094 }
1095 
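/*
 * sendfilev() is the handler for the sendfilev system call, the kernel
 * side of the sendfilev(3EXT) library interface.  It validates the
 * vector, then dispatches each chunk of entries to nl7c_sendfilev(),
 * sendvec_small_chunk() or sendvec_chunk() as described in the block
 * comment inside the loop below.
 *
 * A rough sketch of a userland call (illustrative only; the buffer and
 * descriptor names below are placeholders, not taken from this file):
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;	(header taken from user memory)
 *	vec[0].sfv_flag = 0;
 *	vec[0].sfv_off = (off_t)(intptr_t)hdrbuf;
 *	vec[0].sfv_len = hdrlen;
 *
 *	vec[1].sfv_fd = filefd;		(body read from a regular file)
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = filesize;
 *
 *	ssize_t n = sendfilev(sockfd, vec, 2, &xferred);
 */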
1096 ssize_t
1097 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1098     size_t *xferred)
1099 {
1100 	int error = 0;
1101 	int first_vector_error = 0;
1102 	file_t *fp;
1103 	struct vnode *vp;
1104 	struct sonode *so;
1105 	u_offset_t fileoff;
1106 	int copy_cnt;
1107 	const struct sendfilevec *copy_vec;
1108 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1109 	ssize_t count = 0;
1110 #ifdef _SYSCALL32_IMPL
1111 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1112 #endif
1113 	ssize_t total_size;
1114 	int i;
1115 	boolean_t is_sock = B_FALSE;
1116 	int maxblk = 0;
1117 
1118 	if (sfvcnt <= 0)
1119 		return (set_errno(EINVAL));
1120 
1121 	if ((fp = getf(fildes)) == NULL)
1122 		return (set_errno(EBADF));
1123 
1124 	if (((fp->f_flag) & FWRITE) == 0) {
1125 		error = EBADF;
1126 		goto err;
1127 	}
1128 
1129 	fileoff = fp->f_offset;
1130 	vp = fp->f_vnode;
1131 
1132 	switch (vp->v_type) {
1133 	case VSOCK:
1134 		so = VTOSO(vp);
1135 		is_sock = B_TRUE;
1136 		if (SOCK_IS_NONSTR(so)) {
1137 			maxblk = so->so_proto_props.sopp_maxblk;
1138 		} else {
1139 			maxblk = (int)vp->v_stream->sd_maxblk;
1140 		}
1141 		break;
1142 	case VREG:
1143 		break;
1144 	default:
1145 		error = EINVAL;
1146 		goto err;
1147 	}
1148 
1149 	switch (opcode) {
1150 	case SENDFILEV :
1151 		break;
1152 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1153 	case SENDFILEV64 :
1154 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1155 		    (size32_t *)xferred, fildes));
1156 #endif
1157 	default :
1158 		error = ENOSYS;
1159 		break;
1160 	}
1161 
1162 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1163 	copy_vec = vec;
1164 
1165 	do {
1166 		total_size = 0;
1167 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1168 #ifdef _SYSCALL32_IMPL
1169 		/* 32-bit callers need their sendfilevec entries expanded. */
1170 		if (get_udatamodel() == DATAMODEL_ILP32) {
1171 			if (copyin(copy_vec, sfv32,
1172 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1173 				error = EFAULT;
1174 				break;
1175 			}
1176 
1177 			for (i = 0; i < copy_cnt; i++) {
1178 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1179 				sfv[i].sfv_off =
1180 				    (off_t)(uint32_t)sfv32[i].sfv_off;
1181 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1182 				total_size += sfv[i].sfv_len;
1183 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1184 				/*
1185 				 * Individual elements of the vector must not
1186 				 * wrap or overflow, as later math is signed.
1187 				 * Equally total_size needs to be checked after
1188 				 * each vector is added in, to be sure that
1189 				 * rogue values haven't overflowed the counter.
1190 				 */
1191 				if (((ssize32_t)sfv[i].sfv_len < 0) ||
1192 				    ((ssize32_t)total_size < 0)) {
1193 					/*
1194 					 * Truncate the vector to send data
1195 					 * described by elements before the
1196 					 * error.
1197 					 */
1198 					copy_cnt = i;
1199 					first_vector_error = EINVAL;
1200 					/* total_size can't be trusted */
1201 					if ((ssize32_t)total_size < 0)
1202 						error = EINVAL;
1203 					break;
1204 				}
1205 			}
1206 			/* Nothing to do, process errors */
1207 			if (copy_cnt == 0)
1208 				break;
1209 
1210 		} else {
1211 #endif
1212 			if (copyin(copy_vec, sfv,
1213 			    copy_cnt * sizeof (sendfilevec_t))) {
1214 				error = EFAULT;
1215 				break;
1216 			}
1217 
1218 			for (i = 0; i < copy_cnt; i++) {
1219 				total_size += sfv[i].sfv_len;
1220 				/*
1221 				 * Individual elements of the vector must not
1222 				 * wrap or overflow, as later math is signed.
1223 				 * Equally total_size needs to be checked after
1224 				 * each vector is added in, to be sure that
1225 				 * rogue values haven't overflowed the counter.
1226 				 */
1227 				if (((ssize_t)sfv[i].sfv_len < 0) ||
1228 				    (total_size < 0)) {
1229 					/*
1230 					 * Truncate the vector to send data
1231 					 * described by elements before the
1232 					 * error.
1233 					 */
1234 					copy_cnt = i;
1235 					first_vector_error = EINVAL;
1236 					/* total_size can't be trusted */
1237 					if (total_size < 0)
1238 						error = EINVAL;
1239 					break;
1240 				}
1241 			}
1242 			/* Nothing to do, process errors */
1243 			if (copy_cnt == 0)
1244 				break;
1245 #ifdef _SYSCALL32_IMPL
1246 		}
1247 #endif
1248 
1249 		/*
1250 		 * The choice between sendvec_small_chunk and
1251 		 * sendvec_chunk depends on multiple things:
1252 		 *
1253 		 * i) latency is important for smaller files. So if the
1254 		 * data is smaller than 'tcp_slow_start_initial' times
1255 		 * maxblk, then use sendvec_small_chunk, which creates
1256 		 * maxblk-sized mblks, chains them together and sends
1257 		 * them to TCP in one shot. It also leaves 'wroff' bytes
1258 		 * of space for the headers in each mblk.
1259 		 *
1260 		 * ii) for a total size bigger than 'tcp_slow_start_initial'
1261 		 * times maxblk, it is probably real file data which is
1262 		 * dominating. So it is better to use sendvec_chunk, because
1263 		 * performance suffers badly if we don't do pagesize reads.
1264 		 * sendvec_chunk will do pagesize reads and write them
1265 		 * in pagesize mblks to TCP.
1266 		 *
1267 		 * Side Notes: A write to a file has not been optimized.
1268 		 * Future zero-copy code will plug into sendvec_chunk
1269 		 * only, because doing zero copy for files smaller than
1270 		 * pagesize is useless.
1271 		 *
1272 		 * Note, if the socket has NL7C enabled, then call NL7C's
1273 		 * sendfilev() function to consume the sfv[].
1274 		 */
1275 		if (is_sock) {
1276 			if (!SOCK_IS_NONSTR(so) &&
1277 			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
1278 				error = nl7c_sendfilev(so, &fileoff,
1279 				    sfv, copy_cnt, &count);
1280 			} else if ((total_size <= (4 * maxblk)) &&
1281 			    error == 0) {
1282 				error = sendvec_small_chunk(fp,
1283 				    &fileoff, sfv, copy_cnt,
1284 				    total_size, maxblk, &count);
1285 			} else {
1286 				error = sendvec_chunk(fp, &fileoff,
1287 				    sfv, copy_cnt, &count);
1288 			}
1289 		} else {
1290 			ASSERT(vp->v_type == VREG);
1291 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1292 			    &count);
1293 		}
1294 
1295 
1296 #ifdef _SYSCALL32_IMPL
1297 	if (get_udatamodel() == DATAMODEL_ILP32)
1298 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1299 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1300 	else
1301 #endif
1302 		copy_vec += copy_cnt;
1303 		sfvcnt -= copy_cnt;
1304 
1305 	/* Process all vector members up to first error */
1306 	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1307 
1308 	if (vp->v_type == VREG)
1309 		fp->f_offset += count;
1310 
1311 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1312 
1313 #ifdef _SYSCALL32_IMPL
1314 	if (get_udatamodel() == DATAMODEL_ILP32) {
1315 		ssize32_t count32 = (ssize32_t)count;
1316 		if (copyout(&count32, xferred, sizeof (count32)))
1317 			error = EFAULT;
1318 		releasef(fildes);
1319 		if (error != 0)
1320 			return (set_errno(error));
1321 		if (first_vector_error != 0)
1322 			return (set_errno(first_vector_error));
1323 		return (count32);
1324 	}
1325 #endif
1326 	if (copyout(&count, xferred, sizeof (count)))
1327 		error = EFAULT;
1328 	releasef(fildes);
1329 	if (error != 0)
1330 		return (set_errno(error));
1331 	if (first_vector_error != 0)
1332 		return (set_errno(first_vector_error));
1333 	return (count);
1334 err:
1335 	ASSERT(error != 0);
1336 	releasef(fildes);
1337 	return (set_errno(error));
1338 }
1339