xref: /illumos-gate/usr/src/uts/common/syscall/rw.c (revision 8e9352593f28b1a84101b11e1c6db6aabf187a1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2015, Joyent, Inc.  All rights reserved.
26  */
27 
28 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 /*
32  * Portions of this source code were derived from Berkeley 4.3 BSD
33  * under license from the Regents of the University of California.
34  */
35 
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 #include <sys/limits.h>
54 
#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */

/*
 * Reads at or below this many bytes use UIO_COPY_CACHED copyout;
 * larger reads fall back to UIO_COPY_DEFAULT.  Global so it's
 * patchable (e.g. via mdb or /etc/system).
 */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
58 
59 /*
60  * read, write, pread, pwrite, readv, and writev syscalls.
61  *
62  * 64-bit open:	all open's are large file opens.
63  * Large Files: the behaviour of read depends on whether the fd
64  *		corresponds to large open or not.
65  * 32-bit open:	FOFFMAX flag not set.
66  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
67  *		EOVERFLOW if count is non-zero and if size of file
68  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
69  *		at >= MAXOFF32_T returns EOF.
70  */
71 
72 /*
73  * Native system call
74  */
/*
 * Native system call: read(2).
 *
 * Reads up to 'count' bytes from the file underlying 'fdes' into the
 * user buffer 'cbuf', starting at the file's current offset.  Returns
 * the number of bytes transferred, or -1 with errno set on failure.
 */
ssize_t
read(int fdes, void *cbuf, size_t count)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t cnt, bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((cnt = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;

	/* Zero-length reads of regular files succeed trivially. */
	if (vp->v_type == VREG && cnt == 0) {
		goto out;
	}

	rwflag = 0;		/* take VOP_RWLOCK as reader */
	aiov.iov_base = cbuf;
	aiov.iov_len = cnt;

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write() calls.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/*
	 * We do the following checks inside VOP_RWLOCK so as to
	 * prevent file size from changing while these checks are
	 * being done. Also, we load fp's offset to the local
	 * variable fileoff because we can have a parallel lseek
	 * going on (f_offset is not protected by any lock) which
	 * could change f_offset. We need to see the value only
	 * once here and take a decision. Seeing it more than once
	 * can lead to incorrect functionality.
	 */

	fileoff = (u_offset_t)fp->f_offset;
	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
		/*
		 * Offset is at or past the largest offset this open file
		 * structure can represent (see the block comment above):
		 * return EOF (cnt = 0) if the file ends at or before that
		 * point, EOVERFLOW otherwise.
		 */
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		if (fileoff >= va.va_size) {
			cnt = 0;
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		} else {
			error = EOVERFLOW;
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
	}
	/* Clamp the transfer so it cannot extend past OFFSET_MAX(fp). */
	if ((vp->v_type == VREG) &&
	    (fileoff + cnt > OFFSET_MAX(fp))) {
		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount = cnt;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	/*
	 * Only use bypass caches when the count is large enough
	 */
	if (bcount <= copyout_max_cached)
		auio.uio_extflg = UIO_COPY_CACHED;
	else
		auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	cnt -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = cnt;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && cnt != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (cnt);
}
207 
208 /*
209  * Native system call
210  */
/*
 * Native system call: write(2).
 *
 * Writes up to 'count' bytes from the user buffer 'cbuf' to the file
 * underlying 'fdes' at the file's current offset (FAPPEND is passed
 * through in ioflag, so append-mode semantics are applied by
 * VOP_WRITE).  Returns the number of bytes written, or -1 with errno
 * set on failure.
 */
ssize_t
write(int fdes, void *cbuf, size_t count)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t cnt, bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((cnt = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;

	/* Zero-length writes to regular files succeed trivially. */
	if (vp->v_type == VREG && cnt == 0) {
		goto out;
	}

	rwflag = 1;		/* take VOP_RWLOCK as writer */
	aiov.iov_base = cbuf;
	aiov.iov_len = cnt;

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/* Snapshot f_offset once; a parallel lseek could change it. */
	fileoff = fp->f_offset;
	if (vp->v_type == VREG) {

		/*
		 * We raise psignal if write for >0 bytes causes
		 * it to exceed the ulimit (RLIMIT_FSIZE resource
		 * control); SIGXFSZ delivery is handled by rctl_action.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);

			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * We return EFBIG if write is done at an offset
		 * greater than the offset maximum for this file structure.
		 */

		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		/*
		 * Limit the bytes to be written  upto offset maximum for
		 * this open file structure.
		 */
		if (fileoff + cnt > OFFSET_MAX(fp))
			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount = cnt;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	cnt -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = cnt;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && cnt != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (cnt);
}
336 
/*
 * pread(2): positioned read.
 *
 * Like read(2) but reads at the caller-supplied 'offset' and never
 * updates f_offset.  Pipes and FIFOs are rejected with ESPIPE.
 */
ssize_t
pread(int fdes, void *cbuf, size_t count, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#ifdef _SYSCALL32_IMPL
	/* 32-bit callers are limited to 32-bit offsets. */
	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((bcount = (ssize_t)count) < 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
		error = EBADF;
		goto out;
	}

	rwflag = 0;		/* take VOP_RWLOCK as reader */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * Return EINVAL if an invalid offset comes to pread.
		 * Negative offset from user will cause this error.
		 */

		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Limit offset such that we don't read or write
		 * a file beyond the maximum offset representable in
		 * an off_t structure.
		 */
		if (fileoff + bcount > maxoff)
			bcount = (ssize_t)((offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
		/*
		 * Reading exactly at maxoff: distinguish EOF from
		 * EOVERFLOW based on the file's current size.
		 */
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		VOP_RWUNLOCK(vp, rwflag, NULL);

		/*
		 * We have to return EOF if fileoff is >= file size.
		 */
		if (fileoff >= va.va_size) {
			bcount = 0;
			goto out;
		}

		/*
		 * File is greater than or equal to maxoff and therefore
		 * we return EOVERFLOW.
		 */
		error = EOVERFLOW;
		goto out;
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	bcount -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
476 
/*
 * pwrite(2): positioned write.
 *
 * Like write(2) but writes at the caller-supplied 'offset' and never
 * updates f_offset; per SUSv4, O_APPEND is ignored (FAPPEND is omitted
 * from ioflag below).  Pipes and FIFOs are rejected with ESPIPE.
 */
ssize_t
pwrite(int fdes, void *cbuf, size_t count, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#ifdef _SYSCALL32_IMPL
	/* 32-bit callers are limited to 32-bit offsets. */
	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((bcount = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	rwflag = 1;		/* take VOP_RWLOCK as writer */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * return EINVAL for offsets that cannot be
		 * represented in an off_t.
		 */
		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Take appropriate action if we are trying to write above the
		 * resource limit (RLIMIT_FSIZE): deliver SIGXFSZ via
		 * rctl_action and fail with EFBIG.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * Don't allow pwrite to cause file sizes to exceed
		 * maxoff.
		 */
		if (fileoff == maxoff) {
			error = EFBIG;
			goto out;
		}
		/* Clamp the transfer so it stops at maxoff. */
		if (fileoff + count > maxoff)
			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	/*
	 * The SUSv4 POSIX specification states:
	 *	The pwrite() function shall be equivalent to write(), except
	 *	that it writes into a given position and does not change
	 *	the file offset (regardless of whether O_APPEND is set).
	 * To make this be true, we omit the FAPPEND flag from ioflag.
	 */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	bcount -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
610 
/*
 * readv(2): scatter read into 'iovcnt' user iovecs.
 *
 * The iovec array is copied in (expanding 32-bit callers' iovec32
 * entries), validated so that no single length or the total is
 * negative, and then handed to VOP_READ as one uio.  Semantics
 * otherwise match read(2).
 */
ssize_t
readv(int fdes, struct iovec *iovp, int iovcnt)
{
	struct uio auio;
	/* Small iovec arrays live on the stack; larger ones are kmem. */
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;	/* nonzero iff aiov was kmem_alloc'ed */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/* Reject negative lengths and signed-32-bit total overflow. */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}

		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif
	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EFAULT));
	}

	/* Total the request, rejecting negative lengths or overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	/* Zero-length reads of regular files succeed trivially. */
	if (vp->v_type == VREG && count == 0) {
		goto out;
	}

	rwflag = 0;		/* take VOP_RWLOCK as reader */

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);
	/* Snapshot f_offset once; a parallel lseek could change it. */
	fileoff = fp->f_offset;

	/*
	 * Behaviour is same as read. Please see comments in read.
	 */

	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		if (fileoff >= va.va_size) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			count = 0;
			goto out;
		} else {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EOVERFLOW;
			goto out;
		}
	}
	/* Clamp the transfer so it cannot extend past OFFSET_MAX(fp). */
	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	/* Only use bypass caches when the count is large enough. */
	if (bcount <= copyout_max_cached)
		auio.uio_extflg = UIO_COPY_CACHED;
	else
		auio.uio_extflg = UIO_COPY_DEFAULT;


	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = count;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;

	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
803 
/*
 * writev(2): gather write from 'iovcnt' user iovecs.
 *
 * The iovec array is copied in (expanding 32-bit callers' iovec32
 * entries), validated so that no single length or the total is
 * negative, and then handed to VOP_WRITE as one uio.  Semantics
 * otherwise match write(2).
 */
ssize_t
writev(int fdes, struct iovec *iovp, int iovcnt)
{
	struct uio auio;
	/* Small iovec arrays live on the stack; larger ones are kmem. */
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;	/* nonzero iff aiov was kmem_alloc'ed */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/* Reject negative lengths and signed-32-bit total overflow. */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen = aiov32[i].iov_len;
			count32 += iovlen;
			if (iovlen < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif
	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EFAULT));
	}

	/* Total the request, rejecting negative lengths or overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	/* Zero-length writes to regular files succeed trivially. */
	if (vp->v_type == VREG && count == 0) {
		goto out;
	}

	rwflag = 1;		/* take VOP_RWLOCK as writer */

	/*
	 * If the file may carry non-blocking mandatory locks, we have to
	 * enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/* Snapshot f_offset once; a parallel lseek could change it. */
	fileoff = fp->f_offset;

	/*
	 * Behaviour is same as write. Please see comments for write.
	 */

	if (vp->v_type == VREG) {
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		if (fileoff + count > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */

	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = count;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
986 
987 ssize_t
988 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
989     off_t extended_offset)
990 {
991 	struct uio auio;
992 	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
993 	int aiovlen = 0;
994 	file_t *fp;
995 	register vnode_t *vp;
996 	struct cpu *cp;
997 	int fflag, ioflag, rwflag;
998 	ssize_t count, bcount;
999 	int error = 0;
1000 	int i;
1001 
1002 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1003 	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1004 	    (u_offset_t)offset;
1005 #else /* _SYSCALL32_IMPL || _ILP32 */
1006 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1007 #endif /* _SYSCALL32_IMPR || _ILP32 */
1008 #ifdef _SYSCALL32_IMPL
1009 	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
1010 	    extended_offset == 0?
1011 	    MAXOFF32_T : MAXOFFSET_T;
1012 #else /* _SYSCALL32_IMPL */
1013 	const u_offset_t maxoff = MAXOFF32_T;
1014 #endif /* _SYSCALL32_IMPL */
1015 
1016 	int in_crit = 0;
1017 
1018 	if (iovcnt <= 0 || iovcnt > IOV_MAX)
1019 		return (set_errno(EINVAL));
1020 
1021 	if (iovcnt > IOV_MAX_STACK) {
1022 		aiovlen = iovcnt * sizeof (iovec_t);
1023 		aiov = kmem_alloc(aiovlen, KM_SLEEP);
1024 	}
1025 
1026 #ifdef _SYSCALL32_IMPL
1027 	/*
1028 	 * 32-bit callers need to have their iovec expanded,
1029 	 * while ensuring that they can't move more than 2Gbytes
1030 	 * of data in a single call.
1031 	 */
1032 	if (get_udatamodel() == DATAMODEL_ILP32) {
1033 		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1034 		int aiov32len;
1035 		ssize32_t count32;
1036 
1037 		aiov32len = iovcnt * sizeof (iovec32_t);
1038 		if (aiovlen != 0)
1039 			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1040 
1041 		if (copyin(iovp, aiov32, aiov32len)) {
1042 			if (aiovlen != 0) {
1043 				kmem_free(aiov32, aiov32len);
1044 				kmem_free(aiov, aiovlen);
1045 			}
1046 			return (set_errno(EFAULT));
1047 		}
1048 
1049 		count32 = 0;
1050 		for (i = 0; i < iovcnt; i++) {
1051 			ssize32_t iovlen32 = aiov32[i].iov_len;
1052 			count32 += iovlen32;
1053 			if (iovlen32 < 0 || count32 < 0) {
1054 				if (aiovlen != 0) {
1055 					kmem_free(aiov32, aiov32len);
1056 					kmem_free(aiov, aiovlen);
1057 				}
1058 				return (set_errno(EINVAL));
1059 			}
1060 			aiov[i].iov_len = iovlen32;
1061 			aiov[i].iov_base =
1062 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1063 		}
1064 		if (aiovlen != 0)
1065 			kmem_free(aiov32, aiov32len);
1066 	} else
1067 #endif /* _SYSCALL32_IMPL */
1068 		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1069 			if (aiovlen != 0)
1070 				kmem_free(aiov, aiovlen);
1071 			return (set_errno(EFAULT));
1072 		}
1073 
1074 	count = 0;
1075 	for (i = 0; i < iovcnt; i++) {
1076 		ssize_t iovlen = aiov[i].iov_len;
1077 		count += iovlen;
1078 		if (iovlen < 0 || count < 0) {
1079 			if (aiovlen != 0)
1080 				kmem_free(aiov, aiovlen);
1081 			return (set_errno(EINVAL));
1082 		}
1083 	}
1084 
1085 	if ((bcount = (ssize_t)count) < 0) {
1086 		if (aiovlen != 0)
1087 			kmem_free(aiov, aiovlen);
1088 		return (set_errno(EINVAL));
1089 	}
1090 	if ((fp = getf(fdes)) == NULL) {
1091 		if (aiovlen != 0)
1092 			kmem_free(aiov, aiovlen);
1093 		return (set_errno(EBADF));
1094 	}
1095 	if (((fflag = fp->f_flag) & FREAD) == 0) {
1096 		error = EBADF;
1097 		goto out;
1098 	}
1099 	vp = fp->f_vnode;
1100 	rwflag = 0;
1101 	if (vp->v_type == VREG) {
1102 
1103 		if (bcount == 0)
1104 			goto out;
1105 
1106 		/*
1107 		 * return EINVAL for offsets that cannot be
1108 		 * represented in an off_t.
1109 		 */
1110 		if (fileoff > maxoff) {
1111 			error = EINVAL;
1112 			goto out;
1113 		}
1114 
1115 		if (fileoff + bcount > maxoff)
1116 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1117 	} else if (vp->v_type == VFIFO) {
1118 		error = ESPIPE;
1119 		goto out;
1120 	}
1121 	/*
1122 	 * We have to enter the critical region before calling VOP_RWLOCK
1123 	 * to avoid a deadlock with ufs.
1124 	 */
1125 	if (nbl_need_check(vp)) {
1126 		int svmand;
1127 
1128 		nbl_start_crit(vp, RW_READER);
1129 		in_crit = 1;
1130 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1131 		if (error != 0)
1132 			goto out;
1133 		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1134 		    NULL)) {
1135 			error = EACCES;
1136 			goto out;
1137 		}
1138 	}
1139 
1140 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1141 
1142 	/*
1143 	 * Behaviour is same as read(2). Please see comments in
1144 	 * read(2).
1145 	 */
1146 
1147 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
1148 		struct vattr va;
1149 		va.va_mask = AT_SIZE;
1150 		if ((error =
1151 		    VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
1152 			VOP_RWUNLOCK(vp, rwflag, NULL);
1153 			goto out;
1154 		}
1155 		if (fileoff >= va.va_size) {
1156 			VOP_RWUNLOCK(vp, rwflag, NULL);
1157 			count = 0;
1158 			goto out;
1159 		} else {
1160 			VOP_RWUNLOCK(vp, rwflag, NULL);
1161 			error = EOVERFLOW;
1162 			goto out;
1163 		}
1164 	}
1165 	if ((vp->v_type == VREG) &&
1166 	    (fileoff + count > OFFSET_MAX(fp))) {
1167 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1168 	}
1169 	auio.uio_loffset = fileoff;
1170 	auio.uio_iov = aiov;
1171 	auio.uio_iovcnt = iovcnt;
1172 	auio.uio_resid = bcount = count;
1173 	auio.uio_segflg = UIO_USERSPACE;
1174 	auio.uio_llimit = MAXOFFSET_T;
1175 	auio.uio_fmode = fflag;
1176 	if (bcount <= copyout_max_cached)
1177 		auio.uio_extflg = UIO_COPY_CACHED;
1178 	else
1179 		auio.uio_extflg = UIO_COPY_DEFAULT;
1180 
1181 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1182 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1183 	count -= auio.uio_resid;
1184 	CPU_STATS_ENTER_K();
1185 	cp = CPU;
1186 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1187 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1188 	CPU_STATS_EXIT_K();
1189 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1190 
1191 	VOP_RWUNLOCK(vp, rwflag, NULL);
1192 
1193 	if (error == EINTR && count != 0)
1194 		error = 0;
1195 out:
1196 	if (in_crit)
1197 		nbl_end_crit(vp);
1198 	releasef(fdes);
1199 	if (aiovlen != 0)
1200 		kmem_free(aiov, aiovlen);
1201 	if (error)
1202 		return (set_errno(error));
1203 	return (count);
1204 }
1205 
/*
 * pwritev(2) -- gather-write iovcnt user buffers to fdes at an explicit
 * 64-bit file offset, without changing the file pointer.  Returns the
 * number of bytes written, or -1 with errno set on failure.
 */
ssize_t
pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
    off_t extended_offset)
{
	struct uio auio;
	/* Small iovec arrays stay on the stack; larger ones are kmem_alloc'd. */
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;	/* nonzero iff aiov was heap-allocated */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	/*
	 * 32-bit callers deliver the 64-bit offset split across two
	 * off_t arguments; reassemble it here.
	 */
	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
	    (u_offset_t)offset;
#else /* _SYSCALL32_IMPL || _ILP32 */
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#endif /* _SYSCALL32_IMPL || _ILP32 */
#ifdef _SYSCALL32_IMPL
	/*
	 * An ILP32 caller that supplied no high half of the offset is
	 * limited to offsets representable in a 32-bit off_t.
	 */
	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
	    extended_offset == 0?
	    MAXOFF32_T : MAXOFFSET_T;
#else /* _SYSCALL32_IMPL */
	const u_offset_t maxoff = MAXOFF32_T;
#endif /* _SYSCALL32_IMPL */

	int in_crit = 0;	/* 1 once we enter the nbmand critical region */

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/*
		 * Widen each 32-bit iovec; a negative length or a total
		 * that overflows ssize32_t is rejected with EINVAL.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif /* _SYSCALL32_IMPL */
		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EFAULT));
		}

	/* Total the request; any negative length or overflow is EINVAL. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}

	if ((bcount = (ssize_t)count) < 0) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EINVAL));
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	rwflag = 1;	/* writer for VOP_RWLOCK */
	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * return EINVAL for offsets that cannot be
		 * represented in an off_t.
		 */
		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Take appropriate action if we are trying
		 * to write above the resource limit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			/*
			 * Return value ignored because it lists
			 * actions taken, but we are in an error case.
			 * We don't have any actions that depend on
			 * what could happen in this call, so we ignore
			 * the return value.
			 */
			(void) rctl_action(
			    rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc,
			    RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * Don't allow pwritev to cause file sizes to exceed
		 * maxoff.
		 */
		if (fileoff == maxoff) {
			error = EFBIG;
			goto out;
		}

		/* Clamp the transfer so it ends at maxoff. */
		if (fileoff + bcount > maxoff)
			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}
	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);


	/*
	 * Behaviour is same as write(2). Please see comments for
	 * write(2).
	 */

	/* Re-check limits now that the vnode is locked. */
	if (vp->v_type == VREG) {
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			mutex_enter(&curproc->p_lock);
			/* see above rctl_action comment */
			(void) rctl_action(
			    rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls,
			    curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		if (fileoff + count > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}

	/* Build the uio and hand the transfer to the filesystem. */
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	/* FAPPEND deliberately omitted: pwritev writes at the given offset. */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
1451 
1452 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1453 
1454 /*
1455  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1456  */
1457 ssize32_t
1458 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1459     uint32_t offset_2)
1460 {
1461 	struct uio auio;
1462 	struct iovec aiov;
1463 	file_t *fp;
1464 	register vnode_t *vp;
1465 	struct cpu *cp;
1466 	int fflag, ioflag, rwflag;
1467 	ssize_t bcount;
1468 	int error = 0;
1469 	u_offset_t fileoff;
1470 	int in_crit = 0;
1471 
1472 #if defined(_LITTLE_ENDIAN)
1473 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1474 #else
1475 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1476 #endif
1477 
1478 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1479 		return (set_errno(EINVAL));
1480 
1481 	if ((fp = getf(fdes)) == NULL)
1482 		return (set_errno(EBADF));
1483 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1484 		error = EBADF;
1485 		goto out;
1486 	}
1487 
1488 	rwflag = 0;
1489 	vp = fp->f_vnode;
1490 
1491 	if (vp->v_type == VREG) {
1492 
1493 		if (bcount == 0)
1494 			goto out;
1495 
1496 		/*
1497 		 * Same as pread. See comments in pread.
1498 		 */
1499 
1500 		if (fileoff > MAXOFFSET_T) {
1501 			error = EINVAL;
1502 			goto out;
1503 		}
1504 		if (fileoff + bcount > MAXOFFSET_T)
1505 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1506 	} else if (vp->v_type == VFIFO) {
1507 		error = ESPIPE;
1508 		goto out;
1509 	}
1510 
1511 	/*
1512 	 * We have to enter the critical region before calling VOP_RWLOCK
1513 	 * to avoid a deadlock with ufs.
1514 	 */
1515 	if (nbl_need_check(vp)) {
1516 		int svmand;
1517 
1518 		nbl_start_crit(vp, RW_READER);
1519 		in_crit = 1;
1520 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1521 		if (error != 0)
1522 			goto out;
1523 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1524 		    NULL)) {
1525 			error = EACCES;
1526 			goto out;
1527 		}
1528 	}
1529 
1530 	aiov.iov_base = cbuf;
1531 	aiov.iov_len = bcount;
1532 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1533 	auio.uio_loffset = fileoff;
1534 
1535 	/*
1536 	 * Note: File size can never be greater than MAXOFFSET_T.
1537 	 * If ever we start supporting 128 bit files the code
1538 	 * similar to the one in pread at this place should be here.
1539 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1540 	 * know that fileoff == MAXOFFSET_T implies that it is always
1541 	 * greater than or equal to file size.
1542 	 */
1543 	auio.uio_iov = &aiov;
1544 	auio.uio_iovcnt = 1;
1545 	auio.uio_resid = bcount;
1546 	auio.uio_segflg = UIO_USERSPACE;
1547 	auio.uio_llimit = MAXOFFSET_T;
1548 	auio.uio_fmode = fflag;
1549 	auio.uio_extflg = UIO_COPY_CACHED;
1550 
1551 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1552 
1553 	/* If read sync is not asked for, filter sync flags */
1554 	if ((ioflag & FRSYNC) == 0)
1555 		ioflag &= ~(FSYNC|FDSYNC);
1556 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1557 	bcount -= auio.uio_resid;
1558 	CPU_STATS_ENTER_K();
1559 	cp = CPU;
1560 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1561 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1562 	CPU_STATS_EXIT_K();
1563 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1564 	VOP_RWUNLOCK(vp, rwflag, NULL);
1565 
1566 	if (error == EINTR && bcount != 0)
1567 		error = 0;
1568 out:
1569 	if (in_crit)
1570 		nbl_end_crit(vp);
1571 	releasef(fdes);
1572 	if (error)
1573 		return (set_errno(error));
1574 	return (bcount);
1575 }
1576 
1577 /*
1578  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1579  */
1580 ssize32_t
1581 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1582     uint32_t offset_2)
1583 {
1584 	struct uio auio;
1585 	struct iovec aiov;
1586 	file_t *fp;
1587 	register vnode_t *vp;
1588 	struct cpu *cp;
1589 	int fflag, ioflag, rwflag;
1590 	ssize_t bcount;
1591 	int error = 0;
1592 	u_offset_t fileoff;
1593 	int in_crit = 0;
1594 
1595 #if defined(_LITTLE_ENDIAN)
1596 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1597 #else
1598 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1599 #endif
1600 
1601 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1602 		return (set_errno(EINVAL));
1603 	if ((fp = getf(fdes)) == NULL)
1604 		return (set_errno(EBADF));
1605 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1606 		error = EBADF;
1607 		goto out;
1608 	}
1609 
1610 	rwflag = 1;
1611 	vp = fp->f_vnode;
1612 
1613 	if (vp->v_type == VREG) {
1614 
1615 		if (bcount == 0)
1616 			goto out;
1617 
1618 		/*
1619 		 * See comments in pwrite.
1620 		 */
1621 		if (fileoff > MAXOFFSET_T) {
1622 			error = EINVAL;
1623 			goto out;
1624 		}
1625 		if (fileoff >= curproc->p_fsz_ctl) {
1626 			mutex_enter(&curproc->p_lock);
1627 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1628 			    curproc->p_rctls, curproc, RCA_SAFE);
1629 			mutex_exit(&curproc->p_lock);
1630 			error = EFBIG;
1631 			goto out;
1632 		}
1633 		if (fileoff == MAXOFFSET_T) {
1634 			error = EFBIG;
1635 			goto out;
1636 		}
1637 		if (fileoff + bcount > MAXOFFSET_T)
1638 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1639 	} else if (vp->v_type == VFIFO) {
1640 		error = ESPIPE;
1641 		goto out;
1642 	}
1643 
1644 	/*
1645 	 * We have to enter the critical region before calling VOP_RWLOCK
1646 	 * to avoid a deadlock with ufs.
1647 	 */
1648 	if (nbl_need_check(vp)) {
1649 		int svmand;
1650 
1651 		nbl_start_crit(vp, RW_READER);
1652 		in_crit = 1;
1653 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1654 		if (error != 0)
1655 			goto out;
1656 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1657 		    NULL)) {
1658 			error = EACCES;
1659 			goto out;
1660 		}
1661 	}
1662 
1663 	aiov.iov_base = cbuf;
1664 	aiov.iov_len = bcount;
1665 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1666 	auio.uio_loffset = fileoff;
1667 	auio.uio_iov = &aiov;
1668 	auio.uio_iovcnt = 1;
1669 	auio.uio_resid = bcount;
1670 	auio.uio_segflg = UIO_USERSPACE;
1671 	auio.uio_llimit = curproc->p_fsz_ctl;
1672 	auio.uio_fmode = fflag;
1673 	auio.uio_extflg = UIO_COPY_CACHED;
1674 
1675 	/*
1676 	 * The SUSv4 POSIX specification states:
1677 	 *	The pwrite() function shall be equivalent to write(), except
1678 	 *	that it writes into a given position and does not change
1679 	 *	the file offset (regardless of whether O_APPEND is set).
1680 	 * To make this be true, we omit the FAPPEND flag from ioflag.
1681 	 */
1682 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1683 
1684 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1685 	bcount -= auio.uio_resid;
1686 	CPU_STATS_ENTER_K();
1687 	cp = CPU;
1688 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1689 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1690 	CPU_STATS_EXIT_K();
1691 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1692 	VOP_RWUNLOCK(vp, rwflag, NULL);
1693 
1694 	if (error == EINTR && bcount != 0)
1695 		error = 0;
1696 out:
1697 	if (in_crit)
1698 		nbl_end_crit(vp);
1699 	releasef(fdes);
1700 	if (error)
1701 		return (set_errno(error));
1702 	return (bcount);
1703 }
1704 
1705 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1706 
1707 #ifdef _SYSCALL32_IMPL
1708 /*
1709  * Tail-call elimination of xxx32() down to xxx()
1710  *
1711  * A number of xxx32 system calls take a len (or count) argument and
1712  * return a number in the range [0,len] or -1 on error.
1713  * Given an ssize32_t input len, the downcall xxx() will return
1714  * a 64-bit value that is -1 or in the range [0,len] which actually
1715  * is a proper return value for the xxx32 call. So even if the xxx32
1716  * calls can be considered as returning a ssize32_t, they are currently
1717  * declared as returning a ssize_t as this enables tail-call elimination.
1718  *
1719  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1720  * down negative input values as such and let the downcall handle error
 * reporting. Functions covered by this comment are:
1722  *
1723  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1724  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1725  * readlink.c:     readlink32.
1726  */
1727 
1728 ssize_t
1729 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1730 {
1731 	return (read(fdes,
1732 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1733 }
1734 
1735 ssize_t
1736 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1737 {
1738 	return (write(fdes,
1739 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1740 }
1741 
1742 ssize_t
1743 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1744 {
1745 	return (pread(fdes,
1746 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1747 	    (off_t)(uint32_t)offset));
1748 }
1749 
1750 ssize_t
1751 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1752 {
1753 	return (pwrite(fdes,
1754 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1755 	    (off_t)(uint32_t)offset));
1756 }
1757 
1758 ssize_t
1759 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1760 {
1761 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1762 }
1763 
1764 ssize_t
1765 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1766 {
1767 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1768 }
1769 #endif	/* _SYSCALL32_IMPL */
1770