1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2020, Joyent, Inc.
26 */
27
28/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
29/*	  All Rights Reserved	*/
30
31/*
32 * Portions of this source code were derived from Berkeley 4.3 BSD
33 * under license from the Regents of the University of California.
34 */
35
36#include <sys/param.h>
37#include <sys/isa_defs.h>
38#include <sys/types.h>
39#include <sys/inttypes.h>
40#include <sys/sysmacros.h>
41#include <sys/cred.h>
42#include <sys/user.h>
43#include <sys/systm.h>
44#include <sys/errno.h>
45#include <sys/vnode.h>
46#include <sys/file.h>
47#include <sys/proc.h>
48#include <sys/cpuvar.h>
49#include <sys/uio.h>
50#include <sys/debug.h>
51#include <sys/rctl.h>
52#include <sys/nbmlock.h>
53#include <sys/limits.h>
54
/*
 * Default upper bound, in bytes, on the size of a read-side transfer for
 * which read(), readv() and preadv() set UIO_COPY_CACHED in uio_extflg;
 * transfers larger than this use UIO_COPY_DEFAULT instead.
 */
#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */

/*
 * Run-time copy of the threshold above, initialised from COPYOUT_MAX_CACHE.
 * The syscalls compare against this variable rather than the macro.
 */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
58
/*
 * read, write, pread, pwrite, readv, writev, preadv, and pwritev syscalls.
 *
 * 64-bit open:	all opens are large file opens.
 * Large Files: the behaviour of read depends on whether the fd
 *		corresponds to a large file open or not.
 * 32-bit open:	FOFFMAX flag not set.
 *		Reads succeed up to offset MAXOFF32_T - 1. A read at
 *		MAXOFF32_T returns EOVERFLOW if count is non-zero and the
 *		size of the file is > MAXOFF32_T. If the size of the file
 *		is <= MAXOFF32_T, a read at >= MAXOFF32_T returns EOF.
 */
71
72/*
73 * Native system call
74 */
75ssize_t
76read(int fdes, void *cbuf, size_t count)
77{
78	struct uio auio;
79	struct iovec aiov;
80	file_t *fp;
81	register vnode_t *vp;
82	struct cpu *cp;
83	int fflag, ioflag, rwflag;
84	ssize_t cnt, bcount;
85	int error = 0;
86	u_offset_t fileoff;
87	int in_crit = 0;
88
89	if ((cnt = (ssize_t)count) < 0)
90		return (set_errno(EINVAL));
91	if ((fp = getf(fdes)) == NULL)
92		return (set_errno(EBADF));
93	if (((fflag = fp->f_flag) & FREAD) == 0) {
94		error = EBADF;
95		goto out;
96	}
97	vp = fp->f_vnode;
98
99	if (vp->v_type == VREG && cnt == 0) {
100		goto out;
101	}
102
103	rwflag = 0;
104	aiov.iov_base = cbuf;
105	aiov.iov_len = cnt;
106
107	/*
108	 * We have to enter the critical region before calling VOP_RWLOCK
109	 * to avoid a deadlock with write() calls.
110	 */
111	if (nbl_need_check(vp)) {
112		int svmand;
113
114		nbl_start_crit(vp, RW_READER);
115		in_crit = 1;
116		error = nbl_svmand(vp, fp->f_cred, &svmand);
117		if (error != 0)
118			goto out;
119		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
120		    NULL)) {
121			error = EACCES;
122			goto out;
123		}
124	}
125
126	(void) VOP_RWLOCK(vp, rwflag, NULL);
127
128	/*
129	 * We do the following checks inside VOP_RWLOCK so as to
130	 * prevent file size from changing while these checks are
131	 * being done. Also, we load fp's offset to the local
132	 * variable fileoff because we can have a parallel lseek
133	 * going on (f_offset is not protected by any lock) which
134	 * could change f_offset. We need to see the value only
135	 * once here and take a decision. Seeing it more than once
136	 * can lead to incorrect functionality.
137	 */
138
139	fileoff = (u_offset_t)fp->f_offset;
140	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141		struct vattr va;
142		va.va_mask = AT_SIZE;
143		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
144			VOP_RWUNLOCK(vp, rwflag, NULL);
145			goto out;
146		}
147		if (fileoff >= va.va_size) {
148			cnt = 0;
149			VOP_RWUNLOCK(vp, rwflag, NULL);
150			goto out;
151		} else {
152			error = EOVERFLOW;
153			VOP_RWUNLOCK(vp, rwflag, NULL);
154			goto out;
155		}
156	}
157	if ((vp->v_type == VREG) &&
158	    (fileoff + cnt > OFFSET_MAX(fp))) {
159		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160	}
161	auio.uio_loffset = fileoff;
162	auio.uio_iov = &aiov;
163	auio.uio_iovcnt = 1;
164	auio.uio_resid = bcount = cnt;
165	auio.uio_segflg = UIO_USERSPACE;
166	auio.uio_llimit = MAXOFFSET_T;
167	auio.uio_fmode = fflag;
168	/*
169	 * Only use bypass caches when the count is large enough
170	 */
171	if (bcount <= copyout_max_cached)
172		auio.uio_extflg = UIO_COPY_CACHED;
173	else
174		auio.uio_extflg = UIO_COPY_DEFAULT;
175
176	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177
178	/* If read sync is not asked for, filter sync flags */
179	if ((ioflag & FRSYNC) == 0)
180		ioflag &= ~(FSYNC|FDSYNC);
181	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182	cnt -= auio.uio_resid;
183	CPU_STATS_ENTER_K();
184	cp = CPU;
185	CPU_STATS_ADDQ(cp, sys, sysread, 1);
186	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187	CPU_STATS_EXIT_K();
188	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189
190	if (vp->v_type == VFIFO)	/* Backward compatibility */
191		fp->f_offset = cnt;
192	else if (((fp->f_flag & FAPPEND) == 0) ||
193	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
194		fp->f_offset = auio.uio_loffset;
195	VOP_RWUNLOCK(vp, rwflag, NULL);
196
197	if (error == EINTR && cnt != 0)
198		error = 0;
199out:
200	if (in_crit)
201		nbl_end_crit(vp);
202	releasef(fdes);
203	if (error)
204		return (set_errno(error));
205	return (cnt);
206}
207
208/*
209 * Native system call
210 */
211ssize_t
212write(int fdes, void *cbuf, size_t count)
213{
214	struct uio auio;
215	struct iovec aiov;
216	file_t *fp;
217	register vnode_t *vp;
218	struct cpu *cp;
219	int fflag, ioflag, rwflag;
220	ssize_t cnt, bcount;
221	int error = 0;
222	u_offset_t fileoff;
223	int in_crit = 0;
224
225	if ((cnt = (ssize_t)count) < 0)
226		return (set_errno(EINVAL));
227	if ((fp = getf(fdes)) == NULL)
228		return (set_errno(EBADF));
229	if (((fflag = fp->f_flag) & FWRITE) == 0) {
230		error = EBADF;
231		goto out;
232	}
233	vp = fp->f_vnode;
234
235	if (vp->v_type == VREG && cnt == 0) {
236		goto out;
237	}
238
239	rwflag = 1;
240	aiov.iov_base = cbuf;
241	aiov.iov_len = cnt;
242
243	/*
244	 * We have to enter the critical region before calling VOP_RWLOCK
245	 * to avoid a deadlock with ufs.
246	 */
247	if (nbl_need_check(vp)) {
248		int svmand;
249
250		nbl_start_crit(vp, RW_READER);
251		in_crit = 1;
252		error = nbl_svmand(vp, fp->f_cred, &svmand);
253		if (error != 0)
254			goto out;
255		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
256		    NULL)) {
257			error = EACCES;
258			goto out;
259		}
260	}
261
262	(void) VOP_RWLOCK(vp, rwflag, NULL);
263
264	fileoff = fp->f_offset;
265	if (vp->v_type == VREG) {
266
267		/*
268		 * We raise psignal if write for >0 bytes causes
269		 * it to exceed the ulimit.
270		 */
271		if (fileoff >= curproc->p_fsz_ctl) {
272			VOP_RWUNLOCK(vp, rwflag, NULL);
273
274			mutex_enter(&curproc->p_lock);
275			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
276			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
277			mutex_exit(&curproc->p_lock);
278
279			error = EFBIG;
280			goto out;
281		}
282		/*
283		 * We return EFBIG if write is done at an offset
284		 * greater than the offset maximum for this file structure.
285		 */
286
287		if (fileoff >= OFFSET_MAX(fp)) {
288			VOP_RWUNLOCK(vp, rwflag, NULL);
289			error = EFBIG;
290			goto out;
291		}
292		/*
293		 * Limit the bytes to be written  upto offset maximum for
294		 * this open file structure.
295		 */
296		if (fileoff + cnt > OFFSET_MAX(fp))
297			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
298	}
299	auio.uio_loffset = fileoff;
300	auio.uio_iov = &aiov;
301	auio.uio_iovcnt = 1;
302	auio.uio_resid = bcount = cnt;
303	auio.uio_segflg = UIO_USERSPACE;
304	auio.uio_llimit = curproc->p_fsz_ctl;
305	auio.uio_fmode = fflag;
306	auio.uio_extflg = UIO_COPY_DEFAULT;
307
308	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
309
310	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
311	cnt -= auio.uio_resid;
312	CPU_STATS_ENTER_K();
313	cp = CPU;
314	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
315	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
316	CPU_STATS_EXIT_K();
317	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
318
319	if (vp->v_type == VFIFO)	/* Backward compatibility */
320		fp->f_offset = cnt;
321	else if (((fp->f_flag & FAPPEND) == 0) ||
322	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
323		fp->f_offset = auio.uio_loffset;
324	VOP_RWUNLOCK(vp, rwflag, NULL);
325
326	if (error == EINTR && cnt != 0)
327		error = 0;
328out:
329	if (in_crit)
330		nbl_end_crit(vp);
331	releasef(fdes);
332	if (error)
333		return (set_errno(error));
334	return (cnt);
335}
336
337ssize_t
338pread(int fdes, void *cbuf, size_t count, off_t offset)
339{
340	struct uio auio;
341	struct iovec aiov;
342	file_t *fp;
343	register vnode_t *vp;
344	struct cpu *cp;
345	int fflag, ioflag, rwflag;
346	ssize_t bcount;
347	int error = 0;
348	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
349#ifdef _SYSCALL32_IMPL
350	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
351	    MAXOFF32_T : MAXOFFSET_T;
352#else
353	const u_offset_t maxoff = MAXOFF32_T;
354#endif
355	int in_crit = 0;
356
357	if ((bcount = (ssize_t)count) < 0)
358		return (set_errno(EINVAL));
359
360	if ((fp = getf(fdes)) == NULL)
361		return (set_errno(EBADF));
362	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
363		error = EBADF;
364		goto out;
365	}
366
367	rwflag = 0;
368	vp = fp->f_vnode;
369
370	if (vp->v_type == VREG) {
371
372		if (bcount == 0)
373			goto out;
374
375		/*
376		 * Return EINVAL if an invalid offset comes to pread.
377		 * Negative offset from user will cause this error.
378		 */
379
380		if (fileoff > maxoff) {
381			error = EINVAL;
382			goto out;
383		}
384		/*
385		 * Limit offset such that we don't read or write
386		 * a file beyond the maximum offset representable in
387		 * an off_t structure.
388		 */
389		if (fileoff + bcount > maxoff)
390			bcount = (ssize_t)((offset_t)maxoff - fileoff);
391	} else if (vp->v_type == VFIFO) {
392		error = ESPIPE;
393		goto out;
394	}
395
396	/*
397	 * We have to enter the critical region before calling VOP_RWLOCK
398	 * to avoid a deadlock with ufs.
399	 */
400	if (nbl_need_check(vp)) {
401		int svmand;
402
403		nbl_start_crit(vp, RW_READER);
404		in_crit = 1;
405		error = nbl_svmand(vp, fp->f_cred, &svmand);
406		if (error != 0)
407			goto out;
408		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
409		    NULL)) {
410			error = EACCES;
411			goto out;
412		}
413	}
414
415	aiov.iov_base = cbuf;
416	aiov.iov_len = bcount;
417	(void) VOP_RWLOCK(vp, rwflag, NULL);
418	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
419		struct vattr va;
420		va.va_mask = AT_SIZE;
421		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
422			VOP_RWUNLOCK(vp, rwflag, NULL);
423			goto out;
424		}
425		VOP_RWUNLOCK(vp, rwflag, NULL);
426
427		/*
428		 * We have to return EOF if fileoff is >= file size.
429		 */
430		if (fileoff >= va.va_size) {
431			bcount = 0;
432			goto out;
433		}
434
435		/*
436		 * File is greater than or equal to maxoff and therefore
437		 * we return EOVERFLOW.
438		 */
439		error = EOVERFLOW;
440		goto out;
441	}
442	auio.uio_loffset = fileoff;
443	auio.uio_iov = &aiov;
444	auio.uio_iovcnt = 1;
445	auio.uio_resid = bcount;
446	auio.uio_segflg = UIO_USERSPACE;
447	auio.uio_llimit = MAXOFFSET_T;
448	auio.uio_fmode = fflag;
449	auio.uio_extflg = UIO_COPY_CACHED;
450
451	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
452
453	/* If read sync is not asked for, filter sync flags */
454	if ((ioflag & FRSYNC) == 0)
455		ioflag &= ~(FSYNC|FDSYNC);
456	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
457	bcount -= auio.uio_resid;
458	CPU_STATS_ENTER_K();
459	cp = CPU;
460	CPU_STATS_ADDQ(cp, sys, sysread, 1);
461	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
462	CPU_STATS_EXIT_K();
463	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
464	VOP_RWUNLOCK(vp, rwflag, NULL);
465
466	if (error == EINTR && bcount != 0)
467		error = 0;
468out:
469	if (in_crit)
470		nbl_end_crit(vp);
471	releasef(fdes);
472	if (error)
473		return (set_errno(error));
474	return (bcount);
475}
476
477ssize_t
478pwrite(int fdes, void *cbuf, size_t count, off_t offset)
479{
480	struct uio auio;
481	struct iovec aiov;
482	file_t *fp;
483	register vnode_t *vp;
484	struct cpu *cp;
485	int fflag, ioflag, rwflag;
486	ssize_t bcount;
487	int error = 0;
488	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
489#ifdef _SYSCALL32_IMPL
490	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
491	    MAXOFF32_T : MAXOFFSET_T;
492#else
493	const u_offset_t maxoff = MAXOFF32_T;
494#endif
495	int in_crit = 0;
496
497	if ((bcount = (ssize_t)count) < 0)
498		return (set_errno(EINVAL));
499	if ((fp = getf(fdes)) == NULL)
500		return (set_errno(EBADF));
501	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
502		error = EBADF;
503		goto out;
504	}
505
506	rwflag = 1;
507	vp = fp->f_vnode;
508
509	if (vp->v_type == VREG) {
510
511		if (bcount == 0)
512			goto out;
513
514		/*
515		 * return EINVAL for offsets that cannot be
516		 * represented in an off_t.
517		 */
518		if (fileoff > maxoff) {
519			error = EINVAL;
520			goto out;
521		}
522		/*
523		 * Take appropriate action if we are trying to write above the
524		 * resource limit.
525		 */
526		if (fileoff >= curproc->p_fsz_ctl) {
527			mutex_enter(&curproc->p_lock);
528			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
529			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
530			mutex_exit(&curproc->p_lock);
531
532			error = EFBIG;
533			goto out;
534		}
535		/*
536		 * Don't allow pwrite to cause file sizes to exceed
537		 * maxoff.
538		 */
539		if (fileoff == maxoff) {
540			error = EFBIG;
541			goto out;
542		}
543		if (fileoff + count > maxoff)
544			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
545	} else if (vp->v_type == VFIFO) {
546		error = ESPIPE;
547		goto out;
548	}
549
550	/*
551	 * We have to enter the critical region before calling VOP_RWLOCK
552	 * to avoid a deadlock with ufs.
553	 */
554	if (nbl_need_check(vp)) {
555		int svmand;
556
557		nbl_start_crit(vp, RW_READER);
558		in_crit = 1;
559		error = nbl_svmand(vp, fp->f_cred, &svmand);
560		if (error != 0)
561			goto out;
562		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
563		    NULL)) {
564			error = EACCES;
565			goto out;
566		}
567	}
568
569	aiov.iov_base = cbuf;
570	aiov.iov_len = bcount;
571	(void) VOP_RWLOCK(vp, rwflag, NULL);
572	auio.uio_loffset = fileoff;
573	auio.uio_iov = &aiov;
574	auio.uio_iovcnt = 1;
575	auio.uio_resid = bcount;
576	auio.uio_segflg = UIO_USERSPACE;
577	auio.uio_llimit = curproc->p_fsz_ctl;
578	auio.uio_fmode = fflag;
579	auio.uio_extflg = UIO_COPY_CACHED;
580
581	/*
582	 * The SUSv4 POSIX specification states:
583	 *	The pwrite() function shall be equivalent to write(), except
584	 *	that it writes into a given position and does not change
585	 *	the file offset (regardless of whether O_APPEND is set).
586	 * To make this be true, we omit the FAPPEND flag from ioflag.
587	 */
588	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
589
590	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
591	bcount -= auio.uio_resid;
592	CPU_STATS_ENTER_K();
593	cp = CPU;
594	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
595	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
596	CPU_STATS_EXIT_K();
597	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
598	VOP_RWUNLOCK(vp, rwflag, NULL);
599
600	if (error == EINTR && bcount != 0)
601		error = 0;
602out:
603	if (in_crit)
604		nbl_end_crit(vp);
605	releasef(fdes);
606	if (error)
607		return (set_errno(error));
608	return (bcount);
609}
610
611ssize_t
612readv(int fdes, struct iovec *iovp, int iovcnt)
613{
614	struct uio auio;
615	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
616	int aiovlen = 0;
617	file_t *fp;
618	register vnode_t *vp;
619	struct cpu *cp;
620	int fflag, ioflag, rwflag;
621	ssize_t count, bcount;
622	int error = 0;
623	int i;
624	u_offset_t fileoff;
625	int in_crit = 0;
626
627	if (iovcnt <= 0 || iovcnt > IOV_MAX)
628		return (set_errno(EINVAL));
629
630	if (iovcnt > IOV_MAX_STACK) {
631		aiovlen = iovcnt * sizeof (iovec_t);
632		aiov = kmem_alloc(aiovlen, KM_SLEEP);
633	}
634
635#ifdef _SYSCALL32_IMPL
636	/*
637	 * 32-bit callers need to have their iovec expanded,
638	 * while ensuring that they can't move more than 2Gbytes
639	 * of data in a single call.
640	 */
641	if (get_udatamodel() == DATAMODEL_ILP32) {
642		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
643		int aiov32len;
644		ssize32_t count32;
645
646		aiov32len = iovcnt * sizeof (iovec32_t);
647		if (aiovlen != 0)
648			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
649
650		if (copyin(iovp, aiov32, aiov32len)) {
651			if (aiovlen != 0) {
652				kmem_free(aiov32, aiov32len);
653				kmem_free(aiov, aiovlen);
654			}
655			return (set_errno(EFAULT));
656		}
657
658		count32 = 0;
659		for (i = 0; i < iovcnt; i++) {
660			ssize32_t iovlen32 = aiov32[i].iov_len;
661			count32 += iovlen32;
662			if (iovlen32 < 0 || count32 < 0) {
663				if (aiovlen != 0) {
664					kmem_free(aiov32, aiov32len);
665					kmem_free(aiov, aiovlen);
666				}
667				return (set_errno(EINVAL));
668			}
669			aiov[i].iov_len = iovlen32;
670			aiov[i].iov_base =
671			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
672		}
673
674		if (aiovlen != 0)
675			kmem_free(aiov32, aiov32len);
676	} else
677#endif
678	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
679		if (aiovlen != 0)
680			kmem_free(aiov, aiovlen);
681		return (set_errno(EFAULT));
682	}
683
684	count = 0;
685	for (i = 0; i < iovcnt; i++) {
686		ssize_t iovlen = aiov[i].iov_len;
687		count += iovlen;
688		if (iovlen < 0 || count < 0) {
689			if (aiovlen != 0)
690				kmem_free(aiov, aiovlen);
691			return (set_errno(EINVAL));
692		}
693	}
694	if ((fp = getf(fdes)) == NULL) {
695		if (aiovlen != 0)
696			kmem_free(aiov, aiovlen);
697		return (set_errno(EBADF));
698	}
699	if (((fflag = fp->f_flag) & FREAD) == 0) {
700		error = EBADF;
701		goto out;
702	}
703	vp = fp->f_vnode;
704	if (vp->v_type == VREG && count == 0) {
705		goto out;
706	}
707
708	rwflag = 0;
709
710	/*
711	 * We have to enter the critical region before calling VOP_RWLOCK
712	 * to avoid a deadlock with ufs.
713	 */
714	if (nbl_need_check(vp)) {
715		int svmand;
716
717		nbl_start_crit(vp, RW_READER);
718		in_crit = 1;
719		error = nbl_svmand(vp, fp->f_cred, &svmand);
720		if (error != 0)
721			goto out;
722		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
723		    NULL)) {
724			error = EACCES;
725			goto out;
726		}
727	}
728
729	(void) VOP_RWLOCK(vp, rwflag, NULL);
730	fileoff = fp->f_offset;
731
732	/*
733	 * Behaviour is same as read. Please see comments in read.
734	 */
735
736	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
737		struct vattr va;
738		va.va_mask = AT_SIZE;
739		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
740			VOP_RWUNLOCK(vp, rwflag, NULL);
741			goto out;
742		}
743		if (fileoff >= va.va_size) {
744			VOP_RWUNLOCK(vp, rwflag, NULL);
745			count = 0;
746			goto out;
747		} else {
748			VOP_RWUNLOCK(vp, rwflag, NULL);
749			error = EOVERFLOW;
750			goto out;
751		}
752	}
753	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
754		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
755	}
756	auio.uio_loffset = fileoff;
757	auio.uio_iov = aiov;
758	auio.uio_iovcnt = iovcnt;
759	auio.uio_resid = bcount = count;
760	auio.uio_segflg = UIO_USERSPACE;
761	auio.uio_llimit = MAXOFFSET_T;
762	auio.uio_fmode = fflag;
763	if (bcount <= copyout_max_cached)
764		auio.uio_extflg = UIO_COPY_CACHED;
765	else
766		auio.uio_extflg = UIO_COPY_DEFAULT;
767
768
769	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
770
771	/* If read sync is not asked for, filter sync flags */
772	if ((ioflag & FRSYNC) == 0)
773		ioflag &= ~(FSYNC|FDSYNC);
774	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
775	count -= auio.uio_resid;
776	CPU_STATS_ENTER_K();
777	cp = CPU;
778	CPU_STATS_ADDQ(cp, sys, sysread, 1);
779	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
780	CPU_STATS_EXIT_K();
781	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
782
783	if (vp->v_type == VFIFO)	/* Backward compatibility */
784		fp->f_offset = count;
785	else if (((fp->f_flag & FAPPEND) == 0) ||
786	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
787		fp->f_offset = auio.uio_loffset;
788
789	VOP_RWUNLOCK(vp, rwflag, NULL);
790
791	if (error == EINTR && count != 0)
792		error = 0;
793out:
794	if (in_crit)
795		nbl_end_crit(vp);
796	releasef(fdes);
797	if (aiovlen != 0)
798		kmem_free(aiov, aiovlen);
799	if (error)
800		return (set_errno(error));
801	return (count);
802}
803
804ssize_t
805writev(int fdes, struct iovec *iovp, int iovcnt)
806{
807	struct uio auio;
808	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
809	int aiovlen = 0;
810	file_t *fp;
811	register vnode_t *vp;
812	struct cpu *cp;
813	int fflag, ioflag, rwflag;
814	ssize_t count, bcount;
815	int error = 0;
816	int i;
817	u_offset_t fileoff;
818	int in_crit = 0;
819
820	if (iovcnt <= 0 || iovcnt > IOV_MAX)
821		return (set_errno(EINVAL));
822
823	if (iovcnt > IOV_MAX_STACK) {
824		aiovlen = iovcnt * sizeof (iovec_t);
825		aiov = kmem_alloc(aiovlen, KM_SLEEP);
826	}
827
828#ifdef _SYSCALL32_IMPL
829	/*
830	 * 32-bit callers need to have their iovec expanded,
831	 * while ensuring that they can't move more than 2Gbytes
832	 * of data in a single call.
833	 */
834	if (get_udatamodel() == DATAMODEL_ILP32) {
835		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
836		int aiov32len;
837		ssize32_t count32;
838
839		aiov32len = iovcnt * sizeof (iovec32_t);
840		if (aiovlen != 0)
841			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
842
843		if (copyin(iovp, aiov32, aiov32len)) {
844			if (aiovlen != 0) {
845				kmem_free(aiov32, aiov32len);
846				kmem_free(aiov, aiovlen);
847			}
848			return (set_errno(EFAULT));
849		}
850
851		count32 = 0;
852		for (i = 0; i < iovcnt; i++) {
853			ssize32_t iovlen = aiov32[i].iov_len;
854			count32 += iovlen;
855			if (iovlen < 0 || count32 < 0) {
856				if (aiovlen != 0) {
857					kmem_free(aiov32, aiov32len);
858					kmem_free(aiov, aiovlen);
859				}
860				return (set_errno(EINVAL));
861			}
862			aiov[i].iov_len = iovlen;
863			aiov[i].iov_base =
864			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
865		}
866		if (aiovlen != 0)
867			kmem_free(aiov32, aiov32len);
868	} else
869#endif
870	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
871		if (aiovlen != 0)
872			kmem_free(aiov, aiovlen);
873		return (set_errno(EFAULT));
874	}
875
876	count = 0;
877	for (i = 0; i < iovcnt; i++) {
878		ssize_t iovlen = aiov[i].iov_len;
879		count += iovlen;
880		if (iovlen < 0 || count < 0) {
881			if (aiovlen != 0)
882				kmem_free(aiov, aiovlen);
883			return (set_errno(EINVAL));
884		}
885	}
886	if ((fp = getf(fdes)) == NULL) {
887		if (aiovlen != 0)
888			kmem_free(aiov, aiovlen);
889		return (set_errno(EBADF));
890	}
891	if (((fflag = fp->f_flag) & FWRITE) == 0) {
892		error = EBADF;
893		goto out;
894	}
895	vp = fp->f_vnode;
896	if (vp->v_type == VREG && count == 0) {
897		goto out;
898	}
899
900	rwflag = 1;
901
902	/*
903	 * We have to enter the critical region before calling VOP_RWLOCK
904	 * to avoid a deadlock with ufs.
905	 */
906	if (nbl_need_check(vp)) {
907		int svmand;
908
909		nbl_start_crit(vp, RW_READER);
910		in_crit = 1;
911		error = nbl_svmand(vp, fp->f_cred, &svmand);
912		if (error != 0)
913			goto out;
914		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
915		    NULL)) {
916			error = EACCES;
917			goto out;
918		}
919	}
920
921	(void) VOP_RWLOCK(vp, rwflag, NULL);
922
923	fileoff = fp->f_offset;
924
925	/*
926	 * Behaviour is same as write. Please see comments for write.
927	 */
928
929	if (vp->v_type == VREG) {
930		if (fileoff >= curproc->p_fsz_ctl) {
931			VOP_RWUNLOCK(vp, rwflag, NULL);
932			mutex_enter(&curproc->p_lock);
933			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
934			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
935			mutex_exit(&curproc->p_lock);
936			error = EFBIG;
937			goto out;
938		}
939		if (fileoff >= OFFSET_MAX(fp)) {
940			VOP_RWUNLOCK(vp, rwflag, NULL);
941			error = EFBIG;
942			goto out;
943		}
944		if (fileoff + count > OFFSET_MAX(fp))
945			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
946	}
947	auio.uio_loffset = fileoff;
948	auio.uio_iov = aiov;
949	auio.uio_iovcnt = iovcnt;
950	auio.uio_resid = bcount = count;
951	auio.uio_segflg = UIO_USERSPACE;
952	auio.uio_llimit = curproc->p_fsz_ctl;
953	auio.uio_fmode = fflag;
954	auio.uio_extflg = UIO_COPY_DEFAULT;
955
956	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
957
958	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
959	count -= auio.uio_resid;
960	CPU_STATS_ENTER_K();
961	cp = CPU;
962	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
963	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
964	CPU_STATS_EXIT_K();
965	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
966
967	if (vp->v_type == VFIFO)	/* Backward compatibility */
968		fp->f_offset = count;
969	else if (((fp->f_flag & FAPPEND) == 0) ||
970	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
971		fp->f_offset = auio.uio_loffset;
972	VOP_RWUNLOCK(vp, rwflag, NULL);
973
974	if (error == EINTR && count != 0)
975		error = 0;
976out:
977	if (in_crit)
978		nbl_end_crit(vp);
979	releasef(fdes);
980	if (aiovlen != 0)
981		kmem_free(aiov, aiovlen);
982	if (error)
983		return (set_errno(error));
984	return (count);
985}
986
987ssize_t
988preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
989    off_t extended_offset)
990{
991	struct uio auio;
992	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
993	int aiovlen = 0;
994	file_t *fp;
995	register vnode_t *vp;
996	struct cpu *cp;
997	int fflag, ioflag, rwflag;
998	ssize_t count, bcount;
999	int error = 0;
1000	int i;
1001
1002	/*
1003	 * In a 64-bit kernel, this interface supports native 64-bit
1004	 * applications as well as 32-bit applications using both standard and
1005	 * large-file access. For 32-bit large-file aware applications, the
1006	 * offset is passed as two parameters which are joined into the actual
1007	 * offset used. The 64-bit libc always passes 0 for the extended_offset.
1008	 * Note that off_t is a signed value, but the preadv/pwritev API treats
1009	 * the offset as a position in the file for the operation, so passing
1010	 * a negative value will likely fail the maximum offset checks below
1011	 * because we convert it to an unsigned value which will be larger than
1012	 * the maximum valid offset.
1013	 */
1014#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1015	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1016	    (u_offset_t)offset;
1017#else /* _SYSCALL32_IMPL || _ILP32 */
1018	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1019#endif /* _SYSCALL32_IMPR || _ILP32 */
1020
1021	int in_crit = 0;
1022
1023	if (iovcnt <= 0 || iovcnt > IOV_MAX)
1024		return (set_errno(EINVAL));
1025
1026	if (iovcnt > IOV_MAX_STACK) {
1027		aiovlen = iovcnt * sizeof (iovec_t);
1028		aiov = kmem_alloc(aiovlen, KM_SLEEP);
1029	}
1030
1031#ifdef _SYSCALL32_IMPL
1032	/*
1033	 * 32-bit callers need to have their iovec expanded,
1034	 * while ensuring that they can't move more than 2Gbytes
1035	 * of data in a single call.
1036	 */
1037	if (get_udatamodel() == DATAMODEL_ILP32) {
1038		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1039		int aiov32len;
1040		ssize32_t count32;
1041
1042		aiov32len = iovcnt * sizeof (iovec32_t);
1043		if (aiovlen != 0)
1044			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1045
1046		if (copyin(iovp, aiov32, aiov32len)) {
1047			if (aiovlen != 0) {
1048				kmem_free(aiov32, aiov32len);
1049				kmem_free(aiov, aiovlen);
1050			}
1051			return (set_errno(EFAULT));
1052		}
1053
1054		count32 = 0;
1055		for (i = 0; i < iovcnt; i++) {
1056			ssize32_t iovlen32 = aiov32[i].iov_len;
1057			count32 += iovlen32;
1058			if (iovlen32 < 0 || count32 < 0) {
1059				if (aiovlen != 0) {
1060					kmem_free(aiov32, aiov32len);
1061					kmem_free(aiov, aiovlen);
1062				}
1063				return (set_errno(EINVAL));
1064			}
1065			aiov[i].iov_len = iovlen32;
1066			aiov[i].iov_base =
1067			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1068		}
1069		if (aiovlen != 0)
1070			kmem_free(aiov32, aiov32len);
1071	} else
1072#endif /* _SYSCALL32_IMPL */
1073		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1074			if (aiovlen != 0)
1075				kmem_free(aiov, aiovlen);
1076			return (set_errno(EFAULT));
1077		}
1078
1079	count = 0;
1080	for (i = 0; i < iovcnt; i++) {
1081		ssize_t iovlen = aiov[i].iov_len;
1082		count += iovlen;
1083		if (iovlen < 0 || count < 0) {
1084			if (aiovlen != 0)
1085				kmem_free(aiov, aiovlen);
1086			return (set_errno(EINVAL));
1087		}
1088	}
1089
1090	if ((bcount = count) < 0) {
1091		if (aiovlen != 0)
1092			kmem_free(aiov, aiovlen);
1093		return (set_errno(EINVAL));
1094	}
1095	if ((fp = getf(fdes)) == NULL) {
1096		if (aiovlen != 0)
1097			kmem_free(aiov, aiovlen);
1098		return (set_errno(EBADF));
1099	}
1100	if (((fflag = fp->f_flag) & FREAD) == 0) {
1101		error = EBADF;
1102		goto out;
1103	}
1104	vp = fp->f_vnode;
1105	rwflag = 0;
1106
1107	/*
1108	 * Behaviour is same as read(2). Please see comments in read above.
1109	 */
1110	if (vp->v_type == VREG) {
1111		if (bcount == 0)
1112			goto out;
1113
1114		/* Handle offset past maximum offset allowed for file. */
1115		if (fileoff >= OFFSET_MAX(fp)) {
1116			struct vattr va;
1117			va.va_mask = AT_SIZE;
1118
1119			error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
1120			if (error == 0)  {
1121				if (fileoff >= va.va_size) {
1122					count = 0;
1123				} else {
1124					error = EOVERFLOW;
1125				}
1126			}
1127			goto out;
1128		}
1129
1130		ASSERT(bcount == count);
1131
1132		/* Note: modified count used in nbl_conflict() call below. */
1133		if ((fileoff + count) > OFFSET_MAX(fp))
1134			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1135
1136	} else if (vp->v_type == VFIFO) {
1137		error = ESPIPE;
1138		goto out;
1139	}
1140	/*
1141	 * We have to enter the critical region before calling VOP_RWLOCK
1142	 * to avoid a deadlock with ufs.
1143	 */
1144	if (nbl_need_check(vp)) {
1145		int svmand;
1146
1147		nbl_start_crit(vp, RW_READER);
1148		in_crit = 1;
1149		error = nbl_svmand(vp, fp->f_cred, &svmand);
1150		if (error != 0)
1151			goto out;
1152		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1153			error = EACCES;
1154			goto out;
1155		}
1156	}
1157
1158	(void) VOP_RWLOCK(vp, rwflag, NULL);
1159
1160	auio.uio_loffset = fileoff;
1161	auio.uio_iov = aiov;
1162	auio.uio_iovcnt = iovcnt;
1163	auio.uio_resid = bcount = count;
1164	auio.uio_segflg = UIO_USERSPACE;
1165	auio.uio_llimit = MAXOFFSET_T;
1166	auio.uio_fmode = fflag;
1167	if (bcount <= copyout_max_cached)
1168		auio.uio_extflg = UIO_COPY_CACHED;
1169	else
1170		auio.uio_extflg = UIO_COPY_DEFAULT;
1171
1172	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1173	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1174	count -= auio.uio_resid;
1175	CPU_STATS_ENTER_K();
1176	cp = CPU;
1177	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1178	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1179	CPU_STATS_EXIT_K();
1180	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1181
1182	VOP_RWUNLOCK(vp, rwflag, NULL);
1183
1184	if (error == EINTR && count != 0)
1185		error = 0;
1186out:
1187	if (in_crit)
1188		nbl_end_crit(vp);
1189	releasef(fdes);
1190	if (aiovlen != 0)
1191		kmem_free(aiov, aiovlen);
1192	if (error)
1193		return (set_errno(error));
1194	return (count);
1195}
1196
/*
 * pwritev: gather-write the iovcnt buffers described by the user-space
 * array iovp to the file open on fdes, starting at an explicit offset.
 * fp->f_offset is neither read nor updated by this function.
 *
 * When built with _SYSCALL32_IMPL or _ILP32, the starting offset is
 * (extended_offset << 32) | offset; otherwise only offset is used.
 *
 * Up to IOV_MAX_STACK iovecs are staged in the on-stack array buf; larger
 * requests use a kmem_alloc()'d array.  aiovlen is non-zero exactly when
 * the heap array is in use, and every exit path frees it in that case.
 *
 * Returns the number of bytes written, or the result of set_errno() with:
 *	EINVAL	iovcnt is not in [1, IOV_MAX], an iov_len is negative, or
 *		the running total of the lengths goes negative
 *	EFAULT	copyin() of the iovec array failed
 *	EBADF	getf() found no file for fdes, or the file lacks FWRITE
 *	EFBIG	(VREG only) the offset is at or beyond OFFSET_MAX(fp) or
 *		curproc->p_fsz_ctl
 *	ESPIPE	the vnode is a VFIFO
 *	EACCES	nbl_conflict() reports a conflict for the target range
 * or any error returned by nbl_svmand() or VOP_WRITE().  An EINTR from
 * VOP_WRITE() is dropped when at least one byte was transferred.
 */
ssize_t
pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
    off_t extended_offset)
{
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;

	/*
	 * See the comment in preadv for how the offset is handled.
	 */
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
	    (u_offset_t)offset;
#else /* _SYSCALL32_IMPL || _ILP32 */
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#endif /* _SYSCALL32_IMPL || _ILP32 */

	int in_crit = 0;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	/* Too many iovecs for the stack array: stage them on the heap. */
	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		/*
		 * aiovlen != 0 means aiov is heap-allocated, i.e. iovcnt
		 * exceeds IOV_MAX_STACK, so the 32-bit staging array must
		 * come from the heap as well.
		 */
		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/*
		 * Widen each 32-bit iovec into aiov, rejecting a negative
		 * length or a running 32-bit total that has gone negative.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif /* _SYSCALL32_IMPL */
		/*
		 * Native-model caller.  When _SYSCALL32_IMPL is defined this
		 * statement is the body of the `else' just above.
		 */
		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EFAULT));
		}

	/*
	 * Total the request size, rejecting a negative length or a running
	 * total that has gone negative.
	 */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}

	/*
	 * The loop above has already rejected a negative total; this
	 * statement also records the requested size in bcount.
	 */
	if ((bcount = count) < 0) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EINVAL));
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	/* From here on, all exits go through `out' so releasef() is called. */
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	rwflag = 1;	/* passed to VOP_RWLOCK() and VOP_RWUNLOCK() below */

	/*
	 * The kernel's write(2) code checks OFFSET_MAX and the rctl, and
	 * returns EFBIG when fileoff exceeds either limit. We do the same.
	 */
	if (vp->v_type == VREG) {
		/* Zero-length write to a regular file: return 0 right away. */
		if (bcount == 0)
			goto out;

		/*
		 * Don't allow pwritev to cause file size to exceed the proper
		 * offset limit.
		 */
		if (fileoff >= OFFSET_MAX(fp)) {
			error = EFBIG;
			goto out;
		}

		/*
		 * Take appropriate action if we are trying
		 * to write above the resource limit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			/*
			 * Return value ignored because it lists
			 * actions taken, but we are in an error case.
			 * We don't have any actions that depend on
			 * what could happen in this call, so we ignore
			 * the return value.
			 */
			(void) rctl_action(
			    rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc,
			    RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}

		ASSERT(bcount == count);

		/* Note: modified count used in nbl_conflict() call below. */
		if ((fileoff + count) > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);

	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}
	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;	/* tells `out' to call nbl_end_crit() */
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/*
	 * Describe the transfer.  count may have been clamped above, so
	 * bcount is brought back in step with it here.
	 */
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	/* Only the sync flags are passed down; FAPPEND is not in the mask. */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	/* What is left in uio_resid was not written. */
	count -= auio.uio_resid;
	/* Account for one write call and the bytes transferred. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* An interrupted write that moved some data is reported as success. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	/* Common exit: undo whatever was acquired, then report the result. */
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
1410
1411#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1412
1413/*
1414 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1415 */
1416ssize32_t
1417pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1418    uint32_t offset_2)
1419{
1420	struct uio auio;
1421	struct iovec aiov;
1422	file_t *fp;
1423	register vnode_t *vp;
1424	struct cpu *cp;
1425	int fflag, ioflag, rwflag;
1426	ssize_t bcount;
1427	int error = 0;
1428	u_offset_t fileoff;
1429	int in_crit = 0;
1430
1431#if defined(_LITTLE_ENDIAN)
1432	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1433#else
1434	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1435#endif
1436
1437	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1438		return (set_errno(EINVAL));
1439
1440	if ((fp = getf(fdes)) == NULL)
1441		return (set_errno(EBADF));
1442	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1443		error = EBADF;
1444		goto out;
1445	}
1446
1447	rwflag = 0;
1448	vp = fp->f_vnode;
1449
1450	if (vp->v_type == VREG) {
1451
1452		if (bcount == 0)
1453			goto out;
1454
1455		/*
1456		 * Same as pread. See comments in pread.
1457		 */
1458
1459		if (fileoff > MAXOFFSET_T) {
1460			error = EINVAL;
1461			goto out;
1462		}
1463		if (fileoff + bcount > MAXOFFSET_T)
1464			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1465	} else if (vp->v_type == VFIFO) {
1466		error = ESPIPE;
1467		goto out;
1468	}
1469
1470	/*
1471	 * We have to enter the critical region before calling VOP_RWLOCK
1472	 * to avoid a deadlock with ufs.
1473	 */
1474	if (nbl_need_check(vp)) {
1475		int svmand;
1476
1477		nbl_start_crit(vp, RW_READER);
1478		in_crit = 1;
1479		error = nbl_svmand(vp, fp->f_cred, &svmand);
1480		if (error != 0)
1481			goto out;
1482		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1483		    NULL)) {
1484			error = EACCES;
1485			goto out;
1486		}
1487	}
1488
1489	aiov.iov_base = cbuf;
1490	aiov.iov_len = bcount;
1491	(void) VOP_RWLOCK(vp, rwflag, NULL);
1492	auio.uio_loffset = fileoff;
1493
1494	/*
1495	 * Note: File size can never be greater than MAXOFFSET_T.
1496	 * If ever we start supporting 128 bit files the code
1497	 * similar to the one in pread at this place should be here.
1498	 * Here we avoid the unnecessary VOP_GETATTR() when we
1499	 * know that fileoff == MAXOFFSET_T implies that it is always
1500	 * greater than or equal to file size.
1501	 */
1502	auio.uio_iov = &aiov;
1503	auio.uio_iovcnt = 1;
1504	auio.uio_resid = bcount;
1505	auio.uio_segflg = UIO_USERSPACE;
1506	auio.uio_llimit = MAXOFFSET_T;
1507	auio.uio_fmode = fflag;
1508	auio.uio_extflg = UIO_COPY_CACHED;
1509
1510	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1511
1512	/* If read sync is not asked for, filter sync flags */
1513	if ((ioflag & FRSYNC) == 0)
1514		ioflag &= ~(FSYNC|FDSYNC);
1515	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1516	bcount -= auio.uio_resid;
1517	CPU_STATS_ENTER_K();
1518	cp = CPU;
1519	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1520	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1521	CPU_STATS_EXIT_K();
1522	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1523	VOP_RWUNLOCK(vp, rwflag, NULL);
1524
1525	if (error == EINTR && bcount != 0)
1526		error = 0;
1527out:
1528	if (in_crit)
1529		nbl_end_crit(vp);
1530	releasef(fdes);
1531	if (error)
1532		return (set_errno(error));
1533	return (bcount);
1534}
1535
1536/*
1537 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1538 */
1539ssize32_t
1540pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1541    uint32_t offset_2)
1542{
1543	struct uio auio;
1544	struct iovec aiov;
1545	file_t *fp;
1546	register vnode_t *vp;
1547	struct cpu *cp;
1548	int fflag, ioflag, rwflag;
1549	ssize_t bcount;
1550	int error = 0;
1551	u_offset_t fileoff;
1552	int in_crit = 0;
1553
1554#if defined(_LITTLE_ENDIAN)
1555	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1556#else
1557	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1558#endif
1559
1560	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1561		return (set_errno(EINVAL));
1562	if ((fp = getf(fdes)) == NULL)
1563		return (set_errno(EBADF));
1564	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1565		error = EBADF;
1566		goto out;
1567	}
1568
1569	rwflag = 1;
1570	vp = fp->f_vnode;
1571
1572	if (vp->v_type == VREG) {
1573
1574		if (bcount == 0)
1575			goto out;
1576
1577		/*
1578		 * See comments in pwrite.
1579		 */
1580		if (fileoff > MAXOFFSET_T) {
1581			error = EINVAL;
1582			goto out;
1583		}
1584		if (fileoff >= curproc->p_fsz_ctl) {
1585			mutex_enter(&curproc->p_lock);
1586			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1587			    curproc->p_rctls, curproc, RCA_SAFE);
1588			mutex_exit(&curproc->p_lock);
1589			error = EFBIG;
1590			goto out;
1591		}
1592		if (fileoff == MAXOFFSET_T) {
1593			error = EFBIG;
1594			goto out;
1595		}
1596		if (fileoff + bcount > MAXOFFSET_T)
1597			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1598	} else if (vp->v_type == VFIFO) {
1599		error = ESPIPE;
1600		goto out;
1601	}
1602
1603	/*
1604	 * We have to enter the critical region before calling VOP_RWLOCK
1605	 * to avoid a deadlock with ufs.
1606	 */
1607	if (nbl_need_check(vp)) {
1608		int svmand;
1609
1610		nbl_start_crit(vp, RW_READER);
1611		in_crit = 1;
1612		error = nbl_svmand(vp, fp->f_cred, &svmand);
1613		if (error != 0)
1614			goto out;
1615		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1616		    NULL)) {
1617			error = EACCES;
1618			goto out;
1619		}
1620	}
1621
1622	aiov.iov_base = cbuf;
1623	aiov.iov_len = bcount;
1624	(void) VOP_RWLOCK(vp, rwflag, NULL);
1625	auio.uio_loffset = fileoff;
1626	auio.uio_iov = &aiov;
1627	auio.uio_iovcnt = 1;
1628	auio.uio_resid = bcount;
1629	auio.uio_segflg = UIO_USERSPACE;
1630	auio.uio_llimit = curproc->p_fsz_ctl;
1631	auio.uio_fmode = fflag;
1632	auio.uio_extflg = UIO_COPY_CACHED;
1633
1634	/*
1635	 * The SUSv4 POSIX specification states:
1636	 *	The pwrite() function shall be equivalent to write(), except
1637	 *	that it writes into a given position and does not change
1638	 *	the file offset (regardless of whether O_APPEND is set).
1639	 * To make this be true, we omit the FAPPEND flag from ioflag.
1640	 */
1641	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1642
1643	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1644	bcount -= auio.uio_resid;
1645	CPU_STATS_ENTER_K();
1646	cp = CPU;
1647	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1648	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1649	CPU_STATS_EXIT_K();
1650	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1651	VOP_RWUNLOCK(vp, rwflag, NULL);
1652
1653	if (error == EINTR && bcount != 0)
1654		error = 0;
1655out:
1656	if (in_crit)
1657		nbl_end_crit(vp);
1658	releasef(fdes);
1659	if (error)
1660		return (set_errno(error));
1661	return (bcount);
1662}
1663
1664#endif	/* _SYSCALL32_IMPL || _ILP32 */
1665
1666#ifdef _SYSCALL32_IMPL
1667/*
1668 * Tail-call elimination of xxx32() down to xxx()
1669 *
1670 * A number of xxx32 system calls take a len (or count) argument and
1671 * return a number in the range [0,len] or -1 on error.
1672 * Given an ssize32_t input len, the downcall xxx() will return
1673 * a 64-bit value that is -1 or in the range [0,len] which actually
1674 * is a proper return value for the xxx32 call. So even if the xxx32
1675 * calls can be considered as returning a ssize32_t, they are currently
1676 * declared as returning a ssize_t as this enables tail-call elimination.
1677 *
1678 * The cast of len (or count) to ssize32_t is needed to ensure we pass
1679 * down negative input values as such and let the downcall handle error
 * reporting. Functions covered by this comment are:
1681 *
1682 * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1683 * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1684 * readlink.c:     readlink32.
1685 */
1686
1687ssize_t
1688read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1689{
1690	return (read(fdes,
1691	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1692}
1693
1694ssize_t
1695write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1696{
1697	return (write(fdes,
1698	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1699}
1700
1701ssize_t
1702pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1703{
1704	return (pread(fdes,
1705	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1706	    (off_t)(uint32_t)offset));
1707}
1708
1709ssize_t
1710pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1711{
1712	return (pwrite(fdes,
1713	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1714	    (off_t)(uint32_t)offset));
1715}
1716
1717ssize_t
1718readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1719{
1720	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1721}
1722
1723ssize_t
1724writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1725{
1726	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1727}
1728#endif	/* _SYSCALL32_IMPL */
1729