1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
26 */
27
28/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
29/*	  All Rights Reserved  	*/
30
31/*
32 * Portions of this source code were derived from Berkeley 4.3 BSD
33 * under license from the Regents of the University of California.
34 */
35
36#include <sys/param.h>
37#include <sys/isa_defs.h>
38#include <sys/types.h>
39#include <sys/inttypes.h>
40#include <sys/sysmacros.h>
41#include <sys/cred.h>
42#include <sys/user.h>
43#include <sys/systm.h>
44#include <sys/errno.h>
45#include <sys/vnode.h>
46#include <sys/file.h>
47#include <sys/proc.h>
48#include <sys/cpuvar.h>
49#include <sys/uio.h>
50#include <sys/debug.h>
51#include <sys/rctl.h>
52#include <sys/nbmlock.h>
53
/*
 * Reads of up to this many bytes are copied out with the normal cached
 * policy (UIO_COPY_CACHED); larger reads fall back to UIO_COPY_DEFAULT.
 * See the uio_extflg selection in read()/readv()/preadv().
 */
#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */

size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
57
58/*
59 * read, write, pread, pwrite, readv, and writev syscalls.
60 *
61 * 64-bit open:	all open's are large file opens.
62 * Large Files: the behaviour of read depends on whether the fd
63 *		corresponds to large open or not.
64 * 32-bit open:	FOFFMAX flag not set.
65 *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
66 *		EOVERFLOW if count is non-zero and if size of file
67 *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
68 *		at >= MAXOFF32_T returns EOF.
69 */
70
71/*
72 * Native system call
73 */
/*
 * read(2): read up to 'count' bytes from the file referenced by 'fdes'
 * into the user buffer 'cbuf', starting at the file's current offset
 * and advancing f_offset by the number of bytes transferred.
 * Returns the byte count on success, or -1 with errno set.
 */
ssize_t
read(int fdes, void *cbuf, size_t count)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t cnt, bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((cnt = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;

	if (vp->v_type == VREG && cnt == 0) {
		/* Zero-length read of a regular file: return 0 immediately. */
		goto out;
	}

	rwflag = 0;	/* 0 == reader for VOP_RWLOCK/VOP_RWUNLOCK */
	aiov.iov_base = cbuf;
	aiov.iov_len = cnt;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write() calls.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the read. */
		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/*
	 * We do the following checks inside VOP_RWLOCK so as to
	 * prevent file size from changing while these checks are
	 * being done. Also, we load fp's offset to the local
	 * variable fileoff because we can have a parallel lseek
	 * going on (f_offset is not protected by any lock) which
	 * could change f_offset. We need to see the value only
	 * once here and take a decision. Seeing it more than once
	 * can lead to incorrect functionality.
	 */

	fileoff = (u_offset_t)fp->f_offset;
	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		if (fileoff >= va.va_size) {
			/* At or past EOF: report end-of-file (0 bytes). */
			cnt = 0;
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		} else {
			/*
			 * Offset is at this fd's limit but the file is
			 * larger; a non-large-file open cannot see the
			 * data, so fail rather than silently truncate.
			 */
			error = EOVERFLOW;
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
	}
	/* Clamp the transfer so it ends at this fd's offset maximum. */
	if ((vp->v_type == VREG) &&
	    (fileoff + cnt > OFFSET_MAX(fp))) {
		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount = cnt;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	/*
	 * Only use bypass caches when the count is large enough
	 */
	if (bcount <= copyout_max_cached)
		auio.uio_extflg = UIO_COPY_CACHED;
	else
		auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	cnt -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = cnt;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && cnt != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (cnt);
}
206
207/*
208 * Native system call
209 */
/*
 * write(2): write up to 'count' bytes from 'cbuf' to the file
 * referenced by 'fdes' at its current offset (FAPPEND is passed
 * through in ioflag so the filesystem applies O_APPEND semantics).
 * Returns the byte count on success, or -1 with errno set.
 */
ssize_t
write(int fdes, void *cbuf, size_t count)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t cnt, bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	/* A count that doesn't fit in ssize_t is invalid. */
	if ((cnt = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;

	if (vp->v_type == VREG && cnt == 0) {
		/* Zero-length write of a regular file: return 0 immediately. */
		goto out;
	}

	rwflag = 1;	/* 1 == writer for VOP_RWLOCK/VOP_RWUNLOCK */
	aiov.iov_base = cbuf;
	aiov.iov_len = cnt;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the write. */
		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/* Sample f_offset once; a parallel lseek() may race (see read()). */
	fileoff = fp->f_offset;
	if (vp->v_type == VREG) {

		/*
		 * We raise psignal if write for >0 bytes causes
		 * it to exceed the ulimit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);

			/* Fire the RLIMIT_FSIZE rctl action, then fail. */
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * We return EFBIG if write is done at an offset
		 * greater than the offset maximum for this file structure.
		 */

		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		/*
		 * Limit the bytes to be written  upto offset maximum for
		 * this open file structure.
		 */
		if (fileoff + cnt > OFFSET_MAX(fp))
			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount = cnt;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;	/* file-size resource limit */
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	cnt -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = cnt;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && cnt != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (cnt);
}
335
/*
 * pread(2): read at an explicit file offset; f_offset is neither
 * consulted nor updated. Not supported on pipes/FIFOs (ESPIPE).
 * Returns bytes read on success, or -1 with errno set.
 */
ssize_t
pread(int fdes, void *cbuf, size_t count, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
	/* Largest offset representable in the caller's data model. */
#ifdef _SYSCALL32_IMPL
	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	if ((bcount = (ssize_t)count) < 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
		error = EBADF;
		goto out;
	}

	rwflag = 0;	/* 0 == reader for VOP_RWLOCK/VOP_RWUNLOCK */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * Return EINVAL if an invalid offset comes to pread.
		 * Negative offset from user will cause this error.
		 */

		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Limit offset such that we don't read or write
		 * a file beyond the maximum offset representable in
		 * an off_t structure.
		 */
		if (fileoff + bcount > maxoff)
			bcount = (ssize_t)((offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the read. */
		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		VOP_RWUNLOCK(vp, rwflag, NULL);

		/*
		 * We have to return EOF if fileoff is >= file size.
		 */
		if (fileoff >= va.va_size) {
			bcount = 0;
			goto out;
		}

		/*
		 * File is greater than or equal to maxoff and therefore
		 * we return EOVERFLOW.
		 */
		error = EOVERFLOW;
		goto out;
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	bcount -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	/* Note: f_offset is deliberately not updated (positional read). */
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
475
/*
 * pwrite(2): write at an explicit file offset; f_offset is neither
 * consulted nor updated, and FAPPEND is deliberately ignored (SUSv4).
 * Not supported on pipes/FIFOs (ESPIPE).
 * Returns bytes written on success, or -1 with errno set.
 */
ssize_t
pwrite(int fdes, void *cbuf, size_t count, off_t offset)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
	/* Largest offset representable in the caller's data model. */
#ifdef _SYSCALL32_IMPL
	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	if ((bcount = (ssize_t)count) < 0)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	rwflag = 1;	/* 1 == writer for VOP_RWLOCK/VOP_RWUNLOCK */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * return EINVAL for offsets that cannot be
		 * represented in an off_t.
		 */
		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Take appropriate action if we are trying to write above the
		 * resource limit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			/* Fire the RLIMIT_FSIZE rctl action, then fail. */
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * Don't allow pwrite to cause file sizes to exceed
		 * maxoff.
		 */
		if (fileoff == maxoff) {
			error = EFBIG;
			goto out;
		}
		/* Clamp to maxoff (count == bcount here, so either works). */
		if (fileoff + count > maxoff)
			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		error = ESPIPE;
		goto out;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the write. */
		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;	/* file-size resource limit */
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	/*
	 * The SUSv4 POSIX specification states:
	 *	The pwrite() function shall be equivalent to write(), except
	 *	that it writes into a given position and does not change
	 *	the file offset (regardless of whether O_APPEND is set).
	 * To make this be true, we omit the FAPPEND flag from ioflag.
	 */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	bcount -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	/* Note: f_offset is deliberately not updated (positional write). */
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
609
610/*
611 * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
612 * XXX -- However, SVVS expects readv() and writev() to fail if
613 * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
614 * XXX -- so I guess that's the "interface".
615 */
#define	DEF_IOV_MAX	16	/* max iovec entries per readv/writev call */
617
/*
 * readv(2): scatter read into up to DEF_IOV_MAX user iovecs at the
 * file's current offset, advancing f_offset by the bytes transferred.
 * Returns total bytes read on success, or -1 with errno set.
 */
ssize_t
readv(int fdes, struct iovec *iovp, int iovcnt)
{
	struct uio auio;
	struct iovec aiov[DEF_IOV_MAX];	/* kernel copy of the user iovecs */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
		return (set_errno(EINVAL));

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 aiov32[DEF_IOV_MAX];
		ssize32_t count32;

		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		/* Reject negative lengths and totals that exceed 2GB. */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif
	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
		return (set_errno(EFAULT));

	/* Total the request, rejecting negative lengths and overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0)
			return (set_errno(EINVAL));
	}
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp->v_type == VREG && count == 0) {
		/* Zero-length read of a regular file: return 0 immediately. */
		goto out;
	}

	rwflag = 0;	/* 0 == reader for VOP_RWLOCK/VOP_RWUNLOCK */

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the read. */
		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);
	/* Sample f_offset once; a parallel lseek() may race (see read()). */
	fileoff = fp->f_offset;

	/*
	 * Behaviour is same as read. Please see comments in read.
	 */

	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
		struct vattr va;
		va.va_mask = AT_SIZE;
		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			goto out;
		}
		if (fileoff >= va.va_size) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			count = 0;	/* at or past EOF */
			goto out;
		} else {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EOVERFLOW;	/* data beyond the fd's limit */
			goto out;
		}
	}
	/* Clamp the transfer so it ends at this fd's offset maximum. */
	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	/* Small transfers use cached copyout; see copyout_max_cached. */
	if (bcount <= copyout_max_cached)
		auio.uio_extflg = UIO_COPY_CACHED;
	else
		auio.uio_extflg = UIO_COPY_DEFAULT;


	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the read in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = count;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;

	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (count);
}
775
/*
 * writev(2): gather write from up to DEF_IOV_MAX user iovecs at the
 * file's current offset, advancing f_offset by the bytes transferred.
 * Returns total bytes written on success, or -1 with errno set.
 */
ssize_t
writev(int fdes, struct iovec *iovp, int iovcnt)
{
	struct uio auio;
	struct iovec aiov[DEF_IOV_MAX];	/* kernel copy of the user iovecs */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
		return (set_errno(EINVAL));

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 aiov32[DEF_IOV_MAX];
		ssize32_t count32;

		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		/* Reject negative lengths and totals that exceed 2GB. */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen = aiov32[i].iov_len;
			count32 += iovlen;
			if (iovlen < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif
	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
		return (set_errno(EFAULT));

	/* Total the request, rejecting negative lengths and overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0)
			return (set_errno(EINVAL));
	}
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	if (vp->v_type == VREG && count == 0) {
		/* Zero-length write of a regular file: return 0 immediately. */
		goto out;
	}

	rwflag = 1;	/* 1 == writer for VOP_RWLOCK/VOP_RWUNLOCK */

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* A conflicting non-blocking mandatory lock denies the write. */
		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/* Sample f_offset once; a parallel lseek() may race (see read()). */
	fileoff = fp->f_offset;

	/*
	 * Behaviour is same as write. Please see comments for write.
	 */

	if (vp->v_type == VREG) {
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			/* Fire the RLIMIT_FSIZE rctl action, then fail. */
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		/* Clamp the transfer so it ends at this fd's offset maximum. */
		if (fileoff + count > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;	/* file-size resource limit */
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */
	/* Account the write in per-CPU and per-LWP statistics. */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = count;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A partial transfer interrupted by a signal still succeeds. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (count);
}
924
925ssize_t
926preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
927    off_t extended_offset)
928{
929	struct uio auio;
930	struct iovec aiov[DEF_IOV_MAX];
931	file_t *fp;
932	register vnode_t *vp;
933	struct cpu *cp;
934	int fflag, ioflag, rwflag;
935	ssize_t count, bcount;
936	int error = 0;
937	int i;
938
939#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
940	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
941	    (u_offset_t)offset;
942#else /* _SYSCALL32_IMPL || _ILP32 */
943	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
944#endif /* _SYSCALL32_IMPR || _ILP32 */
945#ifdef _SYSCALL32_IMPL
946	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
947	    extended_offset == 0?
948	    MAXOFF32_T : MAXOFFSET_T;
949#else /* _SYSCALL32_IMPL */
950	const u_offset_t maxoff = MAXOFF32_T;
951#endif /* _SYSCALL32_IMPL */
952
953	int in_crit = 0;
954
955	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
956		return (set_errno(EINVAL));
957
958#ifdef _SYSCALL32_IMPL
959	/*
960	 * 32-bit callers need to have their iovec expanded,
961	 * while ensuring that they can't move more than 2Gbytes
962	 * of data in a single call.
963	 */
964	if (get_udatamodel() == DATAMODEL_ILP32) {
965		struct iovec32 aiov32[DEF_IOV_MAX];
966		ssize32_t count32;
967
968		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
969			return (set_errno(EFAULT));
970
971		count32 = 0;
972		for (i = 0; i < iovcnt; i++) {
973			ssize32_t iovlen32 = aiov32[i].iov_len;
974			count32 += iovlen32;
975			if (iovlen32 < 0 || count32 < 0)
976				return (set_errno(EINVAL));
977			aiov[i].iov_len = iovlen32;
978			aiov[i].iov_base =
979			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
980		}
981	} else
982#endif /* _SYSCALL32_IMPL */
983		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
984			return (set_errno(EFAULT));
985
986	count = 0;
987	for (i = 0; i < iovcnt; i++) {
988		ssize_t iovlen = aiov[i].iov_len;
989		count += iovlen;
990		if (iovlen < 0 || count < 0)
991			return (set_errno(EINVAL));
992	}
993
994	if ((bcount = (ssize_t)count) < 0)
995		return (set_errno(EINVAL));
996	if ((fp = getf(fdes)) == NULL)
997		return (set_errno(EBADF));
998	if (((fflag = fp->f_flag) & FREAD) == 0) {
999		error = EBADF;
1000		goto out;
1001	}
1002	vp = fp->f_vnode;
1003	rwflag = 0;
1004	if (vp->v_type == VREG) {
1005
1006		if (bcount == 0)
1007			goto out;
1008
1009		/*
1010		 * return EINVAL for offsets that cannot be
1011		 * represented in an off_t.
1012		 */
1013		if (fileoff > maxoff) {
1014			error = EINVAL;
1015			goto out;
1016		}
1017
1018		if (fileoff + bcount > maxoff)
1019			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1020	} else if (vp->v_type == VFIFO) {
1021		error = ESPIPE;
1022		goto out;
1023	}
1024	/*
1025	 * We have to enter the critical region before calling VOP_RWLOCK
1026	 * to avoid a deadlock with ufs.
1027	 */
1028	if (nbl_need_check(vp)) {
1029		int svmand;
1030
1031		nbl_start_crit(vp, RW_READER);
1032		in_crit = 1;
1033		error = nbl_svmand(vp, fp->f_cred, &svmand);
1034		if (error != 0)
1035			goto out;
1036		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1037		    NULL)) {
1038			error = EACCES;
1039			goto out;
1040		}
1041	}
1042
1043	(void) VOP_RWLOCK(vp, rwflag, NULL);
1044
1045	/*
1046	 * Behaviour is same as read(2). Please see comments in
1047	 * read(2).
1048	 */
1049
1050	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
1051		struct vattr va;
1052		va.va_mask = AT_SIZE;
1053		if ((error =
1054		    VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
1055			VOP_RWUNLOCK(vp, rwflag, NULL);
1056			goto out;
1057		}
1058		if (fileoff >= va.va_size) {
1059			VOP_RWUNLOCK(vp, rwflag, NULL);
1060			count = 0;
1061			goto out;
1062		} else {
1063			VOP_RWUNLOCK(vp, rwflag, NULL);
1064			error = EOVERFLOW;
1065			goto out;
1066		}
1067	}
1068	if ((vp->v_type == VREG) &&
1069	    (fileoff + count > OFFSET_MAX(fp))) {
1070		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1071	}
1072	auio.uio_loffset = fileoff;
1073	auio.uio_iov = aiov;
1074	auio.uio_iovcnt = iovcnt;
1075	auio.uio_resid = bcount = count;
1076	auio.uio_segflg = UIO_USERSPACE;
1077	auio.uio_llimit = MAXOFFSET_T;
1078	auio.uio_fmode = fflag;
1079	if (bcount <= copyout_max_cached)
1080		auio.uio_extflg = UIO_COPY_CACHED;
1081	else
1082		auio.uio_extflg = UIO_COPY_DEFAULT;
1083
1084	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1085	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1086	count -= auio.uio_resid;
1087	CPU_STATS_ENTER_K();
1088	cp = CPU;
1089	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1090	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1091	CPU_STATS_EXIT_K();
1092	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1093
1094	VOP_RWUNLOCK(vp, rwflag, NULL);
1095
1096	if (error == EINTR && count != 0)
1097		error = 0;
1098out:
1099	if (in_crit)
1100		nbl_end_crit(vp);
1101	releasef(fdes);
1102	if (error)
1103		return (set_errno(error));
1104	return (count);
1105}
1106
1107ssize_t
1108pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1109    off_t extended_offset)
1110{
1111	struct uio auio;
1112	struct iovec aiov[DEF_IOV_MAX];
1113	file_t *fp;
1114	register vnode_t *vp;
1115	struct cpu *cp;
1116	int fflag, ioflag, rwflag;
1117	ssize_t count, bcount;
1118	int error = 0;
1119	int i;
1120
1121#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1122	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1123	    (u_offset_t)offset;
1124#else /* _SYSCALL32_IMPL || _ILP32 */
1125	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1126#endif /* _SYSCALL32_IMPR || _ILP32 */
1127#ifdef _SYSCALL32_IMPL
1128	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
1129	    extended_offset == 0?
1130	    MAXOFF32_T : MAXOFFSET_T;
1131#else /* _SYSCALL32_IMPL */
1132	const u_offset_t maxoff = MAXOFF32_T;
1133#endif /* _SYSCALL32_IMPL */
1134
1135	int in_crit = 0;
1136
1137	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
1138		return (set_errno(EINVAL));
1139
1140#ifdef _SYSCALL32_IMPL
1141	/*
1142	 * 32-bit callers need to have their iovec expanded,
1143	 * while ensuring that they can't move more than 2Gbytes
1144	 * of data in a single call.
1145	 */
1146	if (get_udatamodel() == DATAMODEL_ILP32) {
1147		struct iovec32 aiov32[DEF_IOV_MAX];
1148		ssize32_t count32;
1149
1150		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
1151			return (set_errno(EFAULT));
1152
1153		count32 = 0;
1154		for (i = 0; i < iovcnt; i++) {
1155			ssize32_t iovlen32 = aiov32[i].iov_len;
1156			count32 += iovlen32;
1157			if (iovlen32 < 0 || count32 < 0)
1158				return (set_errno(EINVAL));
1159			aiov[i].iov_len = iovlen32;
1160			aiov[i].iov_base =
1161			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1162		}
1163	} else
1164#endif /* _SYSCALL32_IMPL */
1165		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
1166			return (set_errno(EFAULT));
1167
1168	count = 0;
1169	for (i = 0; i < iovcnt; i++) {
1170		ssize_t iovlen = aiov[i].iov_len;
1171		count += iovlen;
1172		if (iovlen < 0 || count < 0)
1173			return (set_errno(EINVAL));
1174	}
1175
1176	if ((bcount = (ssize_t)count) < 0)
1177		return (set_errno(EINVAL));
1178	if ((fp = getf(fdes)) == NULL)
1179		return (set_errno(EBADF));
1180	if (((fflag = fp->f_flag) & FWRITE) == 0) {
1181		error = EBADF;
1182		goto out;
1183	}
1184	vp = fp->f_vnode;
1185	rwflag = 1;
1186	if (vp->v_type == VREG) {
1187
1188		if (bcount == 0)
1189			goto out;
1190
1191		/*
1192		 * return EINVAL for offsets that cannot be
1193		 * represented in an off_t.
1194		 */
1195		if (fileoff > maxoff) {
1196			error = EINVAL;
1197			goto out;
1198		}
1199		/*
1200		 * Take appropriate action if we are trying
1201		 * to write above the resource limit.
1202		 */
1203		if (fileoff >= curproc->p_fsz_ctl) {
1204			mutex_enter(&curproc->p_lock);
1205			/*
1206			 * Return value ignored because it lists
1207			 * actions taken, but we are in an error case.
1208			 * We don't have any actions that depend on
1209			 * what could happen in this call, so we ignore
1210			 * the return value.
1211			 */
1212			(void) rctl_action(
1213			    rctlproc_legacy[RLIMIT_FSIZE],
1214			    curproc->p_rctls, curproc,
1215			    RCA_UNSAFE_SIGINFO);
1216			mutex_exit(&curproc->p_lock);
1217
1218			error = EFBIG;
1219			goto out;
1220		}
1221		/*
1222		 * Don't allow pwritev to cause file sizes to exceed
1223		 * maxoff.
1224		 */
1225		if (fileoff == maxoff) {
1226			error = EFBIG;
1227			goto out;
1228		}
1229
1230		if (fileoff + bcount > maxoff)
1231			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1232	} else if (vp->v_type == VFIFO) {
1233		error = ESPIPE;
1234		goto out;
1235	}
1236	/*
1237	 * We have to enter the critical region before calling VOP_RWLOCK
1238	 * to avoid a deadlock with ufs.
1239	 */
1240	if (nbl_need_check(vp)) {
1241		int svmand;
1242
1243		nbl_start_crit(vp, RW_READER);
1244		in_crit = 1;
1245		error = nbl_svmand(vp, fp->f_cred, &svmand);
1246		if (error != 0)
1247			goto out;
1248		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1249		    NULL)) {
1250			error = EACCES;
1251			goto out;
1252		}
1253	}
1254
1255	(void) VOP_RWLOCK(vp, rwflag, NULL);
1256
1257
1258	/*
1259	 * Behaviour is same as write(2). Please see comments for
1260	 * write(2).
1261	 */
1262
1263	if (vp->v_type == VREG) {
1264		if (fileoff >= curproc->p_fsz_ctl) {
1265			VOP_RWUNLOCK(vp, rwflag, NULL);
1266			mutex_enter(&curproc->p_lock);
1267			/* see above rctl_action comment */
1268			(void) rctl_action(
1269			    rctlproc_legacy[RLIMIT_FSIZE],
1270			    curproc->p_rctls,
1271			    curproc, RCA_UNSAFE_SIGINFO);
1272			mutex_exit(&curproc->p_lock);
1273			error = EFBIG;
1274			goto out;
1275		}
1276		if (fileoff >= OFFSET_MAX(fp)) {
1277			VOP_RWUNLOCK(vp, rwflag, NULL);
1278			error = EFBIG;
1279			goto out;
1280		}
1281		if (fileoff + count > OFFSET_MAX(fp))
1282			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1283	}
1284
1285	auio.uio_loffset = fileoff;
1286	auio.uio_iov = aiov;
1287	auio.uio_iovcnt = iovcnt;
1288	auio.uio_resid = bcount = count;
1289	auio.uio_segflg = UIO_USERSPACE;
1290	auio.uio_llimit = curproc->p_fsz_ctl;
1291	auio.uio_fmode = fflag;
1292	auio.uio_extflg = UIO_COPY_CACHED;
1293	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1294	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1295	count -= auio.uio_resid;
1296	CPU_STATS_ENTER_K();
1297	cp = CPU;
1298	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1299	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1300	CPU_STATS_EXIT_K();
1301	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1302
1303	VOP_RWUNLOCK(vp, rwflag, NULL);
1304
1305	if (error == EINTR && count != 0)
1306		error = 0;
1307out:
1308	if (in_crit)
1309		nbl_end_crit(vp);
1310	releasef(fdes);
1311	if (error)
1312		return (set_errno(error));
1313	return (count);
1314}
1315
1316#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1317
1318/*
1319 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1320 */
ssize32_t
pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
    uint32_t offset_2)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* set once we enter the nbmand critical region */

	/*
	 * Reassemble the 64-bit file offset from the two 32-bit words
	 * supplied by the 32-bit caller; word order depends on endianness.
	 */
#if defined(_LITTLE_ENDIAN)
	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
#else
	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
#endif

	/* The 32-bit return type cannot express more than INT32_MAX bytes. */
	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
		return (set_errno(EINVAL));

	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
		/* fd is not open for reading */
		error = EBADF;
		goto out;
	}

	rwflag = 0;	/* 0 tells VOP_RWLOCK/VOP_RWUNLOCK this is a read */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * Same as pread. See comments in pread.
		 */

		if (fileoff > MAXOFFSET_T) {
			error = EINVAL;
			goto out;
		}
		/* Clip the transfer so it cannot extend past MAXOFFSET_T. */
		if (fileoff + bcount > MAXOFFSET_T)
			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
	} else if (vp->v_type == VFIFO) {
		/* Pipes and FIFOs are not seekable. */
		error = ESPIPE;
		goto out;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* Fail if a mandatory (nbmand) lock conflicts with the read. */
		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	auio.uio_loffset = fileoff;

	/*
	 * Note: File size can never be greater than MAXOFFSET_T.
	 * If ever we start supporting 128 bit files the code
	 * similar to the one in pread at this place should be here.
	 * Here we avoid the unnecessary VOP_GETATTR() when we
	 * know that fileoff == MAXOFFSET_T implies that it is always
	 * greater than or equal to file size.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;	/* buffer is a user address */
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	/* uio_resid is what was NOT transferred; compute bytes read. */
	bcount -= auio.uio_resid;
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
1440
1441/*
1442 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1443 */
ssize32_t
pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
    uint32_t offset_2)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* set once we enter the nbmand critical region */

	/*
	 * Reassemble the 64-bit file offset from the two 32-bit words
	 * supplied by the 32-bit caller; word order depends on endianness.
	 */
#if defined(_LITTLE_ENDIAN)
	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
#else
	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
#endif

	/* The 32-bit return type cannot express more than INT32_MAX bytes. */
	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
		return (set_errno(EINVAL));
	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
		/* fd is not open for writing */
		error = EBADF;
		goto out;
	}

	rwflag = 1;	/* 1 tells VOP_RWLOCK/VOP_RWUNLOCK this is a write */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * See comments in pwrite.
		 */
		if (fileoff > MAXOFFSET_T) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Starting the write at or beyond the file-size resource
		 * limit fails with EFBIG; rctl_action() fires whatever
		 * action the RLIMIT_FSIZE rctl is configured with.
		 * NOTE(review): the vectored write path above passes
		 * RCA_UNSAFE_SIGINFO here, this one passes RCA_SAFE --
		 * confirm the difference is intentional.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_SAFE);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		/* A write starting at the maximum offset cannot proceed. */
		if (fileoff == MAXOFFSET_T) {
			error = EFBIG;
			goto out;
		}
		/* Clip the transfer so the file cannot grow past MAXOFFSET_T. */
		if (fileoff + bcount > MAXOFFSET_T)
			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
	} else if (vp->v_type == VFIFO) {
		/* Pipes and FIFOs are not seekable. */
		error = ESPIPE;
		goto out;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		/* Fail if a mandatory (nbmand) lock conflicts with the write. */
		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	auio.uio_loffset = fileoff;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;	/* buffer is a user address */
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	/*
	 * The SUSv4 POSIX specification states:
	 *	The pwrite() function shall be equivalent to write(), except
	 *	that it writes into a given position and does not change
	 *	the file offset (regardless of whether O_APPEND is set).
	 * To make this be true, we omit the FAPPEND flag from ioflag.
	 */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	/* uio_resid is what was NOT transferred; compute bytes written. */
	bcount -= auio.uio_resid;
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A transfer interrupted after moving some bytes still succeeds. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
1568
1569#endif	/* _SYSCALL32_IMPL || _ILP32 */
1570
1571#ifdef _SYSCALL32_IMPL
1572/*
1573 * Tail-call elimination of xxx32() down to xxx()
1574 *
1575 * A number of xxx32 system calls take a len (or count) argument and
1576 * return a number in the range [0,len] or -1 on error.
1577 * Given an ssize32_t input len, the downcall xxx() will return
1578 * a 64-bit value that is -1 or in the range [0,len] which actually
1579 * is a proper return value for the xxx32 call. So even if the xxx32
1580 * calls can be considered as returning a ssize32_t, they are currently
1581 * declared as returning a ssize_t as this enables tail-call elimination.
1582 *
1583 * The cast of len (or count) to ssize32_t is needed to ensure we pass
1584 * down negative input values as such and let the downcall handle error
 * reporting. Functions covered by this comment are:
1586 *
1587 * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1588 * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1589 * readlink.c:     readlink32.
1590 */
1591
1592ssize_t
1593read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1594{
1595	return (read(fdes,
1596	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1597}
1598
1599ssize_t
1600write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1601{
1602	return (write(fdes,
1603	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1604}
1605
1606ssize_t
1607pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1608{
1609	return (pread(fdes,
1610	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1611	    (off_t)(uint32_t)offset));
1612}
1613
1614ssize_t
1615pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1616{
1617	return (pwrite(fdes,
1618	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1619	    (off_t)(uint32_t)offset));
1620}
1621
1622ssize_t
1623readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1624{
1625	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1626}
1627
1628ssize_t
1629writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1630{
1631	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1632}
1633#endif	/* _SYSCALL32_IMPL */
1634