/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

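/*
 * ufs_directio_enabled is a system-wide tunable: clearing it makes both
 * ufs_directio_read() and ufs_directio_write() report DIRECTIO_FAILURE,
 * so callers fall back to the normal buffered path.
 */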
int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;
	kstat_named_t	phys_reads;
	kstat_named_t	hole_reads;
	kstat_named_t	nread;
	kstat_named_t	logical_writes;
	kstat_named_t	phys_writes;
	kstat_named_t	nwritten;
	kstat_named_t	nflushes;
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

kstat_t	*ufs_directio_kstatsp;
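
/*
 * The counters above are exported as a virtual kstat (installed in
 * ufs_directio_init()): ks_data points directly at ufs_directio_kstats,
 * so the I/O paths below simply bump the counters in place.  They are
 * visible from userland as the ufs:0:directio kstat.
 */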

/*
 * Use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */
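/*
 * Each directio_buf wraps one buf_t and describes a single contiguous
 * physical transfer.  Requests in flight are chained through ``next''
 * (most recently issued first) until directio_wait() reaps them.
 */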
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs", 0,
	    "directio", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * bytes_io will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error) {
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	} else {
		*bytes_iop += bp->b_bcount;
	}
	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	return (error);
}

/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
    offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
    struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *   Note - list is maintained in reverse order.
	 *   directio_wait_one() depends on this fact when
	 *   adjusting the ``bytes_io'' param. bytes_io
	 *   is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = ip->i_dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;
	bp->b_file = ip->i_vnode;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads.value.ui64++;
		ufs_directio_kstats.nread.value.ui64 += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes.value.ui64++;
		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	else
		lwp_stat_update(LWP_STAT_INBLK, 1);

}

uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
    cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 *	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 *			allocate lots of pages since pageout
	 *			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned.  If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer.  This new
	 * buffer will be short aligned and sector aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}
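
	/*
	 * Note that from this point on ``uio'' may refer to copy_uio rather
	 * than the caller's uio, which is why the exit paths below check
	 * uio == &copy_uio before freeing copy_base.
	 */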

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}

	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

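	/*
	 * Allocate backing store a file system block at a time: extend the
	 * file when writing past i_size, pre-allocate full MAXBSIZE blocks,
	 * and fill any holes that the write overlaps.
	 */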
	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_inc_32_nv(&ufs_cur_writes);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

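			/*
			 * pplist covers the pages locked starting at the page
			 * that contains pglck_base, so index it by the page
			 * offset of iov_base within that locked range.
			 */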
			if (pplist != NULL)
				spplist = pplist +
				    btop((uintptr_t)iov->iov_base -
				    ((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
			    iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_dec_32(&ufs_cur_writes);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
			    B_INVAL, cr, NULL);
			ufs_directio_kstats.nflushes.value.ui64++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual; adjust the EOF if necessary
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}

/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	int		error = 0, nzero;
	uio_t		phys_uio;
	iovec_t		phys_iov;

	ufs_directio_kstats.hole_reads.value.ui64++;
	ufs_directio_kstats.nread.value.ui64 += nbytes;

	phys_iov.iov_base = uio->uio_iov->iov_base;
	phys_iov.iov_len = nbytes;

	phys_uio.uio_iov = &phys_iov;
	phys_uio.uio_iovcnt = 1;
	phys_uio.uio_resid = phys_iov.iov_len;
	phys_uio.uio_segflg = uio->uio_segflg;
	phys_uio.uio_extflg = uio->uio_extflg;
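
	/*
	 * Zero-fill the caller's buffer in ufs_directio_zero_len sized
	 * chunks until the hole has been "read".
	 */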
	while (error == 0 && phys_uio.uio_resid) {
		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
		    &phys_uio);
	}
	return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF" logic.
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
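		/*
		 * Lock the pages S_WRITE: the device will be writing into
		 * the user's buffer (see the S_WRITE/B_READ note in
		 * directio_start()).
		 */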
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip, nbytes,
				    ldbtob(bn), iov->iov_base,
				    S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}