/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * Vnode operations for the High Sierra filesystem
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/dkio.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>
#include <sys/avl.h>
#include <sys/sunldi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>

/*
 * For struct modlinkage
 */
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>

#include <fs/fs_subr.h>

/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;

/*
 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ, by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per-fs-instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with, say, 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is
 * ever a complaint of system load due to such an unusual case, it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts, with an nthreads value of, say, 32.
 */
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */

/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;
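
/*
 * The knobs above (seq_contig_requests, hsfs_taskq_nthreads,
 * hsched_coalesce_min) are simple file-scope ints, so on a live system
 * they can normally be inspected with mdb and, if need be, overridden
 * at boot time via /etc/system, e.g. (illustrative value only):
 *
 *	set hsfs:hsfs_taskq_nthreads = 4
 */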

/*
 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
 */
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;
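
/*
 * hio_cache backs the struct hio request descriptors queued to the
 * per-filesystem I/O scheduler, and hio_info_cache backs the
 * struct hio_info completion state handed to the readahead taskq
 * (see hsfs_getpage_ra() and hsfs_ra_task() below).
 */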

/*
 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
 */
extern int use_rrip_inodes;

static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);

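/*
 * hsfs is a read-only filesystem, so there is never any dirty data to
 * push; fsync can therefore simply report success.
 */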
/* ARGSUSED */
static int
hsfs_fsync(vnode_t *cp, int syncflag, cred_t *cred, caller_context_t *ct)
{
	return (0);
}


/*ARGSUSED*/
static int
hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)
{
	caddr_t base;
	offset_t diff;
	int error;
	struct hsnode *hp;
	uint_t filesize;

	hp = VTOH(vp);
	/*
	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	 */
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);
	}
	filesize = hp->hs_dirent.ext_size;

	/* Sanity checks. */
	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */
		return (0);

	do {
		/*
		 * We want to ask for only the "right" amount of data.
		 * In this case that means:-
		 *
		 * We can't get data from beyond our EOF. If asked,
		 * we will give a short read.
		 *
		 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
		 * These buffers are always MAXBSIZE aligned.
		 * If our starting offset is not MAXBSIZE aligned,
		 * we can only ask for less than MAXBSIZE bytes.
		 *
		 * If our requested offset and length are such that
		 * they belong in different MAXBSIZE aligned slots
		 * then we'll be making more than one call on
		 * segmap_getmapflt.
		 *
		 * This diagram shows the variables we use and their
		 * relationships.
		 *
		 * |<-----MAXBSIZE----->|
		 * +--------------------------...+
		 * |.....mapon->|<--n-->|....*...|EOF
		 * +--------------------------...+
		 * uio_loffset->|
		 * uio_resid....|<---------->|
		 * diff.........|<-------------->|
		 *
		 * So, in this case our offset is not aligned
		 * and our request takes us outside of the
		 * MAXBSIZE window. We will break this up into
		 * two segmap_getmapflt calls.
		 */
		size_t nbytes;
		offset_t mapon;
		size_t n;
		uint_t flags;

		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
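		/*
		 * A concrete (hypothetical) example with MAXBSIZE 8192:
		 * uio_loffset 10000, uio_resid 20000, filesize 15000
		 * gives mapon 1808, diff 5000, nbytes 6384 and n 5000,
		 * i.e. a short read up to EOF from within one window.
		 */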
		if (n <= 0) {
			/* EOF or request satisfied. */
			return (0);
		}

		base = segmap_getmapflt(segkmap, vp,
		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

		if (error == 0) {
			/*
			 * If we read a whole block, or read to EOF, we
			 * won't need this buffer again soon.
			 */
			if (n + mapon == MAXBSIZE ||
			    uiop->uio_loffset == filesize)
				flags = SM_DONTNEED;
			else
				flags = 0;

			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);

	return (error);
}

/*ARGSUSED2*/
static int
hsfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
    caller_context_t *ct)
{
	struct hsnode *hp;
	struct vfs *vfsp;
	struct hsfs *fsp;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	vfsp = vp->v_vfsp;

	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);
	}
	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size =	(offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;
	else
		vap->va_rdev = 0;
	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
	return (0);
}

/*ARGSUSED*/
static int
hsfs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
    caller_context_t *ct)
{
	struct hsnode *hp;

	if (vp->v_type != VLNK)
		return (EINVAL);

	hp = VTOH(vp);

	if (hp->hs_dirent.sym_link == (char *)NULL)
		return (ENOENT);

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
}

/*ARGSUSED*/
static void
hsfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfs *fsp;

	int nopage;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	/*
	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	 */
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");
		/*NOTREACHED*/
	}

	VN_RELE_LOCKED(vp);
	if (vp->v_count > 0 || (hp->hs_flags & HREF) == 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);
		return;
	}
	if (vp->v_count == 0) {
		/*
		 * Free the hsnode.
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		 */
		nopage = !vn_has_cached_data(vp);
		hp->hs_flags = 0;
		/*
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
	} else {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	}
	rw_exit(&fsp->hsfs_hash_lock);
}


/*ARGSUSED*/
static int
hsfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
	int error;
	int namelen = (int)strlen(nm);

	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * If we're looking for ourself, life is simple.
	 */
	if (namelen == 1 && *nm == '.') {
		if (error = hs_access(dvp, (mode_t)VEXEC, cred))
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
}


/*ARGSUSED*/
static int
hsfs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct hsnode	*dhp;
	struct hsfs	*fsp;
	struct hs_direntry hd;
	struct dirent64	*nd;
	int		error;
	uint_t		offset;		/* real offset in directory */
	uint_t		dirsiz;		/* real size of directory */
	uchar_t		*blkp;
	int		hdlen;		/* length of hs directory entry */
	long		ndlen;		/* length of dirent entry */
	int		bytes_wanted;
	size_t		bufsize;	/* size of dirent buffer */
	char		*outbuf;	/* ptr to dirent buffer */
	char		*dname;
	int		dnamelen;
	size_t		dname_size;
	struct fbuf	*fbp;
	uint_t		last_offset;	/* last index into current dir block */
	ino64_t		dirino;	/* temporary storage before storing in dirent */
	off_t		diroff;

	dhp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	if (dhp->hs_dirent.ext_size == 0)
		hs_filldirent(vp, &dhp->hs_dirent);
	dirsiz = dhp->hs_dirent.ext_size;
	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */
		if (eofp)
			*eofp = 1;
		return (0);
	}
	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
	offset = uiop->uio_loffset;

	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
	dname = kmem_alloc(dname_size, KM_SLEEP);
	bufsize = uiop->uio_resid + sizeof (struct dirent64);

	outbuf = kmem_alloc(bufsize, KM_SLEEP);
	nd = (struct dirent64 *)outbuf;
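
	/*
	 * Directory records read from the medium are converted into
	 * dirent64 entries and accumulated in outbuf; whatever has been
	 * gathered is copied out to the caller in a single uiomove() at
	 * the "done" label below.
	 */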

	while (offset < dirsiz) {
		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));

		error = fbread(vp, (offset_t)(offset & MAXBMASK),
		    (unsigned int)bytes_wanted, S_READ, &fbp);
		if (error)
			goto done;

		blkp = (uchar_t *)fbp->fb_addr;
		last_offset = (offset & MAXBMASK) + fbp->fb_count;

#define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */

		while (offset < last_offset) {
			/*
			 * Very similar validation code is found in
			 * process_dirblock(), hsfs_node.c.
			 * For an explanation, see there.
			 * It may make sense for the future to
			 * "consolidate" the code in hs_parsedir(),
			 * process_dirblock() and hsfs_readdir() into
			 * a single utility function.
			 */
			hdlen = (int)((uchar_t)
			    HDE_DIR_LEN(&blkp[rel_offset(offset)]));
			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
			    offset + hdlen > last_offset) {
				/*
				 * advance to next sector boundary
				 */
				offset = roundup(offset + 1, HS_SECTOR_SIZE);
				if (hdlen)
					hs_log_bogus_disk_warning(fsp,
					    HSFS_ERR_TRAILING_JUNK, 0);

				continue;
			}

			bzero(&hd, sizeof (hd));

			/*
			 * Just ignore invalid directory entries.
			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
			 */
			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
			    &hd, dname, &dnamelen, last_offset - offset)) {
				/*
				 * Determine if there is enough room
				 */
				ndlen = (long)DIRENT64_RECLEN((dnamelen));

				if ((ndlen + ((char *)nd - outbuf)) >
				    uiop->uio_resid) {
					fbrelse(fbp, S_READ);
					goto done; /* output buffer full */
				}

				diroff = offset + hdlen;
				/*
				 * If the media carries rrip-v1.12 or newer,
				 * and we trust the inodes from the rrip data
				 * (use_rrip_inodes != 0), use that data. If the
				 * media has been created by a recent mkisofs
				 * version, we may trust all numbers in the
				 * starting extent number; otherwise, we cannot
				 * do this for zero sized files and symlinks,
				 * because if we did we'd end up mapping all of
				 * them to the same node. We use HS_DUMMY_INO
				 * in this case and make sure that we will not
				 * map all files to the same meta data.
				 */
				if (hd.inode != 0 && use_rrip_inodes) {
					dirino = hd.inode;
				} else if ((hd.ext_size == 0 ||
				    hd.sym_link != (char *)NULL) &&
				    (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
					dirino = HS_DUMMY_INO;
				} else {
					dirino = hd.ext_lbn;
				}

				/* strncpy(9f) will zero uninitialized bytes */

				ASSERT(strlen(dname) + 1 <=
				    DIRENT64_NAMELEN(ndlen));
				(void) strncpy(nd->d_name, dname,
				    DIRENT64_NAMELEN(ndlen));
				nd->d_reclen = (ushort_t)ndlen;
				nd->d_off = (offset_t)diroff;
				nd->d_ino = dirino;
				nd = (struct dirent64 *)((char *)nd + ndlen);

				/*
				 * free up space allocated for symlink
				 */
				if (hd.sym_link != (char *)NULL) {
					kmem_free(hd.sym_link,
					    (size_t)(hd.ext_size+1));
					hd.sym_link = (char *)NULL;
				}
			}
			offset += hdlen;
		}
		fbrelse(fbp, S_READ);
	}

	/*
	 * Got here for one of the following reasons:
	 *	1) outbuf is full (error == 0)
	 *	2) end of directory reached (error == 0)
	 *	3) error reading directory sector (error != 0)
	 *	4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4.  Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	 */
done:
	ndlen = ((char *)nd - outbuf);
	if (ndlen != 0) {
		error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
		uiop->uio_loffset = offset;
	}
	kmem_free(dname, dname_size);
	kmem_free(outbuf, bufsize);
	if (eofp && error == 0)
		*eofp = (uiop->uio_loffset >= dirsiz);
	return (error);
}

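/*
 * Build the file handle (fid) for a node, as used by NFS and similar
 * consumers: it records the LBN and offset of the node's directory
 * entry plus its node id, which is enough to find the hsnode again
 * after its vnode has been recycled.
 */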
/*ARGSUSED2*/
static int
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfid *fid;

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
		return (ENOSPC);
	}

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	hp = VTOH(vp);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
	return (0);
}

/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
{
	return (0);
}

/*ARGSUSED*/
static int
hsfs_close(struct vnode *vp, int flag, int count, offset_t offset,
    struct cred *cred, caller_context_t *ct)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED2*/
static int
hsfs_access(struct vnode *vp, int mode, int flags, cred_t *cred,
    caller_context_t *ct)
{
	return (hs_access(vp, (mode_t)mode, cred));
}

/*
 * The seek time of a CD-ROM is very long, and the data transfer
 * rate is even worse (at most about 150K per second).  The design
 * decision is to reduce access to the CD-ROM as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * The UFS style of reading ahead one block at a time is not
 * appropriate, and is not supported.
 */

/*
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */

/*
 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
 */
/*ARGSUSED*/
int
hsfs_iodone(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

/*
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete and cleans up the associated
 * memory and releases the page lock.
 */
void
hsfs_ra_task(void *arg)
{
	struct hio_info *info = arg;
	uint_t count;
	struct buf *wbuf;

	ASSERT(info->pp != NULL);

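	/*
	 * Wait for each readahead buf to complete.  If a buf's semaphore
	 * has not been posted yet, drive the I/O scheduler ourselves so
	 * that the queued readaheads make progress even when no other
	 * thread is currently issuing I/O against this filesystem.
	 */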
	for (count = 0; count < info->bufsused; count++) {
		wbuf = &(info->bufs[count]);

		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
		while (sema_tryp(&(info->sema[count])) == 0) {
			if (hsched_invoke_strategy(info->fsp)) {
				sema_p(&(info->sema[count]));
				break;
			}
		}
		sema_destroy(&(info->sema[count]));
		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
		biofini(&(info->bufs[count]));
	}
	for (count = 0; count < info->bufsused; count++) {
		if (info->vas[count] != NULL) {
			ppmapout(info->vas[count]);
		}
	}
	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));

	pvn_read_done(info->pp, 0);
	kmem_cache_free(hio_info_cache, info);
}

/*
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation
 * fails.
 */
/*ARGSUSED*/
static int
hsfs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, struct hsnode *hp, struct hsfs *fsp, int xarsiz,
    offset_t bof, int chunk_lbn_count, int chunk_data_bytes)
{
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t	*fio_done;
	struct hio_info *info;
	size_t len;

	ASSERT(fsp->hqueue != NULL);

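	/* Never start a readahead beyond the end of the segment mapping. */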
	if (addr >= seg->s_base + seg->s_size) {
		return (-1);
	}

	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	if (off >= filsiz)
		return (0);

	extension = 0;
	pp = NULL;

	extension += hp->hs_ra_bytes;

	/*
	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at.  So, if we're asked to go
	 * beyond the end of a file, truncate to the length
	 * of that file.
	 *
	 * Additionally, this behaviour is required by section
	 * 6.4.5 of ISO 9660:1988(E).
	 */
	len = MIN(extension ? extension : PAGESIZE, filsiz - off);

	/* A little paranoia */
	if (len <= 0)
		return (-1);

	/*
	 * After all that, make sure we're asking for things in units
	 * that bdev_strategy() will understand (see bug 4202551).
	 */
	len = roundup(len, DEV_BSIZE);

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
	    &io_len_tmp, off, len, 1);

	if (pp == NULL) {
		hp->hs_num_contig = 0;
		hp->hs_ra_bytes = 0;
		hp->hs_prev_offset = 0;
		return (-1);
	}

	io_off = (uint_t)io_off_tmp;
	io_len = (uint_t)io_len_tmp;

	/* check for truncation */
	/*
	 * xxx Clean up and return EIO instead?
	 * xxx Ought to go to u_offset_t for everything, but we
	 * xxx call lots of things that want uint_t arguments.
	 */
	ASSERT(io_off == io_off_tmp);

	/*
	 * get enough buffers for worst-case scenario
	 * (i.e., no coalescing possible).
	 */
	bufcnt = (len + secsize - 1) / secsize;
	bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
	vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

	/*
	 * Allocate an array of semaphores, since we are doing I/O
	 * scheduling.
	 */
	fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);

	/*
	 * If our filesize is not an integer multiple of PAGESIZE,
	 * we zero that part of the last page that's between EOF and
	 * the PAGESIZE boundary.
	 */
	xlen = io_len & PAGEOFFSET;
	if (xlen != 0)
		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);

	va = NULL;
	lastp = NULL;
	searchp = pp;
	io_end = io_off + io_len;
	for (count = 0, byte_offset = io_off;
	    byte_offset < io_end;
	    count++) {
		ASSERT(count < bufcnt);

		bioinit(&bufs[count]);
		bufs[count].b_edev = devvp->v_rdev;
		bufs[count].b_dev = cmpdev(devvp->v_rdev);
		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
		bufs[count].b_iodone = hsfs_iodone;
		bufs[count].b_vp = vp;
		bufs[count].b_file = vp;

		/* Compute disk address for interleaving. */

		/* considered without skips */
		which_chunk_lbn = byte_offset / chunk_data_bytes;

		/* factor in skips */
		offset_lbn = which_chunk_lbn * chunk_lbn_count;

		/* convert to physical byte offset for lbn */
		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

		/* don't forget offset into lbn */
		offset_extra = byte_offset % chunk_data_bytes;

		/* get virtual block number for driver */
		driver_block = lbtodb(bof + xarsiz
		    + offset_bytes + offset_extra);
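
		/*
		 * Worked example for a hypothetical interleaved file with
		 * intlf_sz = 16, intlf_sk = 4 and 2048-byte logical blocks:
		 * chunk_lbn_count = 20 and chunk_data_bytes = 32768, so a
		 * byte_offset of 40000 yields which_chunk_lbn = 1,
		 * offset_lbn = 20, offset_bytes = 40960, offset_extra = 7232
		 * and driver_block = lbtodb(bof + xarsiz + 48192).
		 */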

		if (lastp != searchp) {
			/* this branch taken first time through loop */
			va = vas[count] = ppmapin(searchp, PROT_WRITE,
			    (caddr_t)-1);
			/* ppmapin() guarantees not to return NULL */
		} else {
			vas[count] = NULL;
		}

		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
		bufs[count].b_offset =
		    (offset_t)(byte_offset - io_off + off);

		/*
		 * We specifically use the b_lblkno member here
		 * as even in the 32 bit world driver_block can
		 * get very large in line with the ISO9660 spec.
		 */

		bufs[count].b_lblkno = driver_block;

		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
		    - byte_offset;

		/*
		 * remaining_bytes can't be zero, as we derived
		 * which_chunk_lbn directly from byte_offset.
		 */
		if ((remaining_bytes + byte_offset) < (off + len)) {
			/* coalesce-read the rest of the chunk */
			bufs[count].b_bcount = remaining_bytes;
		} else {
			/* get the final bits */
			bufs[count].b_bcount = off + len - byte_offset;
		}

		remainder = PAGESIZE - (byte_offset % PAGESIZE);
		if (bufs[count].b_bcount > remainder) {
			bufs[count].b_bcount = remainder;
		}

		bufs[count].b_bufsize = bufs[count].b_bcount;
		if (((offset_t)byte_offset + bufs[count].b_bcount) >
		    HS_MAXFILEOFF) {
			break;
		}
		byte_offset += bufs[count].b_bcount;

		/*
		 * We are scheduling I/O so we need to enqueue
		 * requests rather than calling bdev_strategy
		 * here. A later invocation of the scheduling
		 * function will take care of doing the actual
		 * I/O as it selects requests from the queue as
		 * per the scheduling logic.
		 */
		struct hio *hsio = kmem_cache_alloc(hio_cache,
		    KM_SLEEP);

		sema_init(&fio_done[count], 0, NULL,
		    SEMA_DEFAULT, NULL);
		hsio->bp = &bufs[count];
		hsio->sema = &fio_done[count];
		hsio->io_lblkno = bufs[count].b_lblkno;
		hsio->nblocks = howmany(hsio->bp->b_bcount,
		    DEV_BSIZE);

		/* used for deadline */
		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

		/* for I/O coalescing */
		hsio->contig_chain = NULL;
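		/* ra == 1 marks this request as a readahead for the scheduler */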
		hsched_enqueue_io(fsp, hsio, 1);

		lwp_stat_update(LWP_STAT_INBLK, 1);
		lastp = searchp;
		if ((remainder - bufs[count].b_bcount) < 1) {
			searchp = searchp->p_next;
		}
	}

	bufsused = count;
	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
	info->bufs = bufs;
	info->vas = vas;
	info->sema = fio_done;
	info->bufsused = bufsused;
	info->bufcnt = bufcnt;
	info->fsp = fsp;
	info->pp = pp;

	(void) taskq_dispatch(fsp->hqueue->ra_task,
	    hsfs_ra_task, info, KM_SLEEP);
	/*
	 * The I/O locked pages are unlocked in our taskq thread.
	 */
	return (0);
}

/*
 * Each file may have a different interleaving on disk.  This makes
 * things somewhat interesting.  The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors.  The sum of those two sets of sectors
 * defines the interleave size.  Unfortunately, it means that we generally
 * can't simply read N sectors starting at a given offset to satisfy
 * any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in.  Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them.  This saves us a bunch
 * of copying.
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the
 *         code in hsfs_getpage_ra above (with some omissions). If you are
 *         making any change to this function, please also look at
 *         hsfs_getpage_ra.
 */
/*ARGSUSED*/
static int
hsfs_getapage(struct vnode *vp, u_offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cred)
{
	struct hsnode *hp;
	struct hsfs *fsp;
	int	err;
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	page_t	*pagefound;
	offset_t	bof;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	int	chunk_lbn_count;
	int	chunk_data_bytes;
	int	xarsiz;
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t *fio_done;
	int	calcdone;

	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	/* disk addr for start of file */
	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);

	/* xarsiz byte must be skipped for data */
	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;

	/* how many logical blocks in an interleave (data+skip) */
	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;

	if (chunk_lbn_count == 0) {
		chunk_lbn_count = 1;
	}

	/*
	 * Convert interleaving size into bytes.  The zero case
	 * (no interleaving) optimization is handled as a side-
	 * effect of the read-ahead logic.
	 */
	if (hp->hs_dirent.intlf_sz == 0) {
		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
		/*
		 * Optimization: If our pagesize is a multiple of LBN
		 * bytes, we can avoid breaking up a page into individual
		 * lbn-sized requests.
		 */
		if (PAGESIZE % chunk_data_bytes == 0) {
			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
			chunk_data_bytes = PAGESIZE;
		}
	} else {
		chunk_data_bytes =
		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
	}

reread:
	err = 0;
	pagefound = 0;
	calcdone = 0;

	/*
	 * Do some read-ahead.  This mostly saves us a bit of
	 * system cpu time more than anything else when doing
	 * sequential reads.  At some point, could do the
	 * read-ahead asynchronously which might gain us something
	 * on wall time, but it seems unlikely....
	 *
	 * We do the easy case here, which is to read through
	 * the end of the chunk, minus whatever's at the end that
	 * won't exactly fill a page.
	 */
	if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
		which_chunk_lbn = (off + len) / chunk_data_bytes;
		extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
		extension -= (extension % PAGESIZE);
	} else {
		extension = roundup(len, PAGESIZE);
	}

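	/* Update the per-filesystem count of pages requested. */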
	atomic_inc_64(&fsp->total_pages_requested);

	pp = NULL;
again:
	/* search for page in buffer */
	if ((pagefound = page_exists(vp, off)) == 0) {
		/*
		 * Need to really do disk IO to get the page.
		 */
		if (!calcdone) {
			extension += hp->hs_ra_bytes;

			/*
			 * Some cd writers don't write sectors that aren't
			 * used. Also, there's no point in reading sectors
			 * we'll never look at.  So, if we're asked to go
			 * beyond the end of a file, truncate to the length
			 * of that file.
			 *
			 * Additionally, this behaviour is required by section
			 * 6.4.5 of ISO 9660:1988(E).
			 */
			len = MIN(extension ? extension : PAGESIZE,
			    filsiz - off);

			/* A little paranoia. */
			ASSERT(len > 0);

			/*
			 * After all that, make sure we're asking for things
			 * in units that bdev_strategy() will understand
			 * (see bug 4202551).
			 */
			len = roundup(len, DEV_BSIZE);
			calcdone = 1;
		}

		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
		    &io_len_tmp, off, len, 0);

		if (pp == NULL) {
			/*
			 * Pressure on memory, roll back readahead
			 */
			hp->hs_num_contig = 0;
			hp->hs_ra_bytes = 0;
			hp->hs_prev_offset = 0;
			goto again;
		}

		io_off = (uint_t)io_off_tmp;
		io_len = (uint_t)io_len_tmp;

		/* check for truncation */
		/*
		 * xxx Clean up and return EIO instead?
		 * xxx Ought to go to u_offset_t for everything, but we
		 * xxx call lots of things that want uint_t arguments.
		 */
		ASSERT(io_off == io_off_tmp);

		/*
		 * get enough buffers for worst-case scenario
		 * (i.e., no coalescing possible).
		 */
		bufcnt = (len + secsize - 1) / secsize;
		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

		/*
		 * Allocate an array of semaphores if we are doing I/O
117584b8276mg		 * scheduling.
117684b8276mg		 */
117784b8276mg		if (fsp->hqueue != NULL)
117884b8276mg			fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
117984b8276mg			    KM_SLEEP);
11807c478bdstevel@tonic-gate		for (count = 0; count < bufcnt; count++) {
118184b8276mg			bioinit(&bufs[count]);
11827c478bdstevel@tonic-gate			bufs[count].b_edev = devvp->v_rdev;
11837c478bdstevel@tonic-gate			bufs[count].b_dev = cmpdev(devvp->v_rdev);
11847c478bdstevel@tonic-gate			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
11857c478bdstevel@tonic-gate			bufs[count].b_iodone = hsfs_iodone;
11867c478bdstevel@tonic-gate			bufs[count].b_vp = vp;
11877c478bdstevel@tonic-gate			bufs[count].b_file = vp;
11887c478bdstevel@tonic-gate		}
11897c478bdstevel@tonic-gate
11908cd7c4fpeterte		/*
11918cd7c4fpeterte		 * If our filesize is not an integer multiple of PAGESIZE,
11928cd7c4fpeterte		 * we zero that part of the last page that's between EOF and
11938cd7c4fpeterte		 * the PAGESIZE boundary.
11948cd7c4fpeterte		 */
11957c478bdstevel@tonic-gate		xlen = io_len & PAGEOFFSET;
11967c478bdstevel@tonic-gate		if (xlen != 0)
11977c478bdstevel@tonic-gate			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
11987c478bdstevel@tonic-gate
11997c478bdstevel@tonic-gate		va = NULL;
12007c478bdstevel@tonic-gate		lastp = NULL;
12017c478bdstevel@tonic-gate		searchp = pp;
12027c478bdstevel@tonic-gate		io_end = io_off + io_len;
12037c478bdstevel@tonic-gate		for (count = 0, byte_offset = io_off;
1204d10b670frankho		    byte_offset < io_end; count++) {
12057c478bdstevel@tonic-gate			ASSERT(count < bufcnt);
12067c478bdstevel@tonic-gate
12077c478bdstevel@tonic-gate			/* Compute disk address for interleaving. */
12087c478bdstevel@tonic-gate
12097c478bdstevel@tonic-gate			/* considered without skips */
12107c478bdstevel@tonic-gate			which_chunk_lbn = byte_offset / chunk_data_bytes;
12117c478bdstevel@tonic-gate
12127c478bdstevel@tonic-gate			/* factor in skips */
12137c478bdstevel@tonic-gate			offset_lbn = which_chunk_lbn * chunk_lbn_count;
12147c478bdstevel@tonic-gate
12157c478bdstevel@tonic-gate			/* convert to physical byte offset for lbn */
12167c478bdstevel@tonic-gate			offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
12177c478bdstevel@tonic-gate
12187c478bdstevel@tonic-gate			/* don't forget offset into lbn */
12197c478bdstevel@tonic-gate			offset_extra = byte_offset % chunk_data_bytes;
12207c478bdstevel@tonic-gate
12217c478bdstevel@tonic-gate			/* get virtual block number for driver */
1222d10b670frankho			driver_block =
1223d10b670frankho			    lbtodb(bof + xarsiz + offset_bytes + offset_extra);
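			/*
			 * A worked example with hypothetical values: for a
			 * non-interleaved file on 2048-byte logical blocks
			 * (chunk_lbn_count == 1, chunk_data_bytes == 2048)
			 * and byte_offset == 5000, which_chunk_lbn == 2,
			 * offset_bytes == 4096 and offset_extra == 904, so
			 * driver_block is lbtodb(bof + xarsiz + 5000).  The
			 * "factor in skips" step matters only when the file
			 * is interleaved, where chunk_lbn_count presumably
			 * also counts the skipped LBNs between data chunks.
			 */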
12247c478bdstevel@tonic-gate
12257c478bdstevel@tonic-gate			if (lastp != searchp) {
12267c478bdstevel@tonic-gate				/* this branch taken first time through loop */
1227d10b670frankho				va = vas[count] =
1228d10b670frankho				    ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
12297c478bdstevel@tonic-gate				/* ppmapin() guarantees not to return NULL */
12307c478bdstevel@tonic-gate			} else {
12317c478bdstevel@tonic-gate				vas[count] = NULL;
12327c478bdstevel@tonic-gate			}
12337c478bdstevel@tonic-gate
12347c478bdstevel@tonic-gate			bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
12357c478bdstevel@tonic-gate			bufs[count].b_offset =
12367c478bdstevel@tonic-gate			    (offset_t)(byte_offset - io_off + off);
12377c478bdstevel@tonic-gate
12387c478bdstevel@tonic-gate			/*
12397c478bdstevel@tonic-gate			 * We specifically use the b_lblkno member here,
12407c478bdstevel@tonic-gate			 * as driver_block can get very large even in the
12417c478bdstevel@tonic-gate			 * 32-bit world, in line with the ISO 9660 spec.
12427c478bdstevel@tonic-gate			 */
12437c478bdstevel@tonic-gate
12447c478bdstevel@tonic-gate			bufs[count].b_lblkno = driver_block;
12457c478bdstevel@tonic-gate
1246d10b670frankho			remaining_bytes =
1247d10b670frankho			    ((which_chunk_lbn + 1) * chunk_data_bytes)
1248d10b670frankho			    - byte_offset;
12497c478bdstevel@tonic-gate
12507c478bdstevel@tonic-gate			/*
12517c478bdstevel@tonic-gate			 * remaining_bytes can't be zero, as we derived
12527c478bdstevel@tonic-gate			 * which_chunk_lbn directly from byte_offset.
12537c478bdstevel@tonic-gate			 */
12549cbc422peterte			if ((remaining_bytes + byte_offset) < (off + len)) {
12557c478bdstevel@tonic-gate				/* coalesce-read the rest of the chunk */
12567c478bdstevel@tonic-gate				bufs[count].b_bcount = remaining_bytes;
12577c478bdstevel@tonic-gate			} else {
12587c478bdstevel@tonic-gate				/* get the final bits */
12597c478bdstevel@tonic-gate				bufs[count].b_bcount = off + len - byte_offset;
12607c478bdstevel@tonic-gate			}
12617c478bdstevel@tonic-gate
12627c478bdstevel@tonic-gate			/*
12637c478bdstevel@tonic-gate			 * It would be nice to do multiple pages'
12647c478bdstevel@tonic-gate			 * worth at once here when the opportunity
12657c478bdstevel@tonic-gate			 * arises, as that has been shown to improve
12667c478bdstevel@tonic-gate			 * our wall time.  However, to do that
12677c478bdstevel@tonic-gate			 * requires that we use the pageio subsystem,
12687c478bdstevel@tonic-gate			 * which doesn't mix well with what we're
12697c478bdstevel@tonic-gate			 * already using here.  We can't use pageio
12707c478bdstevel@tonic-gate			 * all the time, because that subsystem
12717c478bdstevel@tonic-gate			 * assumes that a page is stored in N
12727c478bdstevel@tonic-gate			 * contiguous blocks on the device.
12737c478bdstevel@tonic-gate			 * Interleaving violates that assumption.
127484b8276mg			 *
127584b8276mg			 * Update: This is now not so big a problem
127684b8276mg			 * because of the I/O scheduler sitting below
127784b8276mg			 * that can re-order and coalesce I/O requests.
12787c478bdstevel@tonic-gate			 */
12797c478bdstevel@tonic-gate
12807c478bdstevel@tonic-gate			remainder = PAGESIZE - (byte_offset % PAGESIZE);
12817c478bdstevel@tonic-gate			if (bufs[count].b_bcount > remainder) {
12827c478bdstevel@tonic-gate				bufs[count].b_bcount = remainder;
12837c478bdstevel@tonic-gate			}
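			/*
			 * Net effect of the sizing above: each buf covers
			 * MIN(rest of the current chunk, rest of the
			 * request, rest of the current page), so a single
			 * transfer never crosses a chunk or page boundary.
			 */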
12847c478bdstevel@tonic-gate
12857c478bdstevel@tonic-gate			bufs[count].b_bufsize = bufs[count].b_bcount;
12869cbc422peterte			if (((offset_t)byte_offset + bufs[count].b_bcount) >
1287d10b670frankho			    HS_MAXFILEOFF) {
12889cbc422peterte				break;
12899cbc422peterte			}
12907c478bdstevel@tonic-gate			byte_offset += bufs[count].b_bcount;
12917c478bdstevel@tonic-gate
129284b8276mg			if (fsp->hqueue == NULL) {
129384b8276mg				(void) bdev_strategy(&bufs[count]);
129484b8276mg
129584b8276mg			} else {
129684b8276mg				/*
129784b8276mg				 * We are scheduling I/O so we need to enqueue
129884b8276mg				 * requests rather than calling bdev_strategy
129984b8276mg				 * here. A later invocation of the scheduling
130084b8276mg				 * function will take care of doing the actual
130184b8276mg				 * I/O as it selects requests from the queue as
130284b8276mg				 * per the scheduling logic.
130384b8276mg				 */
130484b8276mg				struct hio *hsio = kmem_cache_alloc(hio_cache,
130584b8276mg				    KM_SLEEP);
130684b8276mg
130784b8276mg				sema_init(&fio_done[count], 0, NULL,
130884b8276mg				    SEMA_DEFAULT, NULL);
130984b8276mg				hsio->bp = &bufs[count];
131084b8276mg				hsio->sema = &fio_done[count];
131184b8276mg				hsio->io_lblkno = bufs[count].b_lblkno;
131284b8276mg				hsio->nblocks = howmany(hsio->bp->b_bcount,
131384b8276mg				    DEV_BSIZE);
131484b8276mg
131584b8276mg				/* used for deadline */
131684b8276mg				hsio->io_timestamp =
131784b8276mg				    drv_hztousec(ddi_get_lbolt());
131884b8276mg
131984b8276mg				/* for I/O coalescing */
132084b8276mg				hsio->contig_chain = NULL;
132184b8276mg				hsched_enqueue_io(fsp, hsio, 0);
132284b8276mg			}
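			/*
			 * The semaphore initialized to 0 above is the
			 * hand-off to the wait loop further down:
			 * presumably the scheduler posts fio_done[count]
			 * once this buf has been issued and has completed,
			 * letting the waiter distinguish its own requests
			 * from those queued by other threads.
			 */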
13237c478bdstevel@tonic-gate
13247c478bdstevel@tonic-gate			lwp_stat_update(LWP_STAT_INBLK, 1);
13257c478bdstevel@tonic-gate			lastp = searchp;
13267c478bdstevel@tonic-gate			if ((remainder - bufs[count].b_bcount) < 1) {
13277c478bdstevel@tonic-gate				searchp = searchp->p_next;
13287c478bdstevel@tonic-gate			}
13297c478bdstevel@tonic-gate		}
13307c478bdstevel@tonic-gate
13317c478bdstevel@tonic-gate		bufsused = count;
13327c478bdstevel@tonic-gate		/* Now wait for everything to come in */
133384b8276mg		if (fsp->hqueue == NULL) {
133484b8276mg			for (count = 0; count < bufsused; count++) {
133584b8276mg				if (err == 0) {
133684b8276mg					err = biowait(&bufs[count]);
133784b8276mg				} else
133884b8276mg					(void) biowait(&bufs[count]);
133984b8276mg			}
134084b8276mg		} else {
134184b8276mg			for (count = 0; count < bufsused; count++) {
134284b8276mg				struct buf *wbuf;
134384b8276mg
134484b8276mg				/*
134584b8276mg				 * Invoke the scheduling function until our
134684b8276mg				 * buf is processed. In doing so it may also
134784b8276mg				 * process bufs enqueued by other threads,
134884b8276mg				 * which is fine.
134984b8276mg				 */
135084b8276mg				wbuf = &bufs[count];
135184b8276mg				DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
135284b8276mg				while (sema_tryp(&fio_done[count]) == 0) {
135384b8276mg					/*
135484b8276mg					 * hsched_invoke_strategy() returns 1
135584b8276mg					 * when the I/O queue is empty, which
135684b8276mg					 * means some other thread has already
135784b8276mg					 * issued our buf and it is in flight,
135884b8276mg					 * so we simply block instead of spinning.
135984b8276mg					 */
136084b8276mg					if (hsched_invoke_strategy(fsp)) {
136184b8276mg						sema_p(&fio_done[count]);
136284b8276mg						break;
136384b8276mg					}
136484b8276mg				}
136584b8276mg				sema_destroy(&fio_done[count]);
136684b8276mg				DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
136784b8276mg
136884b8276mg				if (err == 0) {
136984b8276mg					err = geterror(wbuf);
137084b8276mg				}
137184b8276mg			}
137284b8276mg			kmem_free(fio_done, bufcnt * sizeof (ksema_t));
13737c478bdstevel@tonic-gate		}
13747c478bdstevel@tonic-gate
13757c478bdstevel@tonic-gate		/* Don't leak resources */
13767c478bdstevel@tonic-gate		for (count = 0; count < bufcnt; count++) {
137784b8276mg			biofini(&bufs[count]);
13787c478bdstevel@tonic-gate			if (count < bufsused && vas[count] != NULL) {
13797c478bdstevel@tonic-gate				ppmapout(vas[count]);
13807c478bdstevel@tonic-gate			}
13817c478bdstevel@tonic-gate		}
13827c478bdstevel@tonic-gate
13837c478bdstevel@tonic-gate		kmem_free(vas, bufcnt * sizeof (caddr_t));
13847c478bdstevel@tonic-gate		kmem_free(bufs, bufcnt * sizeof (struct buf));
13857c478bdstevel@tonic-gate	}
13867c478bdstevel@tonic-gate
13877c478bdstevel@tonic-gate	if (err) {
13887c478bdstevel@tonic-gate		pvn_read_done(pp, B_ERROR);
13897c478bdstevel@tonic-gate		return (err);
13907c478bdstevel@tonic-gate	}
13917c478bdstevel@tonic-gate
13927c478bdstevel@tonic-gate	/*
13937c478bdstevel@tonic-gate	 * Lock the requested page, and the one after it if possible.
13947c478bdstevel@tonic-gate	 * Don't bother if our caller hasn't given us a place to stash
13957c478bdstevel@tonic-gate	 * the page pointers, since otherwise we'd lock pages that would
13967c478bdstevel@tonic-gate	 * never get unlocked.
13977c478bdstevel@tonic-gate	 */
13987c478bdstevel@tonic-gate	if (pagefound) {
13997c478bdstevel@tonic-gate		int index;
14007c478bdstevel@tonic-gate		ulong_t soff;
14017c478bdstevel@tonic-gate
14027c478bdstevel@tonic-gate		/*
14037c478bdstevel@tonic-gate		 * Make sure it's in memory before we say it's here.
14047c478bdstevel@tonic-gate		 */
14057c478bdstevel@tonic-gate		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
14067c478bdstevel@tonic-gate			hsfs_lostpage++;
14077c478bdstevel@tonic-gate			goto reread;
14087c478bdstevel@tonic-gate		}
14097c478bdstevel@tonic-gate
14107c478bdstevel@tonic-gate		pl[0] = pp;
14117c478bdstevel@tonic-gate		index = 1;
141284b8276mg		atomic_inc_64(&fsp->cache_read_pages);
14137c478bdstevel@tonic-gate
14147c478bdstevel@tonic-gate		/*
14157c478bdstevel@tonic-gate		 * Try to lock the next page, if it exists, without
14167c478bdstevel@tonic-gate		 * blocking.
14177c478bdstevel@tonic-gate		 */
14187c478bdstevel@tonic-gate		plsz -= PAGESIZE;
14197c478bdstevel@tonic-gate		/* LINTED (plsz is unsigned) */
14207c478bdstevel@tonic-gate		for (soff = off + PAGESIZE; plsz > 0;
14217c478bdstevel@tonic-gate		    soff += PAGESIZE, plsz -= PAGESIZE) {
14227c478bdstevel@tonic-gate			pp = page_lookup_nowait(vp, (u_offset_t)soff,
1423d10b670frankho			    SE_SHARED);
14247c478bdstevel@tonic-gate			if (pp == NULL)
14257c478bdstevel@tonic-gate				break;
14267c478bdstevel@tonic-gate			pl[index++] = pp;
14277c478bdstevel@tonic-gate		}
14287c478bdstevel@tonic-gate		pl[index] = NULL;
142984b8276mg
143084b8276mg		/*
143184b8276mg		 * Schedule a semi-asynchronous readahead if we are
143284b8276mg		 * accessing the last cached page for the current
143384b8276mg		 * file.
143484b8276mg		 *
143584b8276mg		 * Doing this here means that readaheads are issued
143684b8276mg		 * only when cache hits occur, which is exactly when
143784b8276mg		 * readahead is giving the desired benefit.  If cache
143884b8276mg		 * hits do not occur, there is no point in reading
143984b8276mg		 * ahead of time; the system is loaded anyway.
144184b8276mg		 */
144284b8276mg		if (fsp->hqueue != NULL &&
144384b8276mg		    hp->hs_prev_offset - off == PAGESIZE &&
144484b8276mg		    hp->hs_prev_offset < filsiz &&
144584b8276mg		    hp->hs_ra_bytes > 0 &&
144684b8276mg		    !page_exists(vp, hp->hs_prev_offset)) {
144784b8276mg			(void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
144884b8276mg			    addr + PAGESIZE, hp, fsp, xarsiz, bof,
144984b8276mg			    chunk_lbn_count, chunk_data_bytes);
145084b8276mg		}
145184b8276mg
14527c478bdstevel@tonic-gate		return (0);
14537c478bdstevel@tonic-gate	}
14547c478bdstevel@tonic-gate
14557c478bdstevel@tonic-gate	if (pp != NULL) {
14567c478bdstevel@tonic-gate		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
14577c478bdstevel@tonic-gate	}
14587c478bdstevel@tonic-gate
14597c478bdstevel@tonic-gate	return (err);
14607c478bdstevel@tonic-gate}
14617c478bdstevel@tonic-gate
1462da6c28aamw/*ARGSUSED*/
14637c478bdstevel@tonic-gatestatic int
1464ade42b5Sebastien Royhsfs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
1465ade42b5Sebastien Roy    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
1466ade42b5Sebastien Roy    enum seg_rw rw, struct cred *cred, caller_context_t *ct)
14677c478bdstevel@tonic-gate{
14687c478bdstevel@tonic-gate	uint_t filsiz;
146984b8276mg	struct hsfs *fsp;
147084b8276mg	struct hsnode *hp;
147184b8276mg
147284b8276mg	fsp = VFS_TO_HSFS(vp->v_vfsp);
147384b8276mg	hp = VTOH(vp);
14747c478bdstevel@tonic-gate
14757c478bdstevel@tonic-gate	/* does not support write */
14767c478bdstevel@tonic-gate	if (rw == S_WRITE) {
1477de4ddf9Keith M Wesolowski		return (EROFS);
14787c478bdstevel@tonic-gate	}
14797c478bdstevel@tonic-gate
14807c478bdstevel@tonic-gate	if (vp->v_flag & VNOMAP) {
14817c478bdstevel@tonic-gate		return (ENOSYS);
14827c478bdstevel@tonic-gate	}
14837c478bdstevel@tonic-gate
14849cbc422peterte	ASSERT(off <= HS_MAXFILEOFF);
14857c478bdstevel@tonic-gate
14867c478bdstevel@tonic-gate	/*
14877c478bdstevel@tonic-gate	 * Determine file data size for EOF check.
14887c478bdstevel@tonic-gate	 */
14897c478bdstevel@tonic-gate	filsiz = hp->hs_dirent.ext_size;
14907c478bdstevel@tonic-gate	if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
14917c478bdstevel@tonic-gate		return (EFAULT);	/* beyond EOF */
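	/*
	 * The PAGEOFFSET slack above permits access up to the end of the
	 * last, possibly partial, page of the file; segkmap callers are
	 * exempt, presumably because segmap operates on fixed-size chunks
	 * that may extend past EOF.
	 */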
14927c478bdstevel@tonic-gate
149384b8276mg	/*
149484b8276mg	 * Async read-ahead computation.
149584b8276mg	 * This attempts to detect a sequential access pattern and
149684b8276mg	 * enables reading extra pages ahead of time.
149784b8276mg	 */
149884b8276mg	if (fsp->hqueue != NULL) {
149984b8276mg		/*
150084b8276mg		 * This check for sequential access also takes into
150184b8276mg		 * account segmap weirdness when reading in chunks
150284b8276mg		 * less than the segmap size of 8K.
150384b8276mg		 */
150484b8276mg		if (hp->hs_prev_offset == off || (off <
150584b8276mg		    hp->hs_prev_offset && off + MAX(len, PAGESIZE)
150684b8276mg		    >= hp->hs_prev_offset)) {
150784b8276mg			if (hp->hs_num_contig <
150884b8276mg			    (seq_contig_requests - 1)) {
150984b8276mg				hp->hs_num_contig++;
151084b8276mg
151184b8276mg			} else {
151284b8276mg				/*
151384b8276mg				 * We increase the readahead quantum up to
151484b8276mg				 * a predefined maximum (hqueue->max_ra_bytes),
151584b8276mg				 * which is a multiple of PAGESIZE.
151684b8276mg				 */
151784b8276mg				if (hp->hs_ra_bytes <
151884b8276mg				    fsp->hqueue->max_ra_bytes) {
151984b8276mg					hp->hs_ra_bytes += PAGESIZE;
152084b8276mg				}
152184b8276mg			}
152284b8276mg		} else {
152384b8276mg			/*
152484b8276mg			 * Not contiguous, so reduce the readahead counters.
152584b8276mg			 */
152684b8276mg			if (hp->hs_ra_bytes > 0)
152784b8276mg				hp->hs_ra_bytes -= PAGESIZE;
152884b8276mg
152984b8276mg			if (hp->hs_ra_bytes <= 0) {
153084b8276mg				hp->hs_ra_bytes = 0;
153184b8276mg				if (hp->hs_num_contig > 0)
153284b8276mg					hp->hs_num_contig--;
153384b8276mg			}
153484b8276mg		}
153584b8276mg		/*
153684b8276mg		 * The length must be rounded up to a page boundary,
153784b8276mg		 * since we read in units of pages.
153884b8276mg		 */
153984b8276mg		hp->hs_prev_offset = off + roundup(len, PAGESIZE);
154084b8276mg		DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
154184b8276mg	}
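	/*
	 * To illustrate the heuristic above with a hypothetical value of
	 * seq_contig_requests == 4: the first three back-to-back sequential
	 * requests only bump hs_num_contig; from the fourth onwards each
	 * sequential request grows hs_ra_bytes by one PAGESIZE until it
	 * reaches hqueue->max_ra_bytes, while any non-sequential request
	 * shrinks hs_ra_bytes again and, once that hits zero, decays
	 * hs_num_contig.
	 */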
15427c478bdstevel@tonic-gate	if (protp != NULL)
15437c478bdstevel@tonic-gate		*protp = PROT_ALL;
15447c478bdstevel@tonic-gate
154506e6833Josef 'Jeff' Sipek	return (pvn_getpages(hsfs_getapage, vp, off, len, protp, pl, plsz,
154606e6833Josef 'Jeff' Sipek	    seg, addr, rw, cred));
15477c478bdstevel@tonic-gate}
15487c478bdstevel@tonic-gate
15497c478bdstevel@tonic-gate
15507c478bdstevel@tonic-gate
15517c478bdstevel@tonic-gate/*
15527c478bdstevel@tonic-gate * This function should never be called.  It exists only so that it can
15537c478bdstevel@tonic-gate * be passed as an argument to other functions (e.g. pvn_vplist_dirty()).
15547c478bdstevel@tonic-gate */
15557c478bdstevel@tonic-gate/*ARGSUSED*/
15567c478bdstevel@tonic-gateint
1557ade42b5Sebastien Royhsfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1558ade42b5Sebastien Roy    int flags, cred_t *cr)
15597c478bdstevel@tonic-gate{
15607c478bdstevel@tonic-gate	/* should never happen - just destroy it */
15617c478bdstevel@tonic-gate	cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
15627c478bdstevel@tonic-gate	pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
15637c478bdstevel@tonic-gate	return (0);
15647c478bdstevel@tonic-gate}
15657c478bdstevel@tonic-gate
15667c478bdstevel@tonic-gate
15677c478bdstevel@tonic-gate/*
15687c478bdstevel@tonic-gate * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
15697c478bdstevel@tonic-gate * B_INVAL is set by:
15707c478bdstevel@tonic-gate *
15717c478bdstevel@tonic-gate *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
15727c478bdstevel@tonic-gate *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
15737c478bdstevel@tonic-gate *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
15747c478bdstevel@tonic-gate *
15757c478bdstevel@tonic-gate * The B_FREE (as well as the B_DONTNEED) flag is set when the
15767c478bdstevel@tonic-gate * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
15777c478bdstevel@tonic-gate * from SEGVN to release pages behind a pagefault.
15787c478bdstevel@tonic-gate */
15797c478bdstevel@tonic-gate/*ARGSUSED*/
15807c478bdstevel@tonic-gatestatic int
1581ade42b5Sebastien Royhsfs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
1582ade42b5Sebastien Roy    struct cred *cr, caller_context_t *ct)
15837c478bdstevel@tonic-gate{
15847c478bdstevel@tonic-gate	int error = 0;
15857c478bdstevel@tonic-gate
15867c478bdstevel@tonic-gate	if (vp->v_count == 0) {
15877c478bdstevel@tonic-gate		panic("hsfs_putpage: bad v_count");
15887c478bdstevel@tonic-gate		/*NOTREACHED*/
15897c478bdstevel@tonic-gate	}
15907c478bdstevel@tonic-gate
15917c478bdstevel@tonic-gate	if (vp->v_flag & VNOMAP)
15927c478bdstevel@tonic-gate		return (ENOSYS);
15937c478bdstevel@tonic-gate
15949cbc422peterte	ASSERT(off <= HS_MAXFILEOFF);
15957c478bdstevel@tonic-gate
15967c478bdstevel@tonic-gate	if (!vn_has_cached_data(vp))	/* no pages mapped */
15977c478bdstevel@tonic-gate		return (0);
15987c478bdstevel@tonic-gate
1599d10b670frankho	if (len == 0) {		/* from 'off' to EOF */
1600d10b670frankho		error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr);
1601d10b670frankho	} else {
16027c478bdstevel@tonic-gate		offset_t end_off = off + len;
16037c478bdstevel@tonic-gate		offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
16047c478bdstevel@tonic-gate		offset_t io_off;
16057c478bdstevel@tonic-gate
16067c478bdstevel@tonic-gate		file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
16077c478bdstevel@tonic-gate		if (end_off > file_size)
16087c478bdstevel@tonic-gate			end_off = file_size;
16097c478bdstevel@tonic-gate
16107c478bdstevel@tonic-gate		for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
16117c478bdstevel@tonic-gate			page_t *pp;
16127c478bdstevel@tonic-gate
16137c478bdstevel@tonic-gate			/*
16147c478bdstevel@tonic-gate			 * We insist on getting the page only if we are
16157c478bdstevel@tonic-gate			 * about to invalidate, free or write it and
16167c478bdstevel@tonic-gate			 * the B_ASYNC flag is not set.
16177c478bdstevel@tonic-gate			 */
16187c478bdstevel@tonic-gate			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
16197c478bdstevel@tonic-gate				pp = page_lookup(vp, io_off,
1620d10b670frankho				    (flags & (B_INVAL | B_FREE)) ?
1621d10b670frankho				    SE_EXCL : SE_SHARED);
16227c478bdstevel@tonic-gate			} else {
16237c478bdstevel@tonic-gate				pp = page_lookup_nowait(vp, io_off,
1624d10b670frankho				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
16257c478bdstevel@tonic-gate			}
16267c478bdstevel@tonic-gate
16277c478bdstevel@tonic-gate			if (pp == NULL)
16287c478bdstevel@tonic-gate				continue;
162984b8276mg
16307c478bdstevel@tonic-gate			/*
16317c478bdstevel@tonic-gate			 * Normally pvn_getdirty() should return 0, which
16327c478bdstevel@tonic-gate			 * implies that it has done the job for us.
16337c478bdstevel@tonic-gate			 * The shouldn't-happen scenario is when it returns 1.
16347c478bdstevel@tonic-gate			 * This means that the page has been modified and
16357c478bdstevel@tonic-gate			 * needs to be put back.
16367c478bdstevel@tonic-gate			 * Since we can't write on a CD, we fake a failed
16377c478bdstevel@tonic-gate			 * I/O and force pvn_write_done() to destroy the page.
16387c478bdstevel@tonic-gate			 */
16397c478bdstevel@tonic-gate			if (pvn_getdirty(pp, flags) == 1) {
16407c478bdstevel@tonic-gate				cmn_err(CE_NOTE,
1641d10b670frankho				    "hsfs_putpage: dirty HSFS page");
16427c478bdstevel@tonic-gate				pvn_write_done(pp, flags |
16437c478bdstevel@tonic-gate				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
16447c478bdstevel@tonic-gate			}
16457c478bdstevel@tonic-gate		}
16467c478bdstevel@tonic-gate	}
16477c478bdstevel@tonic-gate	return (error);
16487c478bdstevel@tonic-gate}
16497c478bdstevel@tonic-gate
16507c478bdstevel@tonic-gate
16517c478bdstevel@tonic-gate/*ARGSUSED*/
16527c478bdstevel@tonic-gatestatic int
1653ade42b5Sebastien Royhsfs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
1654ade42b5Sebastien Roy    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cred,
1655ade42b5Sebastien Roy    caller_context_t *ct)
16567c478bdstevel@tonic-gate{
16577c478bdstevel@tonic-gate	struct segvn_crargs vn_a;
16587c478bdstevel@tonic-gate	int error;
16597c478bdstevel@tonic-gate
16607c478bdstevel@tonic-gate	/* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
16617c478bdstevel@tonic-gate
16627c478bdstevel@tonic-gate	if (vp->v_flag & VNOMAP)
16637c478bdstevel@tonic-gate		return (ENOSYS);
16647c478bdstevel@tonic-gate
1665277b8dcHans Rosenfeld	if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
1666de4ddf9Keith M Wesolowski		return (ENOSYS);
1667de4ddf9Keith M Wesolowski
16689cbc422peterte	if (off > HS_MAXFILEOFF || off < 0 ||
16699cbc422peterte	    (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1670cfa5501peterte		return (ENXIO);
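	/*
	 * Note that the (off + len) < 0 test above presumably also guards
	 * against the sum wrapping negative, not just a negative len.
	 */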
16717c478bdstevel@tonic-gate
16727c478bdstevel@tonic-gate	if (vp->v_type != VREG) {
16737c478bdstevel@tonic-gate		return (ENODEV);
16747c478bdstevel@tonic-gate	}
16757c478bdstevel@tonic-gate
16767c478bdstevel@tonic-gate	/*
16777c478bdstevel@tonic-gate	 * If the file is being locked, disallow mapping.
16787c478bdstevel@tonic-gate	 */
16797c478bdstevel@tonic-gate	if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
16807c478bdstevel@tonic-gate		return (EAGAIN);
16817c478bdstevel@tonic-gate
16827c478bdstevel@tonic-gate	as_rangelock(as);
168360946femec	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
168460946femec	if (error != 0) {
168560946femec		as_rangeunlock(as);
168660946femec		return (error);
16877c478bdstevel@tonic-gate	}
16887c478bdstevel@tonic-gate
16897c478bdstevel@tonic-gate	vn_a.vp = vp;
16907c478bdstevel@tonic-gate	vn_a.offset = off;
16917c478bdstevel@tonic-gate	vn_a.type = flags & MAP_TYPE;
16927c478bdstevel@tonic-gate	vn_a.prot = prot;
16937c478bdstevel@tonic-gate	vn_a.maxprot = maxprot;
16947c478bdstevel@tonic-gate	vn_a.flags = flags & ~MAP_TYPE;
16957c478bdstevel@tonic-gate	vn_a.cred = cred;
16967c478bdstevel@tonic-gate	vn_a.amp = NULL;
16977c478bdstevel@tonic-gate	vn_a.szc = 0;
16987c478bdstevel@tonic-gate	vn_a.lgrp_mem_policy_flags = 0;
16997c478bdstevel@tonic-gate
17007c478bdstevel@tonic-gate	error = as_map(as, *addrp, len, segvn_create, &vn_a);
17017c478bdstevel@tonic-gate	as_rangeunlock(as);
17027c478bdstevel@tonic-gate	return (error);
17037c478bdstevel@tonic-gate}
17047c478bdstevel@tonic-gate
17057c478bdstevel@tonic-gate/* ARGSUSED */
17067c478bdstevel@tonic-gatestatic int
1707ade42b5Sebastien Royhsfs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
1708ade42b5Sebastien Roy    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
1709ade42b5Sebastien Roy    caller_context_t *ct)
17107c478bdstevel@tonic-gate{
17117c478bdstevel@tonic-gate	struct hsnode *hp;
17127c478bdstevel@tonic-gate
17137c478bdstevel@tonic-gate	if (vp->v_flag & VNOMAP)
17147c478bdstevel@tonic-gate		return (ENOSYS);
17157c478bdstevel@tonic-gate
17167c478bdstevel@tonic-gate	hp = VTOH(vp);
17177c478bdstevel@tonic-gate	mutex_enter(&hp->hs_contents_lock);
17187c478bdstevel@tonic-gate	hp->hs_mapcnt += btopr(len);
17197c478bdstevel@tonic-gate	mutex_exit(&hp->hs_contents_lock);
17207c478bdstevel@tonic-gate	return (0);
17217c478bdstevel@tonic-gate}
17227c478bdstevel@tonic-gate
17237c478bdstevel@tonic-gate/*ARGSUSED*/
17247c478bdstevel@tonic-gatestatic int
1725ade42b5Sebastien Royhsfs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
1726ade42b5Sebastien Roy    size_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr,
1727ade42b5Sebastien Roy    caller_context_t *ct)
17287c478bdstevel@tonic-gate{
17297c478bdstevel@tonic-gate	struct hsnode *hp;
17307c478bdstevel@tonic-gate
17317c478bdstevel@tonic-gate	if (vp->v_flag & VNOMAP)
17327c478bdstevel@tonic-gate		return (ENOSYS);
17337c478bdstevel@tonic-gate
17347c478bdstevel@tonic-gate	hp = VTOH(vp);
17357c478bdstevel@tonic-gate	mutex_enter(&hp->hs_contents_lock);
17367c478bdstevel@tonic-gate	hp->hs_mapcnt -= btopr(len);	/* Count released mappings */
17377c478bdstevel@tonic-gate	ASSERT(hp->hs_mapcnt >= 0);
17387c478bdstevel@tonic-gate	mutex_exit(&hp->hs_contents_lock);
17397c478bdstevel@tonic-gate	return (0);
17407c478bdstevel@tonic-gate}
17417c478bdstevel@tonic-gate
17427c478bdstevel@tonic-gate/* ARGSUSED */
17437c478bdstevel@tonic-gatestatic int
1744ade42b5Sebastien Royhsfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1745ade42b5Sebastien Roy    caller_context_t *ct)
17467c478bdstevel@tonic-gate{
17477c478bdstevel@tonic-gate	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
17487c478bdstevel@tonic-gate}
17497c478bdstevel@tonic-gate
17507c478bdstevel@tonic-gate/* ARGSUSED */
17517c478bdstevel@tonic-gatestatic int
1752ade42b5Sebastien Royhsfs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
1753ade42b5Sebastien Roy    offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
1754ade42b5Sebastien Roy    caller_context_t *ct)
17557c478bdstevel@tonic-gate{
17567c478bdstevel@tonic-gate	struct hsnode *hp = VTOH(vp);
17577c478bdstevel@tonic-gate
17587c478bdstevel@tonic-gate	/*
17597c478bdstevel@tonic-gate	 * If the file is being mapped, disallow fs_frlock.
17607c478bdstevel@tonic-gate	 * We are not holding the hs_contents_lock while checking
17617c478bdstevel@tonic-gate	 * hs_mapcnt because the current locking strategy drops all
17627c478bdstevel@tonic-gate	 * locks before calling fs_frlock.
17637c478bdstevel@tonic-gate	 * So, hs_mapcnt could change before we enter fs_frlock making
17647c478bdstevel@tonic-gate	 * it meaningless to have held hs_contents_lock in the first place.
17657c478bdstevel@tonic-gate	 */
17667c478bdstevel@tonic-gate	if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
17677c478bdstevel@tonic-gate		return (EAGAIN);
17687c478bdstevel@tonic-gate
1769da6c28aamw	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
17707c478bdstevel@tonic-gate}
17717c478bdstevel@tonic-gate
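/*
 * Comparison routines for the I/O scheduler's sorted request queues.
 * Both return -1, 0 or 1 as an ordered-tree (presumably AVL) comparator
 * requires, and fall back to comparing the hio addresses themselves so
 * that distinct requests with identical keys still sort into a total
 * order.
 */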
177284b8276mgstatic int
177384b8276mghsched_deadline_compare(const void *x1, const void *x2)
177484b8276mg{
177584b8276mg	const struct hio *h1 = x1;
177684b8276mg	const struct hio *h2 = x2;
177784b8276mg
177884b8276mg	if (h1->io_timestamp < h2->io_timestamp)
177984b8276mg		return (-1);
178084b8276mg	if (h1->io_timestamp > h2->io_timestamp)
178184b8276mg		return (1);
178284b8276mg
178384b8276mg	if (h1->io_lblkno < h2->io_lblkno)
178484b8276mg		return (-1);
178584b8276mg	if (h1->io_lblkno > h2->io_lblkno)
178684b8276mg		return (1);
178784b8276mg
178884b8276mg	if (h1 < h2)
178984b8276mg		return (-1);
179084b8276mg	if (h1 > h2)
179184b8276mg		return (1);
179284b8276mg
179384b8276mg	return (0);
179484b8276mg}
179584b8276mg
179684b8276mgstatic int
179784b8276mghsched_offset_compare(const void *x1, const void *x2)
179884b8276mg{
179984b8276mg	const struct hio *h1 = x1;
180084b8276mg	const struct hio *h2 = x2;
180184b8276mg
180284b8276mg	if (h1->io_lblkno < h2->io_lblkno)
180384b8276mg		return (-1);
180484b8276mg	if (h1->io_lblkno > h2->io_lblkno)
180584b8276mg		return (1);
180684b8276mg
180784b8276mg	if (h1 < h2)
180884b8276mg		return (-1);
180984b8276mg	if (h1 > h2)
181084b8276mg		return (1);
181184b8276mg
181284b8276mg	return (0);
181384b8276mg}
181484b8276mg
181584b8276mgvoid
181684b8276mghsched_init_caches(void)
181784b8276mg{
181884b8276mg	hio_cache = kmem_cache_create("hsfs_hio_cache",
181984b8276mg	    sizeof (struct hio), 0, NULL,
182084b8276mg	    NULL, NULL, NULL, NULL, 0);
182184b8276mg
182284b8276mg	hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
182384b8276mg	    sizeof (struct hio_info), 0, NULL,
182484b8276mg	    NULL, NULL, NULL, NULL, 0);
182584b8276mg}
182684b8276mg
182784b8276mgvoid
182884b8276mghsched_fini_caches(void)
182984b8276mg{
183084b8276mg	kmem_cache_destroy(hio_cache);
183184b8276mg	kmem_cache_destroy(hio_info_cache);
183284b8276mg}
183384b8276mg
183484b8276mg/*
183584b8276mg * Initialize I/O scheduling structures.  This is called via hsfs_mount.
183684b8276mg */
183784b8276mgvoid
183884b8276