xref: /illumos-gate/usr/src/uts/common/fs/dcfs/dc_vnops.c (revision ade42b55)
1986fd29aSsetje 
2986fd29aSsetje /*
3986fd29aSsetje  * CDDL HEADER START
4986fd29aSsetje  *
5986fd29aSsetje  * The contents of this file are subject to the terms of the
6986fd29aSsetje  * Common Development and Distribution License (the "License").
7986fd29aSsetje  * You may not use this file except in compliance with the License.
8986fd29aSsetje  *
9986fd29aSsetje  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10986fd29aSsetje  * or http://www.opensolaris.org/os/licensing.
11986fd29aSsetje  * See the License for the specific language governing permissions
12986fd29aSsetje  * and limitations under the License.
13986fd29aSsetje  *
14986fd29aSsetje  * When distributing Covered Code, include this CDDL HEADER in each
15986fd29aSsetje  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16986fd29aSsetje  * If applicable, add the following below this CDDL HEADER, with the
17986fd29aSsetje  * fields enclosed by brackets "[]" replaced with your own identifying
18986fd29aSsetje  * information: Portions Copyright [yyyy] [name of copyright owner]
19986fd29aSsetje  *
20986fd29aSsetje  * CDDL HEADER END
21986fd29aSsetje  */
22986fd29aSsetje /*
230fbb751dSJohn Levon  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24*ade42b55SSebastien Roy  * Copyright (c) 2017 by Delphix. All rights reserved.
25986fd29aSsetje  */
26986fd29aSsetje 
27986fd29aSsetje /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28986fd29aSsetje /*	  All Rights Reserved  	*/
29986fd29aSsetje 
30986fd29aSsetje /*
31986fd29aSsetje  * University Copyright- Copyright (c) 1982, 1986, 1988
32986fd29aSsetje  * The Regents of the University of California
33986fd29aSsetje  * All Rights Reserved
34986fd29aSsetje  *
35986fd29aSsetje  * University Acknowledgment- Portions of this document are derived from
36986fd29aSsetje  * software developed by the University of California, Berkeley, and its
37986fd29aSsetje  * contributors.
38986fd29aSsetje  */
39986fd29aSsetje 
40986fd29aSsetje #include <sys/types.h>
41986fd29aSsetje #include <sys/thread.h>
42986fd29aSsetje #include <sys/t_lock.h>
43986fd29aSsetje #include <sys/param.h>
44986fd29aSsetje #include <sys/systm.h>
45986fd29aSsetje #include <sys/bitmap.h>
46986fd29aSsetje #include <sys/buf.h>
47986fd29aSsetje #include <sys/cmn_err.h>
48986fd29aSsetje #include <sys/conf.h>
49986fd29aSsetje #include <sys/ddi.h>
50986fd29aSsetje #include <sys/debug.h>
51986fd29aSsetje #include <sys/errno.h>
52986fd29aSsetje #include <sys/time.h>
53986fd29aSsetje #include <sys/fcntl.h>
54986fd29aSsetje #include <sys/flock.h>
55986fd29aSsetje #include <sys/file.h>
56986fd29aSsetje #include <sys/kmem.h>
57986fd29aSsetje #include <sys/mman.h>
58986fd29aSsetje #include <sys/vmsystm.h>
59986fd29aSsetje #include <sys/open.h>
60986fd29aSsetje #include <sys/swap.h>
61986fd29aSsetje #include <sys/sysmacros.h>
62986fd29aSsetje #include <sys/uio.h>
63986fd29aSsetje #include <sys/vfs.h>
64986fd29aSsetje #include <sys/vfs_opreg.h>
65986fd29aSsetje #include <sys/vnode.h>
66986fd29aSsetje #include <sys/stat.h>
67986fd29aSsetje #include <sys/poll.h>
68986fd29aSsetje #include <sys/zmod.h>
69986fd29aSsetje #include <sys/fs/decomp.h>
70986fd29aSsetje 
71986fd29aSsetje #include <vm/hat.h>
72986fd29aSsetje #include <vm/as.h>
73986fd29aSsetje #include <vm/page.h>
74986fd29aSsetje #include <vm/pvn.h>
75986fd29aSsetje #include <vm/seg_vn.h>
76986fd29aSsetje #include <vm/seg_kmem.h>
77986fd29aSsetje #include <vm/seg_map.h>
78986fd29aSsetje 
79986fd29aSsetje #include <fs/fs_subr.h>
80986fd29aSsetje 
/*
 * dcfs - A filesystem for automatic decompressing of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host file system.
 * This is a pseudo filesystem in that it does not persist data, rather it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via a ufs ioctl() - see
 * `ufs_vnops.c`ufs_ioctl()`_FIO_COMPRESSED).
 * ufs_lookup checks for this flag and, if set, passes control to decompvp,
 * a function defined in this (dcfs) filesystem. decompvp uncompresses the
 * file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. The compressed files currently cannot be
 * written to.
 */
103986fd29aSsetje 
104986fd29aSsetje 
/*
 * Define data structures within this file.
 */
#define	DCSHFT		5	/* pointer bits dropped before hashing */
#define	DCTABLESIZE	16	/* number of entries in dctable[] */

/*
 * Map a vnode pointer to an index in the range [0, DCTABLESIZE).
 * A mask is used when DCTABLESIZE is a power of two, a modulo otherwise.
 */
#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4	/* number of entries in dcbuf_cache[] */

/* Round x down to a multiple of y; only correct when y is a power of two. */
#define	rounddown(x, y)	((x) & ~((y) - 1))
122986fd29aSsetje 
123986fd29aSsetje struct dcnode	*dctable[DCTABLESIZE];
124986fd29aSsetje 
125986fd29aSsetje struct dcnode	*dclru;
126986fd29aSsetje static int	dclru_len;
127986fd29aSsetje 
128986fd29aSsetje kmutex_t	dctable_lock;
129986fd29aSsetje 
130986fd29aSsetje dev_t		dcdev;
131986fd29aSsetje struct vfs	dc_vfs;
132986fd29aSsetje 
133986fd29aSsetje struct kmem_cache *dcnode_cache;
134986fd29aSsetje struct kmem_cache *dcbuf_cache[DCCACHESIZE];
135986fd29aSsetje 
136986fd29aSsetje kmutex_t	dccache_lock;
137986fd29aSsetje 
138986fd29aSsetje static int dcinit(int, char *);
139986fd29aSsetje 
140986fd29aSsetje static struct dcnode	*dcnode_alloc(void);
141986fd29aSsetje static void		dcnode_free(struct dcnode *);
142986fd29aSsetje static void		dcnode_recycle(struct dcnode *);
143986fd29aSsetje 
144986fd29aSsetje static void		dcinsert(struct dcnode *);
145986fd29aSsetje static void		dcdelete(struct dcnode *);
146986fd29aSsetje static struct dcnode	*dcfind(struct vnode *);
147986fd29aSsetje static void		dclru_add(struct dcnode *);
148986fd29aSsetje static void		dclru_sub(struct dcnode *);
149986fd29aSsetje 
150986fd29aSsetje 
151986fd29aSsetje /*
152986fd29aSsetje  * This is the loadable module wrapper.
153986fd29aSsetje  */
154986fd29aSsetje #include <sys/modctl.h>
155986fd29aSsetje 
156986fd29aSsetje struct vfsops *dc_vfsops;
157986fd29aSsetje 
158986fd29aSsetje static vfsdef_t vfw = {
159986fd29aSsetje 	VFSDEF_VERSION,
160986fd29aSsetje 	"dcfs",
161986fd29aSsetje 	dcinit,
1620fbb751dSJohn Levon 	VSW_ZMOUNT,
163986fd29aSsetje 	NULL
164986fd29aSsetje };
165986fd29aSsetje 
166986fd29aSsetje /*
167986fd29aSsetje  * Module linkage information for the kernel.
168986fd29aSsetje  */
169986fd29aSsetje extern struct mod_ops mod_fsops;
170986fd29aSsetje 
171986fd29aSsetje static struct modlfs modlfs = {
172986fd29aSsetje 	&mod_fsops, "compressed filesystem", &vfw
173986fd29aSsetje };
174986fd29aSsetje 
175986fd29aSsetje static struct modlinkage modlinkage = {
176986fd29aSsetje 	MODREV_1, (void *)&modlfs, NULL
177986fd29aSsetje };
178986fd29aSsetje 
179986fd29aSsetje int
_init()180986fd29aSsetje _init()
181986fd29aSsetje {
182986fd29aSsetje 	return (mod_install(&modlinkage));
183986fd29aSsetje }
184986fd29aSsetje 
185986fd29aSsetje int
_info(struct modinfo * modinfop)186986fd29aSsetje _info(struct modinfo *modinfop)
187986fd29aSsetje {
188986fd29aSsetje 	return (mod_info(&modlinkage, modinfop));
189986fd29aSsetje }
190986fd29aSsetje 
191986fd29aSsetje 
192986fd29aSsetje static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
193986fd29aSsetje static int dc_close(struct vnode *, int, int, offset_t,
194986fd29aSsetje     struct cred *, caller_context_t *);
195986fd29aSsetje static int dc_read(struct vnode *, struct uio *, int, struct cred *,
196986fd29aSsetje     struct caller_context *);
197986fd29aSsetje static int dc_getattr(struct vnode *, struct vattr *, int,
198986fd29aSsetje     struct cred *, caller_context_t *);
199986fd29aSsetje static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
200986fd29aSsetje     struct caller_context *);
201986fd29aSsetje static int dc_access(struct vnode *, int, int,
202986fd29aSsetje     struct cred *, caller_context_t *);
203986fd29aSsetje static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
204986fd29aSsetje static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
205986fd29aSsetje static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
206986fd29aSsetje static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
207986fd29aSsetje static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
208986fd29aSsetje     struct flk_callback *, struct cred *, caller_context_t *);
209e38c922eSAndrew Balfour static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
210986fd29aSsetje static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
211986fd29aSsetje     struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
212986fd29aSsetje     struct cred *, caller_context_t *);
213986fd29aSsetje static int dc_putpage(struct vnode *, offset_t, size_t, int,
214986fd29aSsetje     struct cred *, caller_context_t *);
215986fd29aSsetje static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
216986fd29aSsetje     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
217986fd29aSsetje static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
218986fd29aSsetje     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
219986fd29aSsetje static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
220986fd29aSsetje     uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
221986fd29aSsetje 
222986fd29aSsetje struct vnodeops *dc_vnodeops;
223986fd29aSsetje 
224986fd29aSsetje const fs_operation_def_t dc_vnodeops_template[] = {
225986fd29aSsetje 	VOPNAME_OPEN,			{ .vop_open = dc_open },
226986fd29aSsetje 	VOPNAME_CLOSE,			{ .vop_close = dc_close },
227986fd29aSsetje 	VOPNAME_READ,			{ .vop_read = dc_read },
228986fd29aSsetje 	VOPNAME_GETATTR,		{ .vop_getattr =  dc_getattr },
229986fd29aSsetje 	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
230986fd29aSsetje 	VOPNAME_ACCESS,			{ .vop_access = dc_access },
231986fd29aSsetje 	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
232986fd29aSsetje 	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
233986fd29aSsetje 	VOPNAME_FID,			{ .vop_fid = dc_fid },
234986fd29aSsetje 	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
235986fd29aSsetje 	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
236e38c922eSAndrew Balfour 	VOPNAME_REALVP,			{ .vop_realvp = dc_realvp },
237986fd29aSsetje 	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
238986fd29aSsetje 	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
239986fd29aSsetje 	VOPNAME_MAP,			{ .vop_map = dc_map },
240986fd29aSsetje 	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
241986fd29aSsetje 	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
242986fd29aSsetje 	NULL,				NULL
243986fd29aSsetje };
244986fd29aSsetje 
245986fd29aSsetje /*ARGSUSED*/
246986fd29aSsetje static int
dc_open(struct vnode ** vpp,int flag,struct cred * cr,caller_context_t * ctp)247986fd29aSsetje dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
248986fd29aSsetje {
249986fd29aSsetje 	return (0);
250986fd29aSsetje }
251986fd29aSsetje 
252986fd29aSsetje /*ARGSUSED*/
253986fd29aSsetje static int
dc_close(struct vnode * vp,int flag,int count,offset_t off,struct cred * cr,caller_context_t * ctp)254986fd29aSsetje dc_close(struct vnode *vp, int flag, int count, offset_t off,
255986fd29aSsetje     struct cred *cr, caller_context_t *ctp)
256986fd29aSsetje {
257986fd29aSsetje 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
258986fd29aSsetje 	cleanshares(vp, ttoproc(curthread)->p_pid);
259986fd29aSsetje 	return (0);
260986fd29aSsetje }
261986fd29aSsetje 
262986fd29aSsetje /*ARGSUSED*/
263986fd29aSsetje static int
dc_read(struct vnode * vp,struct uio * uiop,int ioflag,struct cred * cr,struct caller_context * ct)264986fd29aSsetje dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
265*ade42b55SSebastien Roy     struct caller_context *ct)
266986fd29aSsetje {
267986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
268986fd29aSsetje 	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
269986fd29aSsetje 	size_t fsize = dp->dc_hdr->ch_fsize;
270986fd29aSsetje 	int error;
271986fd29aSsetje 
272986fd29aSsetje 	/*
273986fd29aSsetje 	 * Loop through file with segmap, decompression will occur
274986fd29aSsetje 	 * in dc_getapage
275986fd29aSsetje 	 */
276986fd29aSsetje 	do {
277986fd29aSsetje 		caddr_t base;
278986fd29aSsetje 		size_t n;
279986fd29aSsetje 		offset_t mapon;
280986fd29aSsetje 
281986fd29aSsetje 		/*
282986fd29aSsetje 		 * read to end of block or file
283986fd29aSsetje 		 */
284986fd29aSsetje 		mapon = uiop->uio_loffset & (rdsize - 1);
285986fd29aSsetje 		n = MIN(rdsize - mapon, uiop->uio_resid);
286986fd29aSsetje 		n = MIN(n, fsize - uiop->uio_loffset);
287986fd29aSsetje 		if (n == 0)
288986fd29aSsetje 			return (0);	/* at EOF */
289986fd29aSsetje 
290986fd29aSsetje 		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
291986fd29aSsetje 		    S_READ);
292986fd29aSsetje 		error = uiomove(base + mapon, n, UIO_READ, uiop);
293986fd29aSsetje 		if (!error) {
294986fd29aSsetje 			uint_t flags;
295986fd29aSsetje 
296986fd29aSsetje 			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
297986fd29aSsetje 				flags = SM_DONTNEED;
298986fd29aSsetje 			else
299986fd29aSsetje 				flags = 0;
300986fd29aSsetje 			error = segmap_release(segkmap, base, flags);
301986fd29aSsetje 		} else
302986fd29aSsetje 			(void) segmap_release(segkmap, base, 0);
303986fd29aSsetje 	} while (!error && uiop->uio_resid);
304986fd29aSsetje 
305986fd29aSsetje 	return (error);
306986fd29aSsetje }
307986fd29aSsetje 
308986fd29aSsetje static int
dc_getattr(struct vnode * vp,struct vattr * vap,int flags,cred_t * cred,caller_context_t * ctp)309986fd29aSsetje dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
310986fd29aSsetje     cred_t *cred, caller_context_t *ctp)
311986fd29aSsetje {
312986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
313986fd29aSsetje 	struct vnode *subvp = dp->dc_subvp;
314986fd29aSsetje 	int error;
315986fd29aSsetje 
316986fd29aSsetje 	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);
317986fd29aSsetje 
318986fd29aSsetje 	/* substitute uncompressed size */
319986fd29aSsetje 	vap->va_size = dp->dc_hdr->ch_fsize;
320986fd29aSsetje 	return (error);
321986fd29aSsetje }
322986fd29aSsetje 
323986fd29aSsetje static int
dc_setattr(struct vnode * vp,struct vattr * vap,int flags,cred_t * cred,caller_context_t * ctp)324986fd29aSsetje dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
325986fd29aSsetje     caller_context_t *ctp)
326986fd29aSsetje {
327986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
328986fd29aSsetje 	struct vnode *subvp = dp->dc_subvp;
329986fd29aSsetje 
330986fd29aSsetje 	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
331986fd29aSsetje }
332986fd29aSsetje 
333986fd29aSsetje static int
dc_access(struct vnode * vp,int mode,int flags,cred_t * cred,caller_context_t * ctp)334986fd29aSsetje dc_access(struct vnode *vp, int mode, int flags,
335986fd29aSsetje     cred_t *cred, caller_context_t *ctp)
336986fd29aSsetje {
337986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
338986fd29aSsetje 	struct vnode *subvp = dp->dc_subvp;
339986fd29aSsetje 
340986fd29aSsetje 	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
341986fd29aSsetje }
342986fd29aSsetje 
343986fd29aSsetje /*ARGSUSED*/
344986fd29aSsetje static int
dc_fsync(vnode_t * vp,int syncflag,cred_t * cred,caller_context_t * ctp)345986fd29aSsetje dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
346986fd29aSsetje {
347986fd29aSsetje 	return (0);
348986fd29aSsetje }
349986fd29aSsetje 
350986fd29aSsetje /*ARGSUSED*/
351986fd29aSsetje static void
dc_inactive(struct vnode * vp,cred_t * cr,caller_context_t * ctp)352986fd29aSsetje dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
353986fd29aSsetje {
354986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
355986fd29aSsetje 
356986fd29aSsetje 	mutex_enter(&dctable_lock);
357986fd29aSsetje 	mutex_enter(&vp->v_lock);
358986fd29aSsetje 	ASSERT(vp->v_count >= 1);
359*ade42b55SSebastien Roy 	VN_RELE_LOCKED(vp);
360*ade42b55SSebastien Roy 	if (vp->v_count != 0) {
361986fd29aSsetje 		/*
362986fd29aSsetje 		 * Somebody accessed the dcnode before we got a chance to
363986fd29aSsetje 		 * remove it.  They will remove it when they do a vn_rele.
364986fd29aSsetje 		 */
365986fd29aSsetje 		mutex_exit(&vp->v_lock);
366986fd29aSsetje 		mutex_exit(&dctable_lock);
367986fd29aSsetje 		return;
368986fd29aSsetje 	}
369986fd29aSsetje 	mutex_exit(&vp->v_lock);
370986fd29aSsetje 
371986fd29aSsetje 	dcnode_free(dp);
372986fd29aSsetje 
373986fd29aSsetje 	mutex_exit(&dctable_lock);
374986fd29aSsetje }
375986fd29aSsetje 
376986fd29aSsetje static int
dc_fid(struct vnode * vp,struct fid * fidp,caller_context_t * ctp)377986fd29aSsetje dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
378986fd29aSsetje {
379986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
380986fd29aSsetje 	struct vnode *subvp = dp->dc_subvp;
381986fd29aSsetje 
382986fd29aSsetje 	return (VOP_FID(subvp, fidp, ctp));
383986fd29aSsetje }
384986fd29aSsetje 
385986fd29aSsetje static int
dc_seek(struct vnode * vp,offset_t oof,offset_t * noffp,caller_context_t * ctp)386986fd29aSsetje dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
387986fd29aSsetje {
388986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
389986fd29aSsetje 	struct vnode *subvp = dp->dc_subvp;
390986fd29aSsetje 
391986fd29aSsetje 	return (VOP_SEEK(subvp, oof, noffp, ctp));
392986fd29aSsetje }
393986fd29aSsetje 
394986fd29aSsetje static int
dc_frlock(struct vnode * vp,int cmd,struct flock64 * bfp,int flag,offset_t offset,struct flk_callback * flk_cbp,cred_t * cr,caller_context_t * ctp)395986fd29aSsetje dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
396986fd29aSsetje     offset_t offset, struct flk_callback *flk_cbp,
397986fd29aSsetje     cred_t *cr, caller_context_t *ctp)
398986fd29aSsetje {
399986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
40053eed1cbSJohn.Zolnowsky@Sun.COM 	int error;
40153eed1cbSJohn.Zolnowsky@Sun.COM 	struct vattr vattr;
402986fd29aSsetje 
403986fd29aSsetje 	/*
404986fd29aSsetje 	 * If file is being mapped, disallow frlock.
405986fd29aSsetje 	 */
40653eed1cbSJohn.Zolnowsky@Sun.COM 	vattr.va_mask = AT_MODE;
40753eed1cbSJohn.Zolnowsky@Sun.COM 	if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp))
40853eed1cbSJohn.Zolnowsky@Sun.COM 		return (error);
40953eed1cbSJohn.Zolnowsky@Sun.COM 	if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
410986fd29aSsetje 		return (EAGAIN);
411986fd29aSsetje 
412986fd29aSsetje 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
413986fd29aSsetje }
414986fd29aSsetje 
415986fd29aSsetje /*ARGSUSED*/
416986fd29aSsetje static int
dc_getblock_miss(struct vnode * vp,offset_t off,size_t len,struct page ** ppp,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cr)417986fd29aSsetje dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
418986fd29aSsetje     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
419986fd29aSsetje {
420986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
421986fd29aSsetje 	struct comphdr *hdr = dp->dc_hdr;
422986fd29aSsetje 	struct page *pp;
423986fd29aSsetje 	struct buf *bp;
424986fd29aSsetje 	caddr_t saddr;
425986fd29aSsetje 	off_t cblkno;
426986fd29aSsetje 	size_t rdoff, rdsize, dsize;
427986fd29aSsetje 	long xlen;
428986fd29aSsetje 	int error, zerr;
429986fd29aSsetje 
430986fd29aSsetje 	ASSERT(len == hdr->ch_blksize);
431986fd29aSsetje 	/*
432986fd29aSsetje 	 * Get destination pages and make them addressable
433986fd29aSsetje 	 */
434986fd29aSsetje 	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
435986fd29aSsetje 	bp = pageio_setup(pp, len, vp, B_READ);
436986fd29aSsetje 	bp_mapin(bp);
437986fd29aSsetje 
438986fd29aSsetje 	/*
439986fd29aSsetje 	 * read compressed data from subordinate vnode
440986fd29aSsetje 	 */
441986fd29aSsetje 	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
442986fd29aSsetje 	cblkno = off / len;
443986fd29aSsetje 	rdoff = hdr->ch_blkmap[cblkno];
444986fd29aSsetje 	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
445986fd29aSsetje 	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
446986fd29aSsetje 	    UIO_SYSSPACE, 0, 0, cr, NULL);
447986fd29aSsetje 	if (error)
448986fd29aSsetje 		goto cleanup;
449986fd29aSsetje 
450986fd29aSsetje 	/*
451986fd29aSsetje 	 * Uncompress
452986fd29aSsetje 	 */
453986fd29aSsetje 	dsize = len;
454986fd29aSsetje 	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
455986fd29aSsetje 	if (zerr != Z_OK) {
456986fd29aSsetje 		error = EIO;
457986fd29aSsetje 		goto cleanup;
458986fd29aSsetje 	}
459986fd29aSsetje 
460986fd29aSsetje 	/*
461986fd29aSsetje 	 * Handle EOF
462986fd29aSsetje 	 */
463986fd29aSsetje 	xlen = hdr->ch_fsize - off;
464986fd29aSsetje 	if (xlen < len) {
465986fd29aSsetje 		bzero(bp->b_un.b_addr + xlen, len - xlen);
466986fd29aSsetje 		if (dsize != xlen)
467986fd29aSsetje 			error = EIO;
468986fd29aSsetje 	} else if (dsize != len)
469986fd29aSsetje 		error = EIO;
470986fd29aSsetje 
471986fd29aSsetje 	/*
472986fd29aSsetje 	 * Clean up
473986fd29aSsetje 	 */
474986fd29aSsetje cleanup:
475986fd29aSsetje 	kmem_cache_free(dp->dc_bufcache, saddr);
476986fd29aSsetje 	pageio_done(bp);
477986fd29aSsetje 	*ppp = pp;
478986fd29aSsetje 	return (error);
479986fd29aSsetje }
480986fd29aSsetje 
481986fd29aSsetje static int
dc_getblock(struct vnode * vp,offset_t off,size_t len,struct page ** ppp,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cr)482986fd29aSsetje dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
483986fd29aSsetje     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
484986fd29aSsetje {
485986fd29aSsetje 	struct page *pp, *plist = NULL;
486986fd29aSsetje 	offset_t pgoff;
487986fd29aSsetje 	int rdblk;
488986fd29aSsetje 
489986fd29aSsetje 	/*
490986fd29aSsetje 	 * pvn_read_kluster() doesn't quite do what we want, since it
491986fd29aSsetje 	 * thinks sub block reads are ok.  Here we always decompress
492986fd29aSsetje 	 * a full block.
493986fd29aSsetje 	 */
494986fd29aSsetje 
495986fd29aSsetje 	/*
496986fd29aSsetje 	 * Check page cache
497986fd29aSsetje 	 */
498986fd29aSsetje 	rdblk = 0;
499986fd29aSsetje 	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
500986fd29aSsetje 		pp = page_lookup(vp, pgoff, SE_EXCL);
501986fd29aSsetje 		if (pp == NULL) {
502986fd29aSsetje 			rdblk = 1;
503986fd29aSsetje 			break;
504986fd29aSsetje 		}
505986fd29aSsetje 		page_io_lock(pp);
506986fd29aSsetje 		page_add(&plist, pp);
507986fd29aSsetje 		plist = plist->p_next;
508986fd29aSsetje 	}
509986fd29aSsetje 	if (!rdblk) {
510986fd29aSsetje 		*ppp = plist;
511986fd29aSsetje 		return (0);	/* all pages in cache */
512986fd29aSsetje 	}
513986fd29aSsetje 
514986fd29aSsetje 	/*
515986fd29aSsetje 	 * Undo any locks so getblock_miss has an open field
516986fd29aSsetje 	 */
517986fd29aSsetje 	if (plist != NULL)
518986fd29aSsetje 		pvn_io_done(plist);
519986fd29aSsetje 
520986fd29aSsetje 	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
521986fd29aSsetje }
522986fd29aSsetje 
523e38c922eSAndrew Balfour static int
dc_realvp(vnode_t * vp,vnode_t ** vpp,caller_context_t * ct)524e38c922eSAndrew Balfour dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
525e38c922eSAndrew Balfour {
526e38c922eSAndrew Balfour 	struct vnode *rvp;
527e38c922eSAndrew Balfour 
528e38c922eSAndrew Balfour 	vp = VTODC(vp)->dc_subvp;
529e38c922eSAndrew Balfour 	if (VOP_REALVP(vp, &rvp, ct) == 0)
530e38c922eSAndrew Balfour 		vp = rvp;
531e38c922eSAndrew Balfour 	*vpp = vp;
532e38c922eSAndrew Balfour 	return (0);
533e38c922eSAndrew Balfour }
534e38c922eSAndrew Balfour 
535986fd29aSsetje /*ARGSUSED10*/
536986fd29aSsetje static int
dc_getpage(struct vnode * vp,offset_t off,size_t len,uint_t * protp,struct page * pl[],size_t plsz,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cr,caller_context_t * ctp)537986fd29aSsetje dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
538986fd29aSsetje     struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
539986fd29aSsetje     enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
540986fd29aSsetje {
541986fd29aSsetje 	struct dcnode *dp = VTODC(vp);
542986fd29aSsetje 	struct comphdr *hdr = dp->dc_hdr;
543986fd29aSsetje 	struct page *pp, *plist = NULL;
544986fd29aSsetje 	caddr_t vp_baddr;
545986fd29aSsetje 	offset_t vp_boff, vp_bend;
546986fd29aSsetje 	size_t bsize = hdr->ch_blksize;
547986fd29aSsetje 	int nblks, error;
548986fd29aSsetje 
549986fd29aSsetje 	/* does not support write */
550986fd29aSsetje 	if (rw == S_WRITE) {
551986fd29aSsetje 		panic("write attempt on compressed file");
552986fd29aSsetje 		/*NOTREACHED*/
553986fd29aSsetje 	}
554986fd29aSsetje 
555986fd29aSsetje 	if (protp)
556986fd29aSsetje 		*protp = PROT_ALL;
557986fd29aSsetje 	/*
558986fd29aSsetje 	 * We don't support asynchronous operation at the moment, so
559986fd29aSsetje 	 * just pretend we did it.  If the pages are ever actually
560986fd29aSsetje 	 * needed, they'll get brought in then.
561986fd29aSsetje 	 */
562986fd29aSsetje 	if (pl == NULL)
563986fd29aSsetje 		return (0);
564986fd29aSsetje 
565986fd29aSsetje 	/*
566986fd29aSsetje 	 * Calc block start and end offsets
567986fd29aSsetje 	 */
568986fd29aSsetje 	vp_boff = rounddown(off, bsize);
569986fd29aSsetje 	vp_bend = roundup(off + len, bsize);
570986fd29aSsetje 	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
571986fd29aSsetje 
572986fd29aSsetje 	nblks = (vp_bend - vp_boff) / bsize;
573986fd29aSsetje 	while (nblks--) {
574986fd29aSsetje 		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
575986fd29aSsetje 		    rw, cr);
576986fd29aSsetje 		page_list_concat(&plist, &pp);
577986fd29aSsetje 		vp_boff += bsize;
578986fd29aSsetje 		vp_baddr += bsize;
579986fd29aSsetje 	}
580986fd29aSsetje 	if (!error)
581986fd29aSsetje 		pvn_plist_init(plist, pl, plsz, off, len, rw);
582986fd29aSsetje 	else
583986fd29aSsetje 		pvn_read_done(plist, B_ERROR);
584986fd29aSsetje 	return (error);
585986fd29aSsetje }
586986fd29aSsetje 
587986fd29aSsetje /*
588986fd29aSsetje  * This function should never be called. We need to have it to pass
589986fd29aSsetje  * it as an argument to other functions.
590986fd29aSsetje  */
591986fd29aSsetje /*ARGSUSED*/
592986fd29aSsetje static int
dc_putapage(struct vnode * vp,struct page * pp,u_offset_t * offp,size_t * lenp,int flags,struct cred * cr)593986fd29aSsetje dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
594986fd29aSsetje     int flags, struct cred *cr)
595986fd29aSsetje {
596986fd29aSsetje 	/* should never happen */
597986fd29aSsetje 	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
598986fd29aSsetje 	/*NOTREACHED*/
599986fd29aSsetje 	return (0);
600986fd29aSsetje }
601986fd29aSsetje 
602986fd29aSsetje 
603986fd29aSsetje /*
604986fd29aSsetje  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
605986fd29aSsetje  * B_INVAL is set by:
606986fd29aSsetje  *
607986fd29aSsetje  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
608986fd29aSsetje  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
609986fd29aSsetje  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
610986fd29aSsetje  *
611986fd29aSsetje  * The B_FREE (as well as the B_DONTNEED) flag is set when the
612986fd29aSsetje  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
613986fd29aSsetje  * from SEGVN to release pages behind a pagefault.
614986fd29aSsetje  */
615986fd29aSsetje /*ARGSUSED5*/
616986fd29aSsetje static int
dc_putpage(struct vnode * vp,offset_t off,size_t len,int flags,struct cred * cr,caller_context_t * ctp)617986fd29aSsetje dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
618986fd29aSsetje     struct cred *cr, caller_context_t *ctp)
619986fd29aSsetje {
620986fd29aSsetje 	int error = 0;
621986fd29aSsetje 
622986fd29aSsetje 	if (vp->v_count == 0) {
623986fd29aSsetje 		panic("dcfs_putpage: bad v_count");
624986fd29aSsetje 		/*NOTREACHED*/
625986fd29aSsetje 	}
626986fd29aSsetje 
627986fd29aSsetje 	if (vp->v_flag & VNOMAP)
628986fd29aSsetje 		return (ENOSYS);
629986fd29aSsetje 
630986fd29aSsetje 	if (!vn_has_cached_data(vp))	/* no pages mapped */
631986fd29aSsetje 		return (0);
632986fd29aSsetje 
633986fd29aSsetje 	if (len == 0)		/* from 'off' to EOF */
634986fd29aSsetje 		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
635986fd29aSsetje 	else {
636986fd29aSsetje 		offset_t io_off;
637986fd29aSsetje 		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
638986fd29aSsetje 
639986fd29aSsetje 		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
640986fd29aSsetje 			page_t *pp;
641986fd29aSsetje 
642986fd29aSsetje 			/*
643986fd29aSsetje 			 * We insist on getting the page only if we are
644986fd29aSsetje 			 * about to invalidate, free or write it and
645986fd29aSsetje 			 * the B_ASYNC flag is not set.
646986fd29aSsetje 			 */
647986fd29aSsetje 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
648986fd29aSsetje 				pp = page_lookup(vp, io_off, se);
649986fd29aSsetje 			else
650986fd29aSsetje 				pp = page_lookup_nowait(vp, io_off, se);
651986fd29aSsetje 
652986fd29aSsetje 			if (pp == NULL)
653986fd29aSsetje 				continue;
654986fd29aSsetje 			/*
655986fd29aSsetje 			 * Normally pvn_getdirty() should return 0, which
656986fd29aSsetje 			 * impies that it has done the job for us.
657986fd29aSsetje 			 * The shouldn't-happen scenario is when it returns 1.
658986fd29aSsetje 			 * This means that the page has been modified and
659986fd29aSsetje 			 * needs to be put back.
660986fd29aSsetje 			 * Since we can't write to a dcfs compressed file,
661986fd29aSsetje 			 * we fake a failed I/O and force pvn_write_done()
662986fd29aSsetje 			 * to destroy the page.
663986fd29aSsetje 			 */
664986fd29aSsetje 			if (pvn_getdirty(pp, flags) == 1) {
665986fd29aSsetje 				cmn_err(CE_NOTE, "dc_putpage: dirty page");
666986fd29aSsetje 				pvn_write_done(pp, flags |
667986fd29aSsetje 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
668986fd29aSsetje 			}
669986fd29aSsetje 		}
670986fd29aSsetje 	}
671986fd29aSsetje 	return (error);
672986fd29aSsetje }
673986fd29aSsetje 
674986fd29aSsetje static int
dc_map(struct vnode * vp,offset_t off,struct as * as,caddr_t * addrp,size_t len,uchar_t prot,uchar_t maxprot,uint_t flags,struct cred * cred,caller_context_t * ctp)675986fd29aSsetje dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
676986fd29aSsetje     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
677986fd29aSsetje     struct cred *cred, caller_context_t *ctp)
678986fd29aSsetje {
679986fd29aSsetje 	struct vattr vattr;
680986fd29aSsetje 	struct segvn_crargs vn_a;
681986fd29aSsetje 	int error;
682986fd29aSsetje 
683986fd29aSsetje 	if (vp->v_flag & VNOMAP)
684986fd29aSsetje 		return (ENOSYS);
685986fd29aSsetje 
686986fd29aSsetje 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
687986fd29aSsetje 		return (ENXIO);
688986fd29aSsetje 
689986fd29aSsetje 	/*
690986fd29aSsetje 	 * If file is being locked, disallow mapping.
691986fd29aSsetje 	 */
692986fd29aSsetje 	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
693986fd29aSsetje 		return (error);
694986fd29aSsetje 	if (vn_has_mandatory_locks(vp, vattr.va_mode))
695986fd29aSsetje 		return (EAGAIN);
696986fd29aSsetje 
697986fd29aSsetje 	as_rangelock(as);
698986fd29aSsetje 
699986fd29aSsetje 	if ((flags & MAP_FIXED) == 0) {
700986fd29aSsetje 		map_addr(addrp, len, off, 1, flags);
701986fd29aSsetje 		if (*addrp == NULL) {
702986fd29aSsetje 			as_rangeunlock(as);
703986fd29aSsetje 			return (ENOMEM);
704986fd29aSsetje 		}
705986fd29aSsetje 	} else {
706986fd29aSsetje 		/*
707986fd29aSsetje 		 * User specified address - blow away any previous mappings
708986fd29aSsetje 		 */
709986fd29aSsetje 		(void) as_unmap(as, *addrp, len);
710986fd29aSsetje 	}
711986fd29aSsetje 
712986fd29aSsetje 	vn_a.vp = vp;
713986fd29aSsetje 	vn_a.offset = off;
714986fd29aSsetje 	vn_a.type = flags & MAP_TYPE;
715986fd29aSsetje 	vn_a.prot = prot;
716986fd29aSsetje 	vn_a.maxprot = maxprot;
717986fd29aSsetje 	vn_a.flags = flags & ~MAP_TYPE;
718986fd29aSsetje 	vn_a.cred = cred;
719986fd29aSsetje 	vn_a.amp = NULL;
720986fd29aSsetje 	vn_a.szc = 0;
721986fd29aSsetje 	vn_a.lgrp_mem_policy_flags = 0;
722986fd29aSsetje 
723986fd29aSsetje 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
724986fd29aSsetje 	as_rangeunlock(as);
725986fd29aSsetje 	return (error);
726986fd29aSsetje }
727986fd29aSsetje 
728986fd29aSsetje /*ARGSUSED*/
729986fd29aSsetje static int
dc_addmap(struct vnode * vp,offset_t off,struct as * as,caddr_t addr,size_t len,uchar_t prot,uchar_t maxprot,uint_t flags,struct cred * cr,caller_context_t * ctp)730986fd29aSsetje dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
731986fd29aSsetje     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
732986fd29aSsetje     struct cred *cr, caller_context_t *ctp)
733986fd29aSsetje {
734986fd29aSsetje 	struct dcnode *dp;
735986fd29aSsetje 
736986fd29aSsetje 	if (vp->v_flag & VNOMAP)
737986fd29aSsetje 		return (ENOSYS);
738986fd29aSsetje 
739986fd29aSsetje 	dp = VTODC(vp);
740986fd29aSsetje 	mutex_enter(&dp->dc_lock);
741986fd29aSsetje 	dp->dc_mapcnt += btopr(len);
742986fd29aSsetje 	mutex_exit(&dp->dc_lock);
743986fd29aSsetje 	return (0);
744986fd29aSsetje }
745986fd29aSsetje 
746986fd29aSsetje /*ARGSUSED*/
747986fd29aSsetje static int
dc_delmap(struct vnode * vp,offset_t off,struct as * as,caddr_t addr,size_t len,uint_t prot,uint_t maxprot,uint_t flags,struct cred * cr,caller_context_t * ctp)748986fd29aSsetje dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
749986fd29aSsetje     size_t len, uint_t prot, uint_t maxprot, uint_t flags,
750986fd29aSsetje     struct cred *cr, caller_context_t *ctp)
751986fd29aSsetje {
752986fd29aSsetje 	struct dcnode *dp;
753986fd29aSsetje 
754986fd29aSsetje 	if (vp->v_flag & VNOMAP)
755986fd29aSsetje 		return (ENOSYS);
756986fd29aSsetje 
757986fd29aSsetje 	dp = VTODC(vp);
758986fd29aSsetje 	mutex_enter(&dp->dc_lock);
759986fd29aSsetje 	dp->dc_mapcnt -= btopr(len);
760986fd29aSsetje 	ASSERT(dp->dc_mapcnt >= 0);
761986fd29aSsetje 	mutex_exit(&dp->dc_lock);
762986fd29aSsetje 	return (0);
763986fd29aSsetje }
764986fd29aSsetje 
765986fd29aSsetje /*
766986fd29aSsetje  * Constructor/destructor routines for dcnodes
767986fd29aSsetje  */
768986fd29aSsetje /*ARGSUSED1*/
769986fd29aSsetje static int
dcnode_constructor(void * buf,void * cdrarg,int kmflags)770986fd29aSsetje dcnode_constructor(void *buf, void *cdrarg, int kmflags)
771986fd29aSsetje {
772986fd29aSsetje 	struct dcnode *dp = buf;
773986fd29aSsetje 	struct vnode *vp;
774986fd29aSsetje 
7758bd3a292Stomee 	vp = dp->dc_vp = vn_alloc(kmflags);
7768bd3a292Stomee 	if (vp == NULL) {
7778bd3a292Stomee 		return (-1);
7788bd3a292Stomee 	}
7798bd3a292Stomee 	vp->v_data = dp;
780986fd29aSsetje 	vp->v_type = VREG;
781986fd29aSsetje 	vp->v_flag = VNOSWAP;
782986fd29aSsetje 	vp->v_vfsp = &dc_vfs;
783986fd29aSsetje 	vn_setops(vp, dc_vnodeops);
784986fd29aSsetje 	vn_exists(vp);
785986fd29aSsetje 
786986fd29aSsetje 	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
787986fd29aSsetje 	dp->dc_mapcnt = 0;
788986fd29aSsetje 	dp->dc_lrunext = dp->dc_lruprev = NULL;
7898bd3a292Stomee 	dp->dc_hdr = NULL;
7908bd3a292Stomee 	dp->dc_subvp = NULL;
791986fd29aSsetje 	return (0);
792986fd29aSsetje }
793986fd29aSsetje 
794986fd29aSsetje /*ARGSUSED*/
795986fd29aSsetje static void
dcnode_destructor(void * buf,void * cdrarg)796986fd29aSsetje dcnode_destructor(void *buf, void *cdrarg)
797986fd29aSsetje {
798986fd29aSsetje 	struct dcnode *dp = buf;
799986fd29aSsetje 	struct vnode *vp = DCTOV(dp);
800986fd29aSsetje 
801986fd29aSsetje 	mutex_destroy(&dp->dc_lock);
802986fd29aSsetje 
803986fd29aSsetje 	VERIFY(dp->dc_hdr == NULL);
804986fd29aSsetje 	VERIFY(dp->dc_subvp == NULL);
805986fd29aSsetje 	vn_invalid(vp);
806986fd29aSsetje 	vn_free(vp);
807986fd29aSsetje }
808986fd29aSsetje 
809986fd29aSsetje static struct dcnode *
dcnode_alloc(void)810986fd29aSsetje dcnode_alloc(void)
811986fd29aSsetje {
812986fd29aSsetje 	struct dcnode *dp;
813986fd29aSsetje 
814986fd29aSsetje 	/*
815986fd29aSsetje 	 * If the free list is above DCLRUSIZE
816986fd29aSsetje 	 * re-use one from it
817986fd29aSsetje 	 */
818986fd29aSsetje 	mutex_enter(&dctable_lock);
819986fd29aSsetje 	if (dclru_len < DCLRUSIZE) {
820986fd29aSsetje 		mutex_exit(&dctable_lock);
821986fd29aSsetje 		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
822986fd29aSsetje 	} else {
823986fd29aSsetje 		ASSERT(dclru != NULL);
824986fd29aSsetje 		dp = dclru;
825986fd29aSsetje 		dclru_sub(dp);
826986fd29aSsetje 		dcdelete(dp);
827986fd29aSsetje 		mutex_exit(&dctable_lock);
828986fd29aSsetje 		dcnode_recycle(dp);
829986fd29aSsetje 	}
830986fd29aSsetje 	return (dp);
831986fd29aSsetje }
832986fd29aSsetje 
833986fd29aSsetje static void
dcnode_free(struct dcnode * dp)834986fd29aSsetje dcnode_free(struct dcnode *dp)
835986fd29aSsetje {
836986fd29aSsetje 	struct vnode *vp = DCTOV(dp);
837986fd29aSsetje 
838986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
839986fd29aSsetje 
840986fd29aSsetje 	/*
841986fd29aSsetje 	 * If no cached pages, no need to put it on lru
842986fd29aSsetje 	 */
843986fd29aSsetje 	if (!vn_has_cached_data(vp)) {
844986fd29aSsetje 		dcdelete(dp);
845986fd29aSsetje 		dcnode_recycle(dp);
846986fd29aSsetje 		kmem_cache_free(dcnode_cache, dp);
847986fd29aSsetje 		return;
848986fd29aSsetje 	}
849986fd29aSsetje 
850986fd29aSsetje 	/*
851986fd29aSsetje 	 * Add to lru, if it's over the limit, free from head
852986fd29aSsetje 	 */
853986fd29aSsetje 	dclru_add(dp);
854986fd29aSsetje 	if (dclru_len > DCLRUSIZE) {
855986fd29aSsetje 		dp = dclru;
856986fd29aSsetje 		dclru_sub(dp);
857986fd29aSsetje 		dcdelete(dp);
858986fd29aSsetje 		dcnode_recycle(dp);
859986fd29aSsetje 		kmem_cache_free(dcnode_cache, dp);
860986fd29aSsetje 	}
861986fd29aSsetje }
862986fd29aSsetje 
863986fd29aSsetje static void
dcnode_recycle(struct dcnode * dp)864986fd29aSsetje dcnode_recycle(struct dcnode *dp)
865986fd29aSsetje {
866986fd29aSsetje 	struct vnode *vp;
867986fd29aSsetje 
868986fd29aSsetje 	vp = DCTOV(dp);
869986fd29aSsetje 
870986fd29aSsetje 	VN_RELE(dp->dc_subvp);
871986fd29aSsetje 	dp->dc_subvp = NULL;
872986fd29aSsetje 	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
873986fd29aSsetje 	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
874986fd29aSsetje 	dp->dc_hdr = NULL;
875986fd29aSsetje 	dp->dc_hdrsize = dp->dc_zmax = 0;
876986fd29aSsetje 	dp->dc_bufcache = NULL;
877986fd29aSsetje 	dp->dc_mapcnt = 0;
878986fd29aSsetje 	vn_reinit(vp);
879986fd29aSsetje 	vp->v_type = VREG;
880986fd29aSsetje 	vp->v_flag = VNOSWAP;
881986fd29aSsetje 	vp->v_vfsp = &dc_vfs;
882986fd29aSsetje }
883986fd29aSsetje 
884986fd29aSsetje static int
dcinit(int fstype,char * name)885986fd29aSsetje dcinit(int fstype, char *name)
886986fd29aSsetje {
887986fd29aSsetje 	static const fs_operation_def_t dc_vfsops_template[] = {
888986fd29aSsetje 		NULL, NULL
889986fd29aSsetje 	};
890986fd29aSsetje 	int error;
891986fd29aSsetje 	major_t dev;
892986fd29aSsetje 
893986fd29aSsetje 	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
894986fd29aSsetje 	if (error) {
895986fd29aSsetje 		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
896986fd29aSsetje 		return (error);
897986fd29aSsetje 	}
898986fd29aSsetje 	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
899986fd29aSsetje 	dc_vfs.vfs_flag = VFS_RDONLY;
900986fd29aSsetje 	dc_vfs.vfs_fstype = fstype;
901986fd29aSsetje 	if ((dev = getudev()) == (major_t)-1)
902986fd29aSsetje 		dev = 0;
903986fd29aSsetje 	dcdev = makedevice(dev, 0);
904986fd29aSsetje 	dc_vfs.vfs_dev = dcdev;
905986fd29aSsetje 
906986fd29aSsetje 	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
907986fd29aSsetje 	if (error != 0) {
908986fd29aSsetje 		(void) vfs_freevfsops_by_type(fstype);
909986fd29aSsetje 		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
910986fd29aSsetje 		return (error);
911986fd29aSsetje 	}
912986fd29aSsetje 
913986fd29aSsetje 	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
914986fd29aSsetje 	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
915986fd29aSsetje 	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
916986fd29aSsetje 	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);
917986fd29aSsetje 
918986fd29aSsetje 	return (0);
919986fd29aSsetje }
920986fd29aSsetje 
921986fd29aSsetje /*
922986fd29aSsetje  * Return shadow vnode with the given vp as its subordinate
923986fd29aSsetje  */
924986fd29aSsetje struct vnode *
decompvp(struct vnode * vp,cred_t * cred,caller_context_t * ctp)925986fd29aSsetje decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
926986fd29aSsetje {
927986fd29aSsetje 	struct dcnode *dp, *ndp;
928986fd29aSsetje 	struct comphdr thdr, *hdr;
929986fd29aSsetje 	struct kmem_cache **cpp;
930986fd29aSsetje 	struct vattr vattr;
931986fd29aSsetje 	size_t hdrsize, bsize;
932986fd29aSsetje 	int error;
933986fd29aSsetje 
934986fd29aSsetje 	/*
935986fd29aSsetje 	 * See if we have an existing shadow
936986fd29aSsetje 	 * If none, we have to manufacture one
937986fd29aSsetje 	 */
938986fd29aSsetje 	mutex_enter(&dctable_lock);
939986fd29aSsetje 	dp = dcfind(vp);
940986fd29aSsetje 	mutex_exit(&dctable_lock);
941986fd29aSsetje 	if (dp != NULL)
942986fd29aSsetje 		return (DCTOV(dp));
943986fd29aSsetje 
944986fd29aSsetje 	/*
945986fd29aSsetje 	 * Make sure it's a valid compressed file
946986fd29aSsetje 	 */
947986fd29aSsetje 	hdr = &thdr;
948986fd29aSsetje 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
949986fd29aSsetje 	    UIO_SYSSPACE, 0, 0, cred, NULL);
9501d7f3fadSKrishnendu Sadhukhan - Sun Microsystems 	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
951986fd29aSsetje 	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
952986fd29aSsetje 	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
953de710d24SJosef 'Jeff' Sipek 	    hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
954986fd29aSsetje 		return (NULL);
955986fd29aSsetje 
956986fd29aSsetje 	/* get underlying file size */
957986fd29aSsetje 	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
958986fd29aSsetje 		return (NULL);
959986fd29aSsetje 
960986fd29aSsetje 	/*
961986fd29aSsetje 	 * Re-read entire header
962986fd29aSsetje 	 */
963986fd29aSsetje 	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
964986fd29aSsetje 	hdr = kmem_alloc(hdrsize, KM_SLEEP);
965986fd29aSsetje 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
966986fd29aSsetje 	    0, 0, cred, NULL);
967986fd29aSsetje 	if (error) {
968986fd29aSsetje 		kmem_free(hdr, hdrsize);
969986fd29aSsetje 		return (NULL);
970986fd29aSsetje 	}
971986fd29aSsetje 
972986fd29aSsetje 	/*
973986fd29aSsetje 	 * add extra blkmap entry to make dc_getblock()'s
974986fd29aSsetje 	 * life easier
975986fd29aSsetje 	 */
976986fd29aSsetje 	bsize = hdr->ch_blksize;
977986fd29aSsetje 	hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;
978986fd29aSsetje 
979986fd29aSsetje 	ndp = dcnode_alloc();
980986fd29aSsetje 	ndp->dc_subvp = vp;
981986fd29aSsetje 	VN_HOLD(vp);
982986fd29aSsetje 	ndp->dc_hdr = hdr;
983986fd29aSsetje 	ndp->dc_hdrsize = hdrsize;
984986fd29aSsetje 
985986fd29aSsetje 	/*
986986fd29aSsetje 	 * Allocate kmem cache if none there already
987986fd29aSsetje 	 */
988986fd29aSsetje 	ndp->dc_zmax = ZMAXBUF(bsize);
989986fd29aSsetje 	cpp = &dcbuf_cache[btop(bsize)];
990986fd29aSsetje 	mutex_enter(&dccache_lock);
991986fd29aSsetje 	if (*cpp == NULL)
992986fd29aSsetje 		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
993986fd29aSsetje 		    NULL, NULL, NULL, NULL, 0);
994986fd29aSsetje 	mutex_exit(&dccache_lock);
995986fd29aSsetje 	ndp->dc_bufcache = *cpp;
996986fd29aSsetje 
997986fd29aSsetje 	/*
998986fd29aSsetje 	 * Recheck table in case someone else created shadow
999986fd29aSsetje 	 * while we were blocked above.
1000986fd29aSsetje 	 */
1001986fd29aSsetje 	mutex_enter(&dctable_lock);
1002986fd29aSsetje 	dp = dcfind(vp);
1003986fd29aSsetje 	if (dp != NULL) {
1004986fd29aSsetje 		mutex_exit(&dctable_lock);
1005986fd29aSsetje 		dcnode_recycle(ndp);
1006986fd29aSsetje 		kmem_cache_free(dcnode_cache, ndp);
1007986fd29aSsetje 		return (DCTOV(dp));
1008986fd29aSsetje 	}
1009986fd29aSsetje 	dcinsert(ndp);
1010986fd29aSsetje 	mutex_exit(&dctable_lock);
1011986fd29aSsetje 
1012986fd29aSsetje 	return (DCTOV(ndp));
1013986fd29aSsetje }
1014986fd29aSsetje 
1015986fd29aSsetje 
1016986fd29aSsetje /*
1017986fd29aSsetje  * dcnode lookup table
1018986fd29aSsetje  * These routines maintain a table of dcnodes hashed by their
1019986fd29aSsetje  * subordinate vnode so that they can be found if they already
1020986fd29aSsetje  * exist in the vnode cache
1021986fd29aSsetje  */
1022986fd29aSsetje 
1023986fd29aSsetje /*
1024986fd29aSsetje  * Put a dcnode in the table.
1025986fd29aSsetje  */
1026986fd29aSsetje static void
dcinsert(struct dcnode * newdp)1027986fd29aSsetje dcinsert(struct dcnode *newdp)
1028986fd29aSsetje {
1029986fd29aSsetje 	int idx = DCHASH(newdp->dc_subvp);
1030986fd29aSsetje 
1031986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
1032986fd29aSsetje 	newdp->dc_hash = dctable[idx];
1033986fd29aSsetje 	dctable[idx] = newdp;
1034986fd29aSsetje }
1035986fd29aSsetje 
1036986fd29aSsetje /*
1037986fd29aSsetje  * Remove a dcnode from the hash table.
1038986fd29aSsetje  */
1039986fd29aSsetje void
dcdelete(struct dcnode * deldp)1040986fd29aSsetje dcdelete(struct dcnode *deldp)
1041986fd29aSsetje {
1042986fd29aSsetje 	int idx = DCHASH(deldp->dc_subvp);
1043986fd29aSsetje 	struct dcnode *dp, *prevdp;
1044986fd29aSsetje 
1045986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
1046986fd29aSsetje 	dp = dctable[idx];
1047986fd29aSsetje 	if (dp == deldp)
1048986fd29aSsetje 		dctable[idx] = dp->dc_hash;
1049986fd29aSsetje 	else {
1050986fd29aSsetje 		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
1051986fd29aSsetje 		    prevdp = dp, dp = dp->dc_hash) {
1052986fd29aSsetje 			if (dp == deldp) {
1053986fd29aSsetje 				prevdp->dc_hash = dp->dc_hash;
1054986fd29aSsetje 				break;
1055986fd29aSsetje 			}
1056986fd29aSsetje 		}
1057986fd29aSsetje 	}
1058986fd29aSsetje 	ASSERT(dp != NULL);
1059986fd29aSsetje }
1060986fd29aSsetje 
1061986fd29aSsetje /*
1062986fd29aSsetje  * Find a shadow vnode in the dctable hash list.
1063986fd29aSsetje  */
1064986fd29aSsetje static struct dcnode *
dcfind(struct vnode * vp)1065986fd29aSsetje dcfind(struct vnode *vp)
1066986fd29aSsetje {
1067986fd29aSsetje 	struct dcnode *dp;
1068986fd29aSsetje 
1069986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
1070986fd29aSsetje 	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
1071986fd29aSsetje 		if (dp->dc_subvp == vp) {
1072986fd29aSsetje 			VN_HOLD(DCTOV(dp));
1073986fd29aSsetje 			if (dp->dc_lrunext)
1074986fd29aSsetje 				dclru_sub(dp);
1075986fd29aSsetje 			return (dp);
1076986fd29aSsetje 		}
1077986fd29aSsetje 	return (NULL);
1078986fd29aSsetje }
1079986fd29aSsetje 
#ifdef	DEBUG
/*
 * Count the entries on the circular lru list by walking it once from
 * dclru back around to dclru.  Used to cross-check dclru_len.
 */
static int
dclru_count(void)
{
	struct dcnode *dp = dclru;
	int count = 0;

	if (dp == NULL)
		return (0);

	do {
		count++;
		dp = dp->dc_lrunext;
	} while (dp != dclru);

	return (count);
}
#endif
1094986fd29aSsetje 
1095986fd29aSsetje static void
dclru_add(struct dcnode * dp)1096986fd29aSsetje dclru_add(struct dcnode *dp)
1097986fd29aSsetje {
1098986fd29aSsetje 	/*
1099986fd29aSsetje 	 * Add to dclru as double-link chain
1100986fd29aSsetje 	 */
1101986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
1102986fd29aSsetje 	if (dclru == NULL) {
1103986fd29aSsetje 		dclru = dp;
1104986fd29aSsetje 		dp->dc_lruprev = dp->dc_lrunext = dp;
1105986fd29aSsetje 	} else {
1106986fd29aSsetje 		struct dcnode *last = dclru->dc_lruprev;
1107986fd29aSsetje 
1108986fd29aSsetje 		dclru->dc_lruprev = dp;
1109986fd29aSsetje 		last->dc_lrunext = dp;
1110986fd29aSsetje 		dp->dc_lruprev = last;
1111986fd29aSsetje 		dp->dc_lrunext = dclru;
1112986fd29aSsetje 	}
1113986fd29aSsetje 	dclru_len++;
1114986fd29aSsetje 	ASSERT(dclru_len == dclru_count());
1115986fd29aSsetje }
1116986fd29aSsetje 
1117986fd29aSsetje static void
dclru_sub(struct dcnode * dp)1118986fd29aSsetje dclru_sub(struct dcnode *dp)
1119986fd29aSsetje {
1120986fd29aSsetje 	ASSERT(MUTEX_HELD(&dctable_lock));
1121986fd29aSsetje 	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
1122986fd29aSsetje 	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
1123986fd29aSsetje 	if (dp == dclru)
1124986fd29aSsetje 		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
1125986fd29aSsetje 	dp->dc_lrunext = dp->dc_lruprev = NULL;
1126986fd29aSsetje 	dclru_len--;
1127986fd29aSsetje 	ASSERT(dclru_len == dclru_count());
1128986fd29aSsetje }
1129