/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host filesystem.
 * This is a pseudo filesystem in that it does not persist data; rather it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup() checks for this flag and, if it is set, passes
 * control to decompvp(), a function defined in this (dcfs) filesystem.
 * decompvp() uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. The compressed files currently cannot be
 * written to.
 */
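
/*
 * Sketch of the compressed file header this code relies on.  The
 * authoritative definition is struct comphdr in <sys/fs/decomp.h>; the
 * summary below is inferred from how the fields are used in this file:
 *
 *	ch_magic	must be CH_MAGIC_ZLIB
 *	ch_version	must be CH_VERSION
 *	ch_algorithm	must be CH_ALG_ZLIB
 *	ch_fsize	uncompressed file size
 *	ch_blksize	uncompressed block size; a power of two no smaller
 *			than PAGESIZE and no larger than ptob(DCCACHESIZE)
 *	ch_blkmap[]	offsets of the compressed blocks in the host file;
 *			ch_blkmap[0] also delimits the end of the header
 *
 * decompvp() appends one extra ch_blkmap[] entry (the compressed file
 * size) so that block i's compressed length is always
 * ch_blkmap[i + 1] - ch_blkmap[i].
 */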


/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))
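
/*
 * Note: rounddown() assumes y is a power of two, e.g.
 * rounddown(0x2300, 0x1000) == 0x2000.  DCHASH() drops the low DCSHFT
 * bits of the vnode address (which are mostly alignment bits) before
 * reducing it modulo DCTABLESIZE.
 */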

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

struct kmem_cache *dcnode_cache;
/*
 * Decompression buffer caches, indexed by btop(ch_blksize).  Since
 * ch_blksize may be as large as ptob(DCCACHESIZE), the index can reach
 * DCCACHESIZE, so the array needs one extra slot.
 */
struct kmem_cache *dcbuf_cache[DCCACHESIZE + 1];

kmutex_t	dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	VSW_ZMOUNT,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,			{ .vop_open = dc_open },
	VOPNAME_CLOSE,			{ .vop_close = dc_close },
	VOPNAME_READ,			{ .vop_read = dc_read },
	VOPNAME_GETATTR,		{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,			{ .vop_access = dc_access },
	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
	VOPNAME_FID,			{ .vop_fid = dc_fid },
	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
	VOPNAME_REALVP,			{ .vop_realvp = dc_realvp },
	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,			{ .vop_map = dc_map },
	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
	NULL,				NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
	struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through the file with segmap; decompression happens
	 * in dc_getpage() when segmap faults the pages in
	 */
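	/*
	 * Worked example (illustrative numbers): with rdsize 8192,
	 * uio_loffset 9000 and uio_resid 10000 on a 20000-byte file,
	 * mapon = 9000 & 8191 = 808 and n = MIN(8192 - 808, 10000,
	 * 20000 - 9000) = 7384, so this pass stops at the 16K block
	 * boundary and the next iteration continues from there.
	 */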
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	if (--vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it.  They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	int error;
	struct vattr vattr;

	/*
	 * If file is being mapped, disallow frlock.
	 */
	vattr.va_mask = AT_MODE;
	if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp))
		return (error);
	if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
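	/*
	 * ch_blkmap[i] holds the offset of compressed block i in the
	 * subordinate file, and decompvp() appended a final entry equal
	 * to the compressed file size, so the [cblkno + 1] access below
	 * is safe even for the last block.  E.g. (illustrative numbers):
	 * with an 8K block size, off 16384 is compressed block 2, stored
	 * in the byte range [ch_blkmap[2], ch_blkmap[3]) of that file.
	 */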
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok.  Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

static int
dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
	struct vnode *rvp;

	vp = VTODC(vp)->dc_subvp;
	if (VOP_REALVP(vp, &rvp, ct) == 0)
		vp = rvp;
	*vpp = vp;
	return (0);
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		if (error)
			break;	/* don't let a later block mask the error */
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}

/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
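/*
 * Note that each cached dcnode keeps its vnode allocated across
 * kmem_cache_alloc()/kmem_cache_free() cycles: the constructor
 * allocates it once, the destructor frees it, and dcnode_recycle()
 * returns a dcnode to its freshly-constructed state before reuse.
 */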
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the lru list is at its limit (DCLRUSIZE),
	 * re-use a node from it
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru; if it's over the limit, free from head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}

/*
 * Return shadow vnode with the given vp as its subordinate
 */
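/*
 * Illustrative caller sketch (hypothetical; the real code lives in
 * ufs_lookup()): once a looked-up vnode is known to carry the
 * compressed-file flag, the shadow is substituted roughly as
 *
 *	vnode_t *cvp = decompvp(vp, cr, ct);
 *	if (cvp != NULL) {
 *		VN_RELE(vp);
 *		vp = cvp;
 *	}
 *
 * A NULL return means the file is not a valid compressed file and the
 * original vnode should be handed out unchanged.
 */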
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */
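
/*
 * Chains are singly linked through dc_hash; all table and lru
 * manipulation is done under dctable_lock.  A node returned by
 * dcfind() may have been sitting on the lru (inactive but kept
 * around for its cached pages), in which case reviving it also
 * takes it back off the lru.
 */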

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef	DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}