/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - a filesystem for automatic decompression of fiocompressed files
 *
 * This is a layered filesystem that sits on top of a normal persistent
 * filesystem and provides automatic decompression of files that have
 * been previously compressed and stored on the host filesystem.
 * It is a pseudo filesystem in that it does not persist data; rather, it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup() checks for this flag and, if it is set, passes
 * control to decompvp(), a function defined in this (dcfs) filesystem.
 * decompvp() uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. The compressed files currently cannot be
 * written to.
 */

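/*
 * A minimal sketch of the hand-off described above, illustrative only
 * (the real check lives in ufs_lookup(), and the inode field and flag
 * names here follow ufs_inode.h):
 *
 *	if (ip->i_cflags & ICOMPRESS)
 *		*vpp = decompvp(vp, cr, ct);	-- hand back dcfs shadow vnode
 */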

/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif
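
/*
 * Example: with DCTABLESIZE 16 (a power of two), a vnode at address
 * 0x12345678 hashes to ((0x12345678 >> DCSHFT) & 15) == 3, i.e. hash
 * chain 3 of dctable[].
 */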

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

struct kmem_cache *dcnode_cache;
/*
 * Decompression buffer caches, indexed by btop(ch_blksize).  Since
 * ch_blksize may be as large as ptob(DCCACHESIZE), the array needs
 * DCCACHESIZE + 1 slots.
 */
struct kmem_cache *dcbuf_cache[DCCACHESIZE + 1];

kmutex_t	dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	VSW_ZMOUNT,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    caller_context_t *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    caller_context_t *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,			{ .vop_open = dc_open },
	VOPNAME_CLOSE,			{ .vop_close = dc_close },
	VOPNAME_READ,			{ .vop_read = dc_read },
	VOPNAME_GETATTR,		{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,			{ .vop_access = dc_access },
	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
	VOPNAME_FID,			{ .vop_fid = dc_fid },
	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
	VOPNAME_REALVP,			{ .vop_realvp = dc_realvp },
	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,			{ .vop_map = dc_map },
	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
	NULL,				NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    caller_context_t *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through the file with segmap; decompression occurs
	 * in dc_getpage()
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	VN_RELE_LOCKED(vp);
	if (vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it.  They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	int error;
	struct vattr vattr;

	/*
	 * If file is being mapped, disallow frlock.
	 */
	vattr.va_mask = AT_MODE;
	if ((error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp)) != 0)
		return (error);
	if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
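	/*
	 * ch_blkmap[] holds the file offset of each compressed block, so
	 * the sentinel entry appended by decompvp() makes the rdsize
	 * computation above valid for the last block as well.
	 */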
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok.  Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

static int
dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
	struct vnode *rvp;

	vp = VTODC(vp)->dc_subvp;
	if (VOP_REALVP(vp, &rvp, ct) == 0)
		vp = rvp;
	*vpp = vp;
	return (0);
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error = 0;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		if (error)
			break;
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}

/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if ((error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred,
	    ctp)) != 0)
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the LRU list has reached DCLRUSIZE entries,
	 * re-use one from it rather than allocating anew.
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If there are no cached pages, there's no need to put it on the lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to the lru; if that puts it over the limit, free from the head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}

/*
 * Return shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
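	/*
	 * ch_blkmap[0] is the file offset of the first compressed block,
	 * i.e. the size of the on-disk header; the extra uint64_t makes
	 * room for the sentinel blkmap entry appended below.
	 */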
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef	DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}