xref: /illumos-gate/usr/src/uts/common/vm/vm_pvn.c (revision 7c478bd9)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28*7c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
29*7c478bd9Sstevel@tonic-gate 
30*7c478bd9Sstevel@tonic-gate /*
31*7c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
32*7c478bd9Sstevel@tonic-gate  * The Regents of the University of California
33*7c478bd9Sstevel@tonic-gate  * All Rights Reserved
34*7c478bd9Sstevel@tonic-gate  *
35*7c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
36*7c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
37*7c478bd9Sstevel@tonic-gate  * contributors.
38*7c478bd9Sstevel@tonic-gate  */
39*7c478bd9Sstevel@tonic-gate 
40*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
41*7c478bd9Sstevel@tonic-gate 
42*7c478bd9Sstevel@tonic-gate /*
43*7c478bd9Sstevel@tonic-gate  * VM - paged vnode.
44*7c478bd9Sstevel@tonic-gate  *
45*7c478bd9Sstevel@tonic-gate  * This file supplies vm support for the vnode operations that deal with pages.
46*7c478bd9Sstevel@tonic-gate  */
47*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
48*7c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
49*7c478bd9Sstevel@tonic-gate #include <sys/param.h>
50*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
51*7c478bd9Sstevel@tonic-gate #include <sys/systm.h>
52*7c478bd9Sstevel@tonic-gate #include <sys/time.h>
53*7c478bd9Sstevel@tonic-gate #include <sys/buf.h>
54*7c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
55*7c478bd9Sstevel@tonic-gate #include <sys/uio.h>
56*7c478bd9Sstevel@tonic-gate #include <sys/vmmeter.h>
57*7c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>
58*7c478bd9Sstevel@tonic-gate #include <sys/mman.h>
59*7c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
60*7c478bd9Sstevel@tonic-gate #include <sys/cred.h>
61*7c478bd9Sstevel@tonic-gate #include <sys/user.h>
62*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
63*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
64*7c478bd9Sstevel@tonic-gate #include <sys/debug.h>
65*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
66*7c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
67*7c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
68*7c478bd9Sstevel@tonic-gate 
69*7c478bd9Sstevel@tonic-gate #include <vm/hat.h>
70*7c478bd9Sstevel@tonic-gate #include <vm/as.h>
71*7c478bd9Sstevel@tonic-gate #include <vm/seg.h>
72*7c478bd9Sstevel@tonic-gate #include <vm/rm.h>
73*7c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
74*7c478bd9Sstevel@tonic-gate #include <vm/page.h>
75*7c478bd9Sstevel@tonic-gate #include <vm/seg_map.h>
76*7c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
77*7c478bd9Sstevel@tonic-gate #include <sys/fs/swapnode.h>
78*7c478bd9Sstevel@tonic-gate 
79*7c478bd9Sstevel@tonic-gate int pvn_nofodklust = 0;
80*7c478bd9Sstevel@tonic-gate int pvn_write_noklust = 0;
81*7c478bd9Sstevel@tonic-gate 
82*7c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
83*7c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
84*7c478bd9Sstevel@tonic-gate 					/* support for vmodsort for testing */
85*7c478bd9Sstevel@tonic-gate 
86*7c478bd9Sstevel@tonic-gate static struct kmem_cache *marker_cache = NULL;
87*7c478bd9Sstevel@tonic-gate 
88*7c478bd9Sstevel@tonic-gate /*
89*7c478bd9Sstevel@tonic-gate  * Find the largest contiguous block which contains `addr' for file offset
90*7c478bd9Sstevel@tonic-gate  * `offset' in it while living within the file system block sizes (`vp_off'
91*7c478bd9Sstevel@tonic-gate  * and `vp_len') and the address space limits for which no pages currently
92*7c478bd9Sstevel@tonic-gate  * exist and which map to consecutive file offsets.
93*7c478bd9Sstevel@tonic-gate  */
94*7c478bd9Sstevel@tonic-gate page_t *
95*7c478bd9Sstevel@tonic-gate pvn_read_kluster(
96*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
97*7c478bd9Sstevel@tonic-gate 	u_offset_t off,
98*7c478bd9Sstevel@tonic-gate 	struct seg *seg,
99*7c478bd9Sstevel@tonic-gate 	caddr_t addr,
100*7c478bd9Sstevel@tonic-gate 	u_offset_t *offp,			/* return values */
101*7c478bd9Sstevel@tonic-gate 	size_t *lenp,				/* return values */
102*7c478bd9Sstevel@tonic-gate 	u_offset_t vp_off,
103*7c478bd9Sstevel@tonic-gate 	size_t vp_len,
104*7c478bd9Sstevel@tonic-gate 	int isra)
105*7c478bd9Sstevel@tonic-gate {
106*7c478bd9Sstevel@tonic-gate 	ssize_t deltaf, deltab;
107*7c478bd9Sstevel@tonic-gate 	page_t *pp;
108*7c478bd9Sstevel@tonic-gate 	page_t *plist = NULL;
109*7c478bd9Sstevel@tonic-gate 	spgcnt_t pagesavail;
110*7c478bd9Sstevel@tonic-gate 	u_offset_t vp_end;
111*7c478bd9Sstevel@tonic-gate 
112*7c478bd9Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_off + vp_len);
113*7c478bd9Sstevel@tonic-gate 
114*7c478bd9Sstevel@tonic-gate 	/*
115*7c478bd9Sstevel@tonic-gate 	 * We only want to do klustering/read ahead if there
116*7c478bd9Sstevel@tonic-gate 	 * is more than minfree pages currently available.
117*7c478bd9Sstevel@tonic-gate 	 */
118*7c478bd9Sstevel@tonic-gate 	pagesavail = freemem - minfree;
119*7c478bd9Sstevel@tonic-gate 
120*7c478bd9Sstevel@tonic-gate 	if (pagesavail <= 0)
121*7c478bd9Sstevel@tonic-gate 		if (isra)
122*7c478bd9Sstevel@tonic-gate 			return ((page_t *)NULL);    /* ra case - give up */
123*7c478bd9Sstevel@tonic-gate 		else
124*7c478bd9Sstevel@tonic-gate 			pagesavail = 1;		    /* must return a page */
125*7c478bd9Sstevel@tonic-gate 
126*7c478bd9Sstevel@tonic-gate 	/* We calculate in pages instead of bytes due to 32-bit overflows */
127*7c478bd9Sstevel@tonic-gate 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
128*7c478bd9Sstevel@tonic-gate 		/*
129*7c478bd9Sstevel@tonic-gate 		 * Don't have enough free memory for the
130*7c478bd9Sstevel@tonic-gate 		 * max request, try sizing down vp request.
131*7c478bd9Sstevel@tonic-gate 		 */
132*7c478bd9Sstevel@tonic-gate 		deltab = (ssize_t)(off - vp_off);
133*7c478bd9Sstevel@tonic-gate 		vp_len -= deltab;
134*7c478bd9Sstevel@tonic-gate 		vp_off += deltab;
135*7c478bd9Sstevel@tonic-gate 		if (pagesavail < btopr(vp_len)) {
136*7c478bd9Sstevel@tonic-gate 			/*
137*7c478bd9Sstevel@tonic-gate 			 * Still not enough memory, just settle for
138*7c478bd9Sstevel@tonic-gate 			 * pagesavail which is at least 1.
139*7c478bd9Sstevel@tonic-gate 			 */
140*7c478bd9Sstevel@tonic-gate 			vp_len = ptob(pagesavail);
141*7c478bd9Sstevel@tonic-gate 		}
142*7c478bd9Sstevel@tonic-gate 	}
143*7c478bd9Sstevel@tonic-gate 
144*7c478bd9Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
145*7c478bd9Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_end);
146*7c478bd9Sstevel@tonic-gate 
147*7c478bd9Sstevel@tonic-gate 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
148*7c478bd9Sstevel@tonic-gate 		return ((page_t *)NULL);	/* segment driver says no */
149*7c478bd9Sstevel@tonic-gate 
150*7c478bd9Sstevel@tonic-gate 	if ((plist = page_create_va(vp, off,
151*7c478bd9Sstevel@tonic-gate 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
152*7c478bd9Sstevel@tonic-gate 		return ((page_t *)NULL);
153*7c478bd9Sstevel@tonic-gate 
154*7c478bd9Sstevel@tonic-gate 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
155*7c478bd9Sstevel@tonic-gate 		*offp = off;
156*7c478bd9Sstevel@tonic-gate 		*lenp = MIN(vp_len, PAGESIZE);
157*7c478bd9Sstevel@tonic-gate 	} else {
158*7c478bd9Sstevel@tonic-gate 		/*
159*7c478bd9Sstevel@tonic-gate 		 * Scan back from front by incrementing "deltab" and
160*7c478bd9Sstevel@tonic-gate 		 * comparing "off" with "vp_off + deltab" to avoid
161*7c478bd9Sstevel@tonic-gate 		 * "signed" versus "unsigned" conversion problems.
162*7c478bd9Sstevel@tonic-gate 		 */
163*7c478bd9Sstevel@tonic-gate 		for (deltab = PAGESIZE; off >= vp_off + deltab;
164*7c478bd9Sstevel@tonic-gate 		    deltab += PAGESIZE) {
165*7c478bd9Sstevel@tonic-gate 			/*
166*7c478bd9Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
167*7c478bd9Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
168*7c478bd9Sstevel@tonic-gate 			 */
169*7c478bd9Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, -deltab))
170*7c478bd9Sstevel@tonic-gate 				break;		/* page not eligible */
171*7c478bd9Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off - deltab,
172*7c478bd9Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
173*7c478bd9Sstevel@tonic-gate 			    == NULL)
174*7c478bd9Sstevel@tonic-gate 				break;		/* already have the page */
175*7c478bd9Sstevel@tonic-gate 			/*
176*7c478bd9Sstevel@tonic-gate 			 * Add page to front of page list.
177*7c478bd9Sstevel@tonic-gate 			 */
178*7c478bd9Sstevel@tonic-gate 			page_add(&plist, pp);
179*7c478bd9Sstevel@tonic-gate 		}
180*7c478bd9Sstevel@tonic-gate 		deltab -= PAGESIZE;
181*7c478bd9Sstevel@tonic-gate 
182*7c478bd9Sstevel@tonic-gate 		/* scan forward from front */
183*7c478bd9Sstevel@tonic-gate 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
184*7c478bd9Sstevel@tonic-gate 		    deltaf += PAGESIZE) {
185*7c478bd9Sstevel@tonic-gate 			/*
186*7c478bd9Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
187*7c478bd9Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
188*7c478bd9Sstevel@tonic-gate 			 */
189*7c478bd9Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, deltaf))
190*7c478bd9Sstevel@tonic-gate 				break;		/* page not file extension */
191*7c478bd9Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off + deltaf,
192*7c478bd9Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
193*7c478bd9Sstevel@tonic-gate 			    == NULL)
194*7c478bd9Sstevel@tonic-gate 				break;		/* already have page */
195*7c478bd9Sstevel@tonic-gate 
196*7c478bd9Sstevel@tonic-gate 			/*
197*7c478bd9Sstevel@tonic-gate 			 * Add page to end of page list.
198*7c478bd9Sstevel@tonic-gate 			 */
199*7c478bd9Sstevel@tonic-gate 			page_add(&plist, pp);
200*7c478bd9Sstevel@tonic-gate 			plist = plist->p_next;
201*7c478bd9Sstevel@tonic-gate 		}
202*7c478bd9Sstevel@tonic-gate 		*offp = off = off - deltab;
203*7c478bd9Sstevel@tonic-gate 		*lenp = deltab + deltaf;
204*7c478bd9Sstevel@tonic-gate 		ASSERT(off >= vp_off);
205*7c478bd9Sstevel@tonic-gate 
206*7c478bd9Sstevel@tonic-gate 		/*
207*7c478bd9Sstevel@tonic-gate 		 * If we ended up getting more than was actually
208*7c478bd9Sstevel@tonic-gate 		 * requested, retract the returned length to only
209*7c478bd9Sstevel@tonic-gate 		 * reflect what was requested.  This might happen
210*7c478bd9Sstevel@tonic-gate 		 * if we were allowed to kluster pages across a
211*7c478bd9Sstevel@tonic-gate 		 * span of (say) 5 frags, and frag size is less
212*7c478bd9Sstevel@tonic-gate 		 * than PAGESIZE.  We need a whole number of
213*7c478bd9Sstevel@tonic-gate 		 * pages to contain those frags, but the returned
214*7c478bd9Sstevel@tonic-gate 		 * size should only allow the returned range to
215*7c478bd9Sstevel@tonic-gate 		 * extend as far as the end of the frags.
216*7c478bd9Sstevel@tonic-gate 		 */
217*7c478bd9Sstevel@tonic-gate 		if ((vp_off + vp_len) < (off + *lenp)) {
218*7c478bd9Sstevel@tonic-gate 			ASSERT(vp_end > off);
219*7c478bd9Sstevel@tonic-gate 			*lenp = vp_end - off;
220*7c478bd9Sstevel@tonic-gate 		}
221*7c478bd9Sstevel@tonic-gate 	}
222*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
223*7c478bd9Sstevel@tonic-gate 		"pvn_read_kluster:seg %p addr %x isra %x",
224*7c478bd9Sstevel@tonic-gate 		seg, addr, isra);
225*7c478bd9Sstevel@tonic-gate 	return (plist);
226*7c478bd9Sstevel@tonic-gate }
227*7c478bd9Sstevel@tonic-gate 
228*7c478bd9Sstevel@tonic-gate /*
229*7c478bd9Sstevel@tonic-gate  * Handle pages for this vnode on either side of the page "pp"
230*7c478bd9Sstevel@tonic-gate  * which has been locked by the caller.  This routine will also
231*7c478bd9Sstevel@tonic-gate  * do klustering in the range [vp_off, vp_off + vp_len] up
232*7c478bd9Sstevel@tonic-gate  * until a page which is not found.  The offset and length
233*7c478bd9Sstevel@tonic-gate  * of pages included is returned in "*offp" and "*lenp".
234*7c478bd9Sstevel@tonic-gate  *
235*7c478bd9Sstevel@tonic-gate  * Returns a list of dirty locked pages all ready to be
236*7c478bd9Sstevel@tonic-gate  * written back.
237*7c478bd9Sstevel@tonic-gate  */
238*7c478bd9Sstevel@tonic-gate page_t *
239*7c478bd9Sstevel@tonic-gate pvn_write_kluster(
240*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
241*7c478bd9Sstevel@tonic-gate 	page_t *pp,
242*7c478bd9Sstevel@tonic-gate 	u_offset_t *offp,		/* return values */
243*7c478bd9Sstevel@tonic-gate 	size_t *lenp,			/* return values */
244*7c478bd9Sstevel@tonic-gate 	u_offset_t vp_off,
245*7c478bd9Sstevel@tonic-gate 	size_t vp_len,
246*7c478bd9Sstevel@tonic-gate 	int flags)
247*7c478bd9Sstevel@tonic-gate {
248*7c478bd9Sstevel@tonic-gate 	u_offset_t off;
249*7c478bd9Sstevel@tonic-gate 	page_t *dirty;
250*7c478bd9Sstevel@tonic-gate 	size_t deltab, deltaf;
251*7c478bd9Sstevel@tonic-gate 	se_t se;
252*7c478bd9Sstevel@tonic-gate 	u_offset_t vp_end;
253*7c478bd9Sstevel@tonic-gate 
254*7c478bd9Sstevel@tonic-gate 	off = pp->p_offset;
255*7c478bd9Sstevel@tonic-gate 
256*7c478bd9Sstevel@tonic-gate 	/*
257*7c478bd9Sstevel@tonic-gate 	 * Kustering should not be done if we are invalidating
258*7c478bd9Sstevel@tonic-gate 	 * pages since we could destroy pages that belong to
259*7c478bd9Sstevel@tonic-gate 	 * some other process if this is a swap vnode.
260*7c478bd9Sstevel@tonic-gate 	 */
261*7c478bd9Sstevel@tonic-gate 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
262*7c478bd9Sstevel@tonic-gate 		*offp = off;
263*7c478bd9Sstevel@tonic-gate 		*lenp = PAGESIZE;
264*7c478bd9Sstevel@tonic-gate 		return (pp);
265*7c478bd9Sstevel@tonic-gate 	}
266*7c478bd9Sstevel@tonic-gate 
267*7c478bd9Sstevel@tonic-gate 	if (flags & (B_FREE | B_INVAL))
268*7c478bd9Sstevel@tonic-gate 		se = SE_EXCL;
269*7c478bd9Sstevel@tonic-gate 	else
270*7c478bd9Sstevel@tonic-gate 		se = SE_SHARED;
271*7c478bd9Sstevel@tonic-gate 
272*7c478bd9Sstevel@tonic-gate 	dirty = pp;
273*7c478bd9Sstevel@tonic-gate 	/*
274*7c478bd9Sstevel@tonic-gate 	 * Scan backwards looking for pages to kluster by incrementing
275*7c478bd9Sstevel@tonic-gate 	 * "deltab" and comparing "off" with "vp_off + deltab" to
276*7c478bd9Sstevel@tonic-gate 	 * avoid "signed" versus "unsigned" conversion problems.
277*7c478bd9Sstevel@tonic-gate 	 */
278*7c478bd9Sstevel@tonic-gate 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
279*7c478bd9Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off - deltab, se);
280*7c478bd9Sstevel@tonic-gate 		if (pp == NULL)
281*7c478bd9Sstevel@tonic-gate 			break;		/* page not found */
282*7c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
283*7c478bd9Sstevel@tonic-gate 			break;
284*7c478bd9Sstevel@tonic-gate 		page_add(&dirty, pp);
285*7c478bd9Sstevel@tonic-gate 	}
286*7c478bd9Sstevel@tonic-gate 	deltab -= PAGESIZE;
287*7c478bd9Sstevel@tonic-gate 
288*7c478bd9Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
289*7c478bd9Sstevel@tonic-gate 	/* now scan forwards looking for pages to kluster */
290*7c478bd9Sstevel@tonic-gate 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
291*7c478bd9Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off + deltaf, se);
292*7c478bd9Sstevel@tonic-gate 		if (pp == NULL)
293*7c478bd9Sstevel@tonic-gate 			break;		/* page not found */
294*7c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
295*7c478bd9Sstevel@tonic-gate 			break;
296*7c478bd9Sstevel@tonic-gate 		page_add(&dirty, pp);
297*7c478bd9Sstevel@tonic-gate 		dirty = dirty->p_next;
298*7c478bd9Sstevel@tonic-gate 	}
299*7c478bd9Sstevel@tonic-gate 
300*7c478bd9Sstevel@tonic-gate 	*offp = off - deltab;
301*7c478bd9Sstevel@tonic-gate 	*lenp = deltab + deltaf;
302*7c478bd9Sstevel@tonic-gate 	return (dirty);
303*7c478bd9Sstevel@tonic-gate }
304*7c478bd9Sstevel@tonic-gate 
305*7c478bd9Sstevel@tonic-gate /*
306*7c478bd9Sstevel@tonic-gate  * Generic entry point used to release the "shared/exclusive" lock
307*7c478bd9Sstevel@tonic-gate  * and the "p_iolock" on pages after i/o is complete.
308*7c478bd9Sstevel@tonic-gate  */
309*7c478bd9Sstevel@tonic-gate void
310*7c478bd9Sstevel@tonic-gate pvn_io_done(page_t *plist)
311*7c478bd9Sstevel@tonic-gate {
312*7c478bd9Sstevel@tonic-gate 	page_t *pp;
313*7c478bd9Sstevel@tonic-gate 
314*7c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
315*7c478bd9Sstevel@tonic-gate 		pp = plist;
316*7c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
317*7c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
318*7c478bd9Sstevel@tonic-gate 		page_unlock(pp);
319*7c478bd9Sstevel@tonic-gate 	}
320*7c478bd9Sstevel@tonic-gate }
321*7c478bd9Sstevel@tonic-gate 
322*7c478bd9Sstevel@tonic-gate /*
323*7c478bd9Sstevel@tonic-gate  * Entry point to be used by file system getpage subr's and
324*7c478bd9Sstevel@tonic-gate  * other such routines which either want to unlock pages (B_ASYNC
325*7c478bd9Sstevel@tonic-gate  * request) or destroy a list of pages if an error occurred.
326*7c478bd9Sstevel@tonic-gate  */
327*7c478bd9Sstevel@tonic-gate void
328*7c478bd9Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags)
329*7c478bd9Sstevel@tonic-gate {
330*7c478bd9Sstevel@tonic-gate 	page_t *pp;
331*7c478bd9Sstevel@tonic-gate 
332*7c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
333*7c478bd9Sstevel@tonic-gate 		pp = plist;
334*7c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
335*7c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
336*7c478bd9Sstevel@tonic-gate 		if (flags & B_ERROR) {
337*7c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
338*7c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
339*7c478bd9Sstevel@tonic-gate 		} else {
340*7c478bd9Sstevel@tonic-gate 			(void) page_release(pp, 0);
341*7c478bd9Sstevel@tonic-gate 		}
342*7c478bd9Sstevel@tonic-gate 	}
343*7c478bd9Sstevel@tonic-gate }
344*7c478bd9Sstevel@tonic-gate 
345*7c478bd9Sstevel@tonic-gate /*
346*7c478bd9Sstevel@tonic-gate  * Automagic pageout.
347*7c478bd9Sstevel@tonic-gate  * When memory gets tight, start freeing pages popping out of the
348*7c478bd9Sstevel@tonic-gate  * write queue.
349*7c478bd9Sstevel@tonic-gate  */
350*7c478bd9Sstevel@tonic-gate int	write_free = 1;
351*7c478bd9Sstevel@tonic-gate pgcnt_t	pages_before_pager = 200;	/* LMXXX */
352*7c478bd9Sstevel@tonic-gate 
353*7c478bd9Sstevel@tonic-gate /*
354*7c478bd9Sstevel@tonic-gate  * Routine to be called when page-out's complete.
355*7c478bd9Sstevel@tonic-gate  * The caller, typically VOP_PUTPAGE, has to explicity call this routine
356*7c478bd9Sstevel@tonic-gate  * after waiting for i/o to complete (biowait) to free the list of
357*7c478bd9Sstevel@tonic-gate  * pages associated with the buffer.  These pages must be locked
358*7c478bd9Sstevel@tonic-gate  * before i/o is initiated.
359*7c478bd9Sstevel@tonic-gate  *
360*7c478bd9Sstevel@tonic-gate  * If a write error occurs, the pages are marked as modified
361*7c478bd9Sstevel@tonic-gate  * so the write will be re-tried later.
362*7c478bd9Sstevel@tonic-gate  */
363*7c478bd9Sstevel@tonic-gate 
364*7c478bd9Sstevel@tonic-gate void
365*7c478bd9Sstevel@tonic-gate pvn_write_done(page_t *plist, int flags)
366*7c478bd9Sstevel@tonic-gate {
367*7c478bd9Sstevel@tonic-gate 	int dfree = 0;
368*7c478bd9Sstevel@tonic-gate 	int pgrec = 0;
369*7c478bd9Sstevel@tonic-gate 	int pgout = 0;
370*7c478bd9Sstevel@tonic-gate 	int pgpgout = 0;
371*7c478bd9Sstevel@tonic-gate 	int anonpgout = 0;
372*7c478bd9Sstevel@tonic-gate 	int anonfree = 0;
373*7c478bd9Sstevel@tonic-gate 	int fspgout = 0;
374*7c478bd9Sstevel@tonic-gate 	int fsfree = 0;
375*7c478bd9Sstevel@tonic-gate 	int execpgout = 0;
376*7c478bd9Sstevel@tonic-gate 	int execfree = 0;
377*7c478bd9Sstevel@tonic-gate 	page_t *pp;
378*7c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
379*7c478bd9Sstevel@tonic-gate 	struct vnode *vp = NULL;	/* for probe */
380*7c478bd9Sstevel@tonic-gate 	uint_t ppattr;
381*7c478bd9Sstevel@tonic-gate 
382*7c478bd9Sstevel@tonic-gate 	ASSERT((flags & B_READ) == 0);
383*7c478bd9Sstevel@tonic-gate 
384*7c478bd9Sstevel@tonic-gate 	/*
385*7c478bd9Sstevel@tonic-gate 	 * If we are about to start paging anyway, start freeing pages.
386*7c478bd9Sstevel@tonic-gate 	 */
387*7c478bd9Sstevel@tonic-gate 	if (write_free && freemem < lotsfree + pages_before_pager &&
388*7c478bd9Sstevel@tonic-gate 	    (flags & B_ERROR) == 0) {
389*7c478bd9Sstevel@tonic-gate 		flags |= B_FREE;
390*7c478bd9Sstevel@tonic-gate 	}
391*7c478bd9Sstevel@tonic-gate 
392*7c478bd9Sstevel@tonic-gate 	/*
393*7c478bd9Sstevel@tonic-gate 	 * Handle each page involved in the i/o operation.
394*7c478bd9Sstevel@tonic-gate 	 */
395*7c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
396*7c478bd9Sstevel@tonic-gate 		pp = plist;
397*7c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
398*7c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
399*7c478bd9Sstevel@tonic-gate 
400*7c478bd9Sstevel@tonic-gate 		/* Kernel probe support */
401*7c478bd9Sstevel@tonic-gate 		if (vp == NULL)
402*7c478bd9Sstevel@tonic-gate 			vp = pp->p_vnode;
403*7c478bd9Sstevel@tonic-gate 
404*7c478bd9Sstevel@tonic-gate 		if (flags & B_ERROR) {
405*7c478bd9Sstevel@tonic-gate 			/*
406*7c478bd9Sstevel@tonic-gate 			 * Write operation failed.  We don't want
407*7c478bd9Sstevel@tonic-gate 			 * to destroy (or free) the page unless B_FORCE
408*7c478bd9Sstevel@tonic-gate 			 * is set. We set the mod bit again and release
409*7c478bd9Sstevel@tonic-gate 			 * all locks on the page so that it will get written
410*7c478bd9Sstevel@tonic-gate 			 * back again later when things are hopefully
411*7c478bd9Sstevel@tonic-gate 			 * better again.
412*7c478bd9Sstevel@tonic-gate 			 * If B_INVAL and B_FORCE is set we really have
413*7c478bd9Sstevel@tonic-gate 			 * to destroy the page.
414*7c478bd9Sstevel@tonic-gate 			 */
415*7c478bd9Sstevel@tonic-gate 			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
416*7c478bd9Sstevel@tonic-gate 				page_io_unlock(pp);
417*7c478bd9Sstevel@tonic-gate 				/*LINTED: constant in conditional context*/
418*7c478bd9Sstevel@tonic-gate 				VN_DISPOSE(pp, B_INVAL, 0, kcred);
419*7c478bd9Sstevel@tonic-gate 			} else {
420*7c478bd9Sstevel@tonic-gate 				hat_setmod(pp);
421*7c478bd9Sstevel@tonic-gate 				page_io_unlock(pp);
422*7c478bd9Sstevel@tonic-gate 				page_unlock(pp);
423*7c478bd9Sstevel@tonic-gate 			}
424*7c478bd9Sstevel@tonic-gate 		} else if (flags & B_INVAL) {
425*7c478bd9Sstevel@tonic-gate 			/*
426*7c478bd9Sstevel@tonic-gate 			 * XXX - Failed writes with B_INVAL set are
427*7c478bd9Sstevel@tonic-gate 			 * not handled appropriately.
428*7c478bd9Sstevel@tonic-gate 			 */
429*7c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
430*7c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
431*7c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
432*7c478bd9Sstevel@tonic-gate 		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
433*7c478bd9Sstevel@tonic-gate 			/*
434*7c478bd9Sstevel@tonic-gate 			 * Update statistics for pages being paged out
435*7c478bd9Sstevel@tonic-gate 			 */
436*7c478bd9Sstevel@tonic-gate 			if (pp->p_vnode) {
437*7c478bd9Sstevel@tonic-gate 				if (IS_SWAPFSVP(pp->p_vnode)) {
438*7c478bd9Sstevel@tonic-gate 					anonpgout++;
439*7c478bd9Sstevel@tonic-gate 				} else {
440*7c478bd9Sstevel@tonic-gate 					if (pp->p_vnode->v_flag & VVMEXEC) {
441*7c478bd9Sstevel@tonic-gate 						execpgout++;
442*7c478bd9Sstevel@tonic-gate 					} else {
443*7c478bd9Sstevel@tonic-gate 						fspgout++;
444*7c478bd9Sstevel@tonic-gate 					}
445*7c478bd9Sstevel@tonic-gate 				}
446*7c478bd9Sstevel@tonic-gate 			}
447*7c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
448*7c478bd9Sstevel@tonic-gate 			pgout = 1;
449*7c478bd9Sstevel@tonic-gate 			pgpgout++;
450*7c478bd9Sstevel@tonic-gate 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
451*7c478bd9Sstevel@tonic-gate 				"page_ws_out:pp %p", pp);
452*7c478bd9Sstevel@tonic-gate 
453*7c478bd9Sstevel@tonic-gate 			/*
454*7c478bd9Sstevel@tonic-gate 			 * The page_struct_lock need not be acquired to
455*7c478bd9Sstevel@tonic-gate 			 * examine "p_lckcnt" and "p_cowcnt" since we'll
456*7c478bd9Sstevel@tonic-gate 			 * have an "exclusive" lock if the upgrade succeeds.
457*7c478bd9Sstevel@tonic-gate 			 */
458*7c478bd9Sstevel@tonic-gate 			if (page_tryupgrade(pp) &&
459*7c478bd9Sstevel@tonic-gate 			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
460*7c478bd9Sstevel@tonic-gate 				/*
461*7c478bd9Sstevel@tonic-gate 				 * Check if someone has reclaimed the
462*7c478bd9Sstevel@tonic-gate 				 * page.  If ref and mod are not set, no
463*7c478bd9Sstevel@tonic-gate 				 * one is using it so we can free it.
464*7c478bd9Sstevel@tonic-gate 				 * The rest of the system is careful
465*7c478bd9Sstevel@tonic-gate 				 * to use the NOSYNC flag to unload
466*7c478bd9Sstevel@tonic-gate 				 * translations set up for i/o w/o
467*7c478bd9Sstevel@tonic-gate 				 * affecting ref and mod bits.
468*7c478bd9Sstevel@tonic-gate 				 *
469*7c478bd9Sstevel@tonic-gate 				 * Obtain a copy of the real hardware
470*7c478bd9Sstevel@tonic-gate 				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
471*7c478bd9Sstevel@tonic-gate 				 * to avoid having to flush the cache.
472*7c478bd9Sstevel@tonic-gate 				 */
473*7c478bd9Sstevel@tonic-gate 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
474*7c478bd9Sstevel@tonic-gate 					HAT_SYNC_STOPON_MOD);
475*7c478bd9Sstevel@tonic-gate 			ck_refmod:
476*7c478bd9Sstevel@tonic-gate 				if (!(ppattr & (P_REF | P_MOD))) {
477*7c478bd9Sstevel@tonic-gate 					if (hat_page_is_mapped(pp)) {
478*7c478bd9Sstevel@tonic-gate 						/*
479*7c478bd9Sstevel@tonic-gate 						 * Doesn't look like the page
480*7c478bd9Sstevel@tonic-gate 						 * was modified so now we
481*7c478bd9Sstevel@tonic-gate 						 * really have to unload the
482*7c478bd9Sstevel@tonic-gate 						 * translations.  Meanwhile
483*7c478bd9Sstevel@tonic-gate 						 * another CPU could've
484*7c478bd9Sstevel@tonic-gate 						 * modified it so we have to
485*7c478bd9Sstevel@tonic-gate 						 * check again.  We don't loop
486*7c478bd9Sstevel@tonic-gate 						 * forever here because now
487*7c478bd9Sstevel@tonic-gate 						 * the translations are gone
488*7c478bd9Sstevel@tonic-gate 						 * and no one can get a new one
489*7c478bd9Sstevel@tonic-gate 						 * since we have the "exclusive"
490*7c478bd9Sstevel@tonic-gate 						 * lock on the page.
491*7c478bd9Sstevel@tonic-gate 						 */
492*7c478bd9Sstevel@tonic-gate 						(void) hat_pageunload(pp,
493*7c478bd9Sstevel@tonic-gate 							HAT_FORCE_PGUNLOAD);
494*7c478bd9Sstevel@tonic-gate 						ppattr = hat_page_getattr(pp,
495*7c478bd9Sstevel@tonic-gate 							P_REF | P_MOD);
496*7c478bd9Sstevel@tonic-gate 						goto ck_refmod;
497*7c478bd9Sstevel@tonic-gate 					}
498*7c478bd9Sstevel@tonic-gate 					/*
499*7c478bd9Sstevel@tonic-gate 					 * Update statistics for pages being
500*7c478bd9Sstevel@tonic-gate 					 * freed
501*7c478bd9Sstevel@tonic-gate 					 */
502*7c478bd9Sstevel@tonic-gate 					if (pp->p_vnode) {
503*7c478bd9Sstevel@tonic-gate 						if (IS_SWAPFSVP(pp->p_vnode)) {
504*7c478bd9Sstevel@tonic-gate 							anonfree++;
505*7c478bd9Sstevel@tonic-gate 						} else {
506*7c478bd9Sstevel@tonic-gate 							if (pp->p_vnode->v_flag
507*7c478bd9Sstevel@tonic-gate 							    & VVMEXEC) {
508*7c478bd9Sstevel@tonic-gate 								execfree++;
509*7c478bd9Sstevel@tonic-gate 							} else {
510*7c478bd9Sstevel@tonic-gate 								fsfree++;
511*7c478bd9Sstevel@tonic-gate 							}
512*7c478bd9Sstevel@tonic-gate 						}
513*7c478bd9Sstevel@tonic-gate 					}
514*7c478bd9Sstevel@tonic-gate 					/*LINTED: constant in conditional ctx*/
515*7c478bd9Sstevel@tonic-gate 					VN_DISPOSE(pp, B_FREE,
516*7c478bd9Sstevel@tonic-gate 						(flags & B_DONTNEED), kcred);
517*7c478bd9Sstevel@tonic-gate 					dfree++;
518*7c478bd9Sstevel@tonic-gate 				} else {
519*7c478bd9Sstevel@tonic-gate 					page_unlock(pp);
520*7c478bd9Sstevel@tonic-gate 					pgrec++;
521*7c478bd9Sstevel@tonic-gate 					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
522*7c478bd9Sstevel@tonic-gate 					    "page_ws_free:pp %p", pp);
523*7c478bd9Sstevel@tonic-gate 				}
524*7c478bd9Sstevel@tonic-gate 			} else {
525*7c478bd9Sstevel@tonic-gate 				/*
526*7c478bd9Sstevel@tonic-gate 				 * Page is either `locked' in memory
527*7c478bd9Sstevel@tonic-gate 				 * or was reclaimed and now has a
528*7c478bd9Sstevel@tonic-gate 				 * "shared" lock, so release it.
529*7c478bd9Sstevel@tonic-gate 				 */
530*7c478bd9Sstevel@tonic-gate 				page_unlock(pp);
531*7c478bd9Sstevel@tonic-gate 			}
532*7c478bd9Sstevel@tonic-gate 		} else {
533*7c478bd9Sstevel@tonic-gate 			/*
534*7c478bd9Sstevel@tonic-gate 			 * Neither B_FREE nor B_INVAL nor B_ERROR.
535*7c478bd9Sstevel@tonic-gate 			 * Just release locks.
536*7c478bd9Sstevel@tonic-gate 			 */
537*7c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
538*7c478bd9Sstevel@tonic-gate 			page_unlock(pp);
539*7c478bd9Sstevel@tonic-gate 		}
540*7c478bd9Sstevel@tonic-gate 	}
541*7c478bd9Sstevel@tonic-gate 
542*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ENTER_K();
543*7c478bd9Sstevel@tonic-gate 	cpup = CPU;		/* get cpup now that CPU cannot change */
544*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
545*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
546*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
547*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
548*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
549*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
550*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
551*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
552*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
553*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
554*7c478bd9Sstevel@tonic-gate 	CPU_STATS_EXIT_K();
555*7c478bd9Sstevel@tonic-gate 
556*7c478bd9Sstevel@tonic-gate 	/* Kernel probe */
557*7c478bd9Sstevel@tonic-gate 	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
558*7c478bd9Sstevel@tonic-gate 		tnf_opaque,	vnode,			vp,
559*7c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_pageout,		pgpgout,
560*7c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_freed,		dfree,
561*7c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_reclaimed,	pgrec);
562*7c478bd9Sstevel@tonic-gate }
563*7c478bd9Sstevel@tonic-gate 
564*7c478bd9Sstevel@tonic-gate /*
565*7c478bd9Sstevel@tonic-gate  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
566*7c478bd9Sstevel@tonic-gate  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
567*7c478bd9Sstevel@tonic-gate  * operation and is only to be considered if it doesn't involve any
568*7c478bd9Sstevel@tonic-gate  * waiting here.  B_TRUNC indicates that the file is being truncated
569*7c478bd9Sstevel@tonic-gate  * and so no i/o needs to be done. B_FORCE indicates that the page
570*7c478bd9Sstevel@tonic-gate  * must be destroyed so don't try wrting it out.
571*7c478bd9Sstevel@tonic-gate  *
572*7c478bd9Sstevel@tonic-gate  * The caller must ensure that the page is locked.  Returns 1, if
573*7c478bd9Sstevel@tonic-gate  * the page should be written back (the "iolock" is held in this
574*7c478bd9Sstevel@tonic-gate  * case), or 0 if the page has been dealt with or has been
575*7c478bd9Sstevel@tonic-gate  * unlocked.
576*7c478bd9Sstevel@tonic-gate  */
577*7c478bd9Sstevel@tonic-gate int
578*7c478bd9Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags)
579*7c478bd9Sstevel@tonic-gate {
580*7c478bd9Sstevel@tonic-gate 	ASSERT((flags & (B_INVAL | B_FREE)) ?
581*7c478bd9Sstevel@tonic-gate 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
582*7c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp) == 0);
583*7c478bd9Sstevel@tonic-gate 
584*7c478bd9Sstevel@tonic-gate 	/*
585*7c478bd9Sstevel@tonic-gate 	 * If trying to invalidate or free a logically `locked' page,
586*7c478bd9Sstevel@tonic-gate 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
587*7c478bd9Sstevel@tonic-gate 	 * p_cowcnt as the page is exclusively locked.
588*7c478bd9Sstevel@tonic-gate 	 */
589*7c478bd9Sstevel@tonic-gate 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
590*7c478bd9Sstevel@tonic-gate 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
591*7c478bd9Sstevel@tonic-gate 		page_unlock(pp);
592*7c478bd9Sstevel@tonic-gate 		return (0);
593*7c478bd9Sstevel@tonic-gate 	}
594*7c478bd9Sstevel@tonic-gate 
595*7c478bd9Sstevel@tonic-gate 	/*
596*7c478bd9Sstevel@tonic-gate 	 * Now acquire the i/o lock so we can add it to the dirty
597*7c478bd9Sstevel@tonic-gate 	 * list (if necessary).  We avoid blocking on the i/o lock
598*7c478bd9Sstevel@tonic-gate 	 * in the following cases:
599*7c478bd9Sstevel@tonic-gate 	 *
600*7c478bd9Sstevel@tonic-gate 	 *	If B_DELWRI is set, which implies that this request is
601*7c478bd9Sstevel@tonic-gate 	 *	due to a klustering operartion.
602*7c478bd9Sstevel@tonic-gate 	 *
603*7c478bd9Sstevel@tonic-gate 	 *	If this is an async (B_ASYNC) operation and we are not doing
604*7c478bd9Sstevel@tonic-gate 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
605*7c478bd9Sstevel@tonic-gate 	 *	that the the page is written out].
606*7c478bd9Sstevel@tonic-gate 	 */
607*7c478bd9Sstevel@tonic-gate 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
608*7c478bd9Sstevel@tonic-gate 		if (!page_io_trylock(pp)) {
609*7c478bd9Sstevel@tonic-gate 			page_unlock(pp);
610*7c478bd9Sstevel@tonic-gate 			return (0);
611*7c478bd9Sstevel@tonic-gate 		}
612*7c478bd9Sstevel@tonic-gate 	} else {
613*7c478bd9Sstevel@tonic-gate 		page_io_lock(pp);
614*7c478bd9Sstevel@tonic-gate 	}
615*7c478bd9Sstevel@tonic-gate 
616*7c478bd9Sstevel@tonic-gate 	/*
617*7c478bd9Sstevel@tonic-gate 	 * If we want to free or invalidate the page then
618*7c478bd9Sstevel@tonic-gate 	 * we need to unload it so that anyone who wants
619*7c478bd9Sstevel@tonic-gate 	 * it will have to take a minor fault to get it.
620*7c478bd9Sstevel@tonic-gate 	 * Otherwise, we're just writing the page back so we
621*7c478bd9Sstevel@tonic-gate 	 * need to sync up the hardwre and software mod bit to
622*7c478bd9Sstevel@tonic-gate 	 * detect any future modifications.  We clear the
623*7c478bd9Sstevel@tonic-gate 	 * software mod bit when we put the page on the dirty
624*7c478bd9Sstevel@tonic-gate 	 * list.
625*7c478bd9Sstevel@tonic-gate 	 */
626*7c478bd9Sstevel@tonic-gate 	if (flags & (B_INVAL | B_FREE)) {
627*7c478bd9Sstevel@tonic-gate 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
628*7c478bd9Sstevel@tonic-gate 	} else {
629*7c478bd9Sstevel@tonic-gate 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
630*7c478bd9Sstevel@tonic-gate 	}
631*7c478bd9Sstevel@tonic-gate 
632*7c478bd9Sstevel@tonic-gate 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
633*7c478bd9Sstevel@tonic-gate 		/*
634*7c478bd9Sstevel@tonic-gate 		 * Don't need to add it to the
635*7c478bd9Sstevel@tonic-gate 		 * list after all.
636*7c478bd9Sstevel@tonic-gate 		 */
637*7c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
638*7c478bd9Sstevel@tonic-gate 		if (flags & B_INVAL) {
639*7c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
640*7c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
641*7c478bd9Sstevel@tonic-gate 		} else if (flags & B_FREE) {
642*7c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
643*7c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
644*7c478bd9Sstevel@tonic-gate 		} else {
645*7c478bd9Sstevel@tonic-gate 			/*
646*7c478bd9Sstevel@tonic-gate 			 * This is advisory path for the callers
647*7c478bd9Sstevel@tonic-gate 			 * of VOP_PUTPAGE() who prefer freeing the
648*7c478bd9Sstevel@tonic-gate 			 * page _only_ if no one else is accessing it.
649*7c478bd9Sstevel@tonic-gate 			 * E.g. segmap_release()
650*7c478bd9Sstevel@tonic-gate 			 *
651*7c478bd9Sstevel@tonic-gate 			 * The above hat_ismod() check is useless because:
652*7c478bd9Sstevel@tonic-gate 			 * (1) we may not be holding SE_EXCL lock;
653*7c478bd9Sstevel@tonic-gate 			 * (2) we've not unloaded _all_ translations
654*7c478bd9Sstevel@tonic-gate 			 *
655*7c478bd9Sstevel@tonic-gate 			 * Let page_release() do the heavy-lifting.
656*7c478bd9Sstevel@tonic-gate 			 */
657*7c478bd9Sstevel@tonic-gate 			(void) page_release(pp, 1);
658*7c478bd9Sstevel@tonic-gate 		}
659*7c478bd9Sstevel@tonic-gate 		return (0);
660*7c478bd9Sstevel@tonic-gate 	}
661*7c478bd9Sstevel@tonic-gate 
662*7c478bd9Sstevel@tonic-gate 	/*
663*7c478bd9Sstevel@tonic-gate 	 * Page is dirty, get it ready for the write back
664*7c478bd9Sstevel@tonic-gate 	 * and add page to the dirty list.
665*7c478bd9Sstevel@tonic-gate 	 */
666*7c478bd9Sstevel@tonic-gate 	hat_clrrefmod(pp);
667*7c478bd9Sstevel@tonic-gate 
668*7c478bd9Sstevel@tonic-gate 	/*
669*7c478bd9Sstevel@tonic-gate 	 * If we're going to free the page when we're done
670*7c478bd9Sstevel@tonic-gate 	 * then we can let others try to use it starting now.
671*7c478bd9Sstevel@tonic-gate 	 * We'll detect the fact that they used it when the
672*7c478bd9Sstevel@tonic-gate 	 * i/o is done and avoid freeing the page.
673*7c478bd9Sstevel@tonic-gate 	 */
674*7c478bd9Sstevel@tonic-gate 	if (flags & B_FREE)
675*7c478bd9Sstevel@tonic-gate 		page_downgrade(pp);
676*7c478bd9Sstevel@tonic-gate 
677*7c478bd9Sstevel@tonic-gate 
678*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
679*7c478bd9Sstevel@tonic-gate 
680*7c478bd9Sstevel@tonic-gate 	return (1);
681*7c478bd9Sstevel@tonic-gate }
682*7c478bd9Sstevel@tonic-gate 
683*7c478bd9Sstevel@tonic-gate 
684*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
685*7c478bd9Sstevel@tonic-gate static int
686*7c478bd9Sstevel@tonic-gate marker_constructor(void *buf, void *cdrarg, int kmflags)
687*7c478bd9Sstevel@tonic-gate {
688*7c478bd9Sstevel@tonic-gate 	page_t *mark = buf;
689*7c478bd9Sstevel@tonic-gate 	bzero(mark, sizeof (page_t));
690*7c478bd9Sstevel@tonic-gate 	return (0);
691*7c478bd9Sstevel@tonic-gate }
692*7c478bd9Sstevel@tonic-gate 
693*7c478bd9Sstevel@tonic-gate void
694*7c478bd9Sstevel@tonic-gate pvn_init()
695*7c478bd9Sstevel@tonic-gate {
696*7c478bd9Sstevel@tonic-gate 	if (pvn_vmodsort_disable == 0)
697*7c478bd9Sstevel@tonic-gate 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
698*7c478bd9Sstevel@tonic-gate 	marker_cache = kmem_cache_create("marker_cache",
699*7c478bd9Sstevel@tonic-gate 	    sizeof (page_t), 0, marker_constructor,
700*7c478bd9Sstevel@tonic-gate 	    NULL, NULL, NULL, NULL, 0);
701*7c478bd9Sstevel@tonic-gate }
702*7c478bd9Sstevel@tonic-gate 
703*7c478bd9Sstevel@tonic-gate 
704*7c478bd9Sstevel@tonic-gate /*
705*7c478bd9Sstevel@tonic-gate  * Process a vnode's page list for all pages whose offset is >= off.
706*7c478bd9Sstevel@tonic-gate  * Pages are to either be free'd, invalidated, or written back to disk.
707*7c478bd9Sstevel@tonic-gate  *
708*7c478bd9Sstevel@tonic-gate  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
709*7c478bd9Sstevel@tonic-gate  * is specified, otherwise they are "shared" locked.
710*7c478bd9Sstevel@tonic-gate  *
711*7c478bd9Sstevel@tonic-gate  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
712*7c478bd9Sstevel@tonic-gate  *
713*7c478bd9Sstevel@tonic-gate  * Special marker page_t's are inserted in the list in order
714*7c478bd9Sstevel@tonic-gate  * to keep track of where we are in the list when locks are dropped.
715*7c478bd9Sstevel@tonic-gate  *
716*7c478bd9Sstevel@tonic-gate  * Note the list is circular and insertions can happen only at the
717*7c478bd9Sstevel@tonic-gate  * head and tail of the list. The algorithm ensures visiting all pages
718*7c478bd9Sstevel@tonic-gate  * on the list in the following way:
719*7c478bd9Sstevel@tonic-gate  *
720*7c478bd9Sstevel@tonic-gate  *    Drop two marker pages at the end of the list.
721*7c478bd9Sstevel@tonic-gate  *
722*7c478bd9Sstevel@tonic-gate  *    Move one marker page backwards towards the start of the list until
723*7c478bd9Sstevel@tonic-gate  *    it is at the list head, processing the pages passed along the way.
724*7c478bd9Sstevel@tonic-gate  *
725*7c478bd9Sstevel@tonic-gate  *    Due to race conditions when the vphm mutex is dropped, additional pages
726*7c478bd9Sstevel@tonic-gate  *    can be added to either end of the list, so we'll continue to move
727*7c478bd9Sstevel@tonic-gate  *    the marker and process pages until it is up against the end marker.
728*7c478bd9Sstevel@tonic-gate  *
729*7c478bd9Sstevel@tonic-gate  * There is one special exit condition. If we are processing a VMODSORT
730*7c478bd9Sstevel@tonic-gate  * vnode and only writing back modified pages, we can stop as soon as
731*7c478bd9Sstevel@tonic-gate  * we run into an unmodified page.  This makes fsync(3) operations fast.
732*7c478bd9Sstevel@tonic-gate  */
733*7c478bd9Sstevel@tonic-gate int
734*7c478bd9Sstevel@tonic-gate pvn_vplist_dirty(
735*7c478bd9Sstevel@tonic-gate 	vnode_t		*vp,
736*7c478bd9Sstevel@tonic-gate 	u_offset_t	off,
737*7c478bd9Sstevel@tonic-gate 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
738*7c478bd9Sstevel@tonic-gate 			size_t *, int, cred_t *),
739*7c478bd9Sstevel@tonic-gate 	int		flags,
740*7c478bd9Sstevel@tonic-gate 	cred_t		*cred)
741*7c478bd9Sstevel@tonic-gate {
742*7c478bd9Sstevel@tonic-gate 	page_t		*pp;
743*7c478bd9Sstevel@tonic-gate 	page_t		*mark;		/* marker page that moves toward head */
744*7c478bd9Sstevel@tonic-gate 	page_t		*end;		/* marker page at end of list */
745*7c478bd9Sstevel@tonic-gate 	int		err = 0;
746*7c478bd9Sstevel@tonic-gate 	int		error;
747*7c478bd9Sstevel@tonic-gate 	kmutex_t	*vphm;
748*7c478bd9Sstevel@tonic-gate 	se_t		se;
749*7c478bd9Sstevel@tonic-gate 	page_t		**where_to_move;
750*7c478bd9Sstevel@tonic-gate 
751*7c478bd9Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
752*7c478bd9Sstevel@tonic-gate 
753*7c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
754*7c478bd9Sstevel@tonic-gate 		return (0);
755*7c478bd9Sstevel@tonic-gate 
756*7c478bd9Sstevel@tonic-gate 
757*7c478bd9Sstevel@tonic-gate 	/*
758*7c478bd9Sstevel@tonic-gate 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
759*7c478bd9Sstevel@tonic-gate 	 *
760*7c478bd9Sstevel@tonic-gate 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
761*7c478bd9Sstevel@tonic-gate 	 * from getting blocked while flushing pages to a dead NFS server.
762*7c478bd9Sstevel@tonic-gate 	 */
763*7c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
764*7c478bd9Sstevel@tonic-gate 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
765*7c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
766*7c478bd9Sstevel@tonic-gate 		return (EAGAIN);
767*7c478bd9Sstevel@tonic-gate 	}
768*7c478bd9Sstevel@tonic-gate 
769*7c478bd9Sstevel@tonic-gate 	while (vp->v_flag & VVMLOCK)
770*7c478bd9Sstevel@tonic-gate 		cv_wait(&vp->v_cv, &vp->v_lock);
771*7c478bd9Sstevel@tonic-gate 
772*7c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL) {
773*7c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
774*7c478bd9Sstevel@tonic-gate 		return (0);
775*7c478bd9Sstevel@tonic-gate 	}
776*7c478bd9Sstevel@tonic-gate 
777*7c478bd9Sstevel@tonic-gate 	vp->v_flag |= VVMLOCK;
778*7c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
779*7c478bd9Sstevel@tonic-gate 
780*7c478bd9Sstevel@tonic-gate 
781*7c478bd9Sstevel@tonic-gate 	/*
782*7c478bd9Sstevel@tonic-gate 	 * Set up the marker pages used to walk the list
783*7c478bd9Sstevel@tonic-gate 	 */
784*7c478bd9Sstevel@tonic-gate 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
785*7c478bd9Sstevel@tonic-gate 	end->p_vnode = vp;
786*7c478bd9Sstevel@tonic-gate 	end->p_offset = (u_offset_t)-2;
787*7c478bd9Sstevel@tonic-gate 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
788*7c478bd9Sstevel@tonic-gate 	mark->p_vnode = vp;
789*7c478bd9Sstevel@tonic-gate 	mark->p_offset = (u_offset_t)-1;
790*7c478bd9Sstevel@tonic-gate 
791*7c478bd9Sstevel@tonic-gate 	/*
792*7c478bd9Sstevel@tonic-gate 	 * Grab the lock protecting the vnode's page list
793*7c478bd9Sstevel@tonic-gate 	 * note that this lock is dropped at times in the loop.
794*7c478bd9Sstevel@tonic-gate 	 */
795*7c478bd9Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
796*7c478bd9Sstevel@tonic-gate 	mutex_enter(vphm);
797*7c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
798*7c478bd9Sstevel@tonic-gate 		goto leave;
799*7c478bd9Sstevel@tonic-gate 
800*7c478bd9Sstevel@tonic-gate 	/*
801*7c478bd9Sstevel@tonic-gate 	 * insert the markers and loop through the list of pages
802*7c478bd9Sstevel@tonic-gate 	 */
803*7c478bd9Sstevel@tonic-gate 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
804*7c478bd9Sstevel@tonic-gate 	page_vpadd(&mark->p_vpnext, end);
805*7c478bd9Sstevel@tonic-gate 	for (;;) {
806*7c478bd9Sstevel@tonic-gate 
807*7c478bd9Sstevel@tonic-gate 		/*
808*7c478bd9Sstevel@tonic-gate 		 * If only doing an async write back, then we can
809*7c478bd9Sstevel@tonic-gate 		 * stop as soon as we get to start of the list.
810*7c478bd9Sstevel@tonic-gate 		 */
811*7c478bd9Sstevel@tonic-gate 		if (flags == B_ASYNC && vp->v_pages == mark)
812*7c478bd9Sstevel@tonic-gate 			break;
813*7c478bd9Sstevel@tonic-gate 
814*7c478bd9Sstevel@tonic-gate 		/*
815*7c478bd9Sstevel@tonic-gate 		 * otherwise stop when we've gone through all the pages
816*7c478bd9Sstevel@tonic-gate 		 */
817*7c478bd9Sstevel@tonic-gate 		if (mark->p_vpprev == end)
818*7c478bd9Sstevel@tonic-gate 			break;
819*7c478bd9Sstevel@tonic-gate 
820*7c478bd9Sstevel@tonic-gate 		pp = mark->p_vpprev;
821*7c478bd9Sstevel@tonic-gate 		if (vp->v_pages == pp)
822*7c478bd9Sstevel@tonic-gate 			where_to_move = &vp->v_pages;
823*7c478bd9Sstevel@tonic-gate 		else
824*7c478bd9Sstevel@tonic-gate 			where_to_move = &pp->p_vpprev->p_vpnext;
825*7c478bd9Sstevel@tonic-gate 
826*7c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
827*7c478bd9Sstevel@tonic-gate 
828*7c478bd9Sstevel@tonic-gate 		/*
829*7c478bd9Sstevel@tonic-gate 		 * Skip this page if the offset is out of the desired range.
830*7c478bd9Sstevel@tonic-gate 		 * Just move the marker and continue.
831*7c478bd9Sstevel@tonic-gate 		 */
832*7c478bd9Sstevel@tonic-gate 		if (pp->p_offset < off) {
833*7c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
834*7c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
835*7c478bd9Sstevel@tonic-gate 			continue;
836*7c478bd9Sstevel@tonic-gate 		}
837*7c478bd9Sstevel@tonic-gate 
838*7c478bd9Sstevel@tonic-gate 		/*
839*7c478bd9Sstevel@tonic-gate 		 * If just flushing dirty pages to disk and this vnode
840*7c478bd9Sstevel@tonic-gate 		 * is using a sorted list of pages, we can stop processing
841*7c478bd9Sstevel@tonic-gate 		 * as soon as we find an unmodified page. Since all the
842*7c478bd9Sstevel@tonic-gate 		 * modified pages are visited first.
843*7c478bd9Sstevel@tonic-gate 		 */
844*7c478bd9Sstevel@tonic-gate 		if (IS_VMODSORT(vp) &&
845*7c478bd9Sstevel@tonic-gate 		    !(flags & (B_INVAL | B_FREE | B_TRUNC)) &&
846*7c478bd9Sstevel@tonic-gate 		    !hat_ismod(pp)) {
847*7c478bd9Sstevel@tonic-gate #ifdef  DEBUG
848*7c478bd9Sstevel@tonic-gate 			/*
849*7c478bd9Sstevel@tonic-gate 			 * For debug kernels examine what should be all the
850*7c478bd9Sstevel@tonic-gate 			 * remaining clean pages, asserting that they are
851*7c478bd9Sstevel@tonic-gate 			 * not modified.
852*7c478bd9Sstevel@tonic-gate 			 */
853*7c478bd9Sstevel@tonic-gate 			page_t	*chk = pp;
854*7c478bd9Sstevel@tonic-gate 			int	attr;
855*7c478bd9Sstevel@tonic-gate 
856*7c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
857*7c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
858*7c478bd9Sstevel@tonic-gate 			do {
859*7c478bd9Sstevel@tonic-gate 				chk = chk->p_vpprev;
860*7c478bd9Sstevel@tonic-gate 				ASSERT(chk != end);
861*7c478bd9Sstevel@tonic-gate 				if (chk == mark)
862*7c478bd9Sstevel@tonic-gate 					continue;
863*7c478bd9Sstevel@tonic-gate 				attr = hat_page_getattr(chk, P_MOD | P_REF);
864*7c478bd9Sstevel@tonic-gate 				if ((attr & P_MOD) == 0)
865*7c478bd9Sstevel@tonic-gate 					continue;
866*7c478bd9Sstevel@tonic-gate 				panic("v_pages list not all clean: "
867*7c478bd9Sstevel@tonic-gate 				    "page_t*=%p vnode=%p off=%lx "
868*7c478bd9Sstevel@tonic-gate 				    "attr=0x%x last clean page_t*=%p\n",
869*7c478bd9Sstevel@tonic-gate 				    (void *)chk, (void *)chk->p_vnode,
870*7c478bd9Sstevel@tonic-gate 				    (long)chk->p_offset, attr, (void *)pp);
871*7c478bd9Sstevel@tonic-gate 			} while (chk != vp->v_pages);
872*7c478bd9Sstevel@tonic-gate #endif
873*7c478bd9Sstevel@tonic-gate 			break;
874*7c478bd9Sstevel@tonic-gate 		}
875*7c478bd9Sstevel@tonic-gate 
876*7c478bd9Sstevel@tonic-gate 		/*
877*7c478bd9Sstevel@tonic-gate 		 * If we are supposed to invalidate or free this
878*7c478bd9Sstevel@tonic-gate 		 * page, then we need an exclusive lock.
879*7c478bd9Sstevel@tonic-gate 		 */
880*7c478bd9Sstevel@tonic-gate 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
881*7c478bd9Sstevel@tonic-gate 
882*7c478bd9Sstevel@tonic-gate 		/*
883*7c478bd9Sstevel@tonic-gate 		 * We must acquire the page lock for all synchronous
884*7c478bd9Sstevel@tonic-gate 		 * operations (invalidate, free and write).
885*7c478bd9Sstevel@tonic-gate 		 */
886*7c478bd9Sstevel@tonic-gate 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
887*7c478bd9Sstevel@tonic-gate 			/*
888*7c478bd9Sstevel@tonic-gate 			 * If the page_lock() drops the mutex
889*7c478bd9Sstevel@tonic-gate 			 * we must retry the loop.
890*7c478bd9Sstevel@tonic-gate 			 */
891*7c478bd9Sstevel@tonic-gate 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
892*7c478bd9Sstevel@tonic-gate 				continue;
893*7c478bd9Sstevel@tonic-gate 
894*7c478bd9Sstevel@tonic-gate 			/*
895*7c478bd9Sstevel@tonic-gate 			 * It's ok to move the marker page now.
896*7c478bd9Sstevel@tonic-gate 			 */
897*7c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
898*7c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
899*7c478bd9Sstevel@tonic-gate 		} else {
900*7c478bd9Sstevel@tonic-gate 
901*7c478bd9Sstevel@tonic-gate 			/*
902*7c478bd9Sstevel@tonic-gate 			 * update the marker page for all remaining cases
903*7c478bd9Sstevel@tonic-gate 			 */
904*7c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
905*7c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
906*7c478bd9Sstevel@tonic-gate 
907*7c478bd9Sstevel@tonic-gate 			/*
908*7c478bd9Sstevel@tonic-gate 			 * For write backs, If we can't lock the page, it's
909*7c478bd9Sstevel@tonic-gate 			 * invalid or in the process of being destroyed.  Skip
910*7c478bd9Sstevel@tonic-gate 			 * it, assuming someone else is writing it.
911*7c478bd9Sstevel@tonic-gate 			 */
912*7c478bd9Sstevel@tonic-gate 			if (!page_trylock(pp, se))
913*7c478bd9Sstevel@tonic-gate 				continue;
914*7c478bd9Sstevel@tonic-gate 		}
915*7c478bd9Sstevel@tonic-gate 
916*7c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
917*7c478bd9Sstevel@tonic-gate 
918*7c478bd9Sstevel@tonic-gate 		/*
919*7c478bd9Sstevel@tonic-gate 		 * Successfully locked the page, now figure out what to
920*7c478bd9Sstevel@tonic-gate 		 * do with it. Free pages are easily dealt with, invalidate
921*7c478bd9Sstevel@tonic-gate 		 * if desired or just go on to the next page.
922*7c478bd9Sstevel@tonic-gate 		 */
923*7c478bd9Sstevel@tonic-gate 		if (PP_ISFREE(pp)) {
924*7c478bd9Sstevel@tonic-gate 			if ((flags & B_INVAL) == 0) {
925*7c478bd9Sstevel@tonic-gate 				page_unlock(pp);
926*7c478bd9Sstevel@tonic-gate 				continue;
927*7c478bd9Sstevel@tonic-gate 			}
928*7c478bd9Sstevel@tonic-gate 
929*7c478bd9Sstevel@tonic-gate 			/*
930*7c478bd9Sstevel@tonic-gate 			 * Invalidate (destroy) the page.
931*7c478bd9Sstevel@tonic-gate 			 */
932*7c478bd9Sstevel@tonic-gate 			mutex_exit(vphm);
933*7c478bd9Sstevel@tonic-gate 			page_destroy_free(pp);
934*7c478bd9Sstevel@tonic-gate 			mutex_enter(vphm);
935*7c478bd9Sstevel@tonic-gate 			continue;
936*7c478bd9Sstevel@tonic-gate 		}
937*7c478bd9Sstevel@tonic-gate 
938*7c478bd9Sstevel@tonic-gate 		/*
939*7c478bd9Sstevel@tonic-gate 		 * pvn_getdirty() figures out what do do with a dirty page.
940*7c478bd9Sstevel@tonic-gate 		 * If the page is dirty, the putapage() routine will write it
941*7c478bd9Sstevel@tonic-gate 		 * and will kluster any other adjacent dirty pages it can.
942*7c478bd9Sstevel@tonic-gate 		 *
943*7c478bd9Sstevel@tonic-gate 		 * pvn_getdirty() and `(*putapage)' unlock the page.
944*7c478bd9Sstevel@tonic-gate 		 */
945*7c478bd9Sstevel@tonic-gate 		mutex_exit(vphm);
946*7c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags)) {
947*7c478bd9Sstevel@tonic-gate 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
948*7c478bd9Sstevel@tonic-gate 			if (!err)
949*7c478bd9Sstevel@tonic-gate 				err = error;
950*7c478bd9Sstevel@tonic-gate 		}
951*7c478bd9Sstevel@tonic-gate 		mutex_enter(vphm);
952*7c478bd9Sstevel@tonic-gate 	}
953*7c478bd9Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, mark);
954*7c478bd9Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, end);
955*7c478bd9Sstevel@tonic-gate 
956*7c478bd9Sstevel@tonic-gate leave:
957*7c478bd9Sstevel@tonic-gate 	/*
958*7c478bd9Sstevel@tonic-gate 	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
959*7c478bd9Sstevel@tonic-gate 	 */
960*7c478bd9Sstevel@tonic-gate 	mutex_exit(vphm);
961*7c478bd9Sstevel@tonic-gate 	kmem_cache_free(marker_cache, mark);
962*7c478bd9Sstevel@tonic-gate 	kmem_cache_free(marker_cache, end);
963*7c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
964*7c478bd9Sstevel@tonic-gate 	vp->v_flag &= ~VVMLOCK;
965*7c478bd9Sstevel@tonic-gate 	cv_broadcast(&vp->v_cv);
966*7c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
967*7c478bd9Sstevel@tonic-gate 	return (err);
968*7c478bd9Sstevel@tonic-gate }
969*7c478bd9Sstevel@tonic-gate 
970*7c478bd9Sstevel@tonic-gate /*
971*7c478bd9Sstevel@tonic-gate  * Zero out zbytes worth of data. Caller should be aware that this
972*7c478bd9Sstevel@tonic-gate  * routine may enter back into the fs layer (xxx_getpage). Locks
973*7c478bd9Sstevel@tonic-gate  * that the xxx_getpage routine may need should not be held while
974*7c478bd9Sstevel@tonic-gate  * calling this.
975*7c478bd9Sstevel@tonic-gate  */
976*7c478bd9Sstevel@tonic-gate void
977*7c478bd9Sstevel@tonic-gate pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
978*7c478bd9Sstevel@tonic-gate {
979*7c478bd9Sstevel@tonic-gate 	caddr_t addr;
980*7c478bd9Sstevel@tonic-gate 
981*7c478bd9Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
982*7c478bd9Sstevel@tonic-gate 
983*7c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
984*7c478bd9Sstevel@tonic-gate 		return;
985*7c478bd9Sstevel@tonic-gate 
986*7c478bd9Sstevel@tonic-gate 	/*
987*7c478bd9Sstevel@tonic-gate 	 * zbytes may be zero but there still may be some portion of
988*7c478bd9Sstevel@tonic-gate 	 * a page which needs clearing (since zbytes is a function
989*7c478bd9Sstevel@tonic-gate 	 * of filesystem block size, not pagesize.)
990*7c478bd9Sstevel@tonic-gate 	 */
991*7c478bd9Sstevel@tonic-gate 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
992*7c478bd9Sstevel@tonic-gate 		return;
993*7c478bd9Sstevel@tonic-gate 
994*7c478bd9Sstevel@tonic-gate 	/*
995*7c478bd9Sstevel@tonic-gate 	 * We get the last page and handle the partial
996*7c478bd9Sstevel@tonic-gate 	 * zeroing via kernel mappings.  This will make the page
997*7c478bd9Sstevel@tonic-gate 	 * dirty so that we know that when this page is written
998*7c478bd9Sstevel@tonic-gate 	 * back, the zeroed information will go out with it.  If
999*7c478bd9Sstevel@tonic-gate 	 * the page is not currently in memory, then the kzero
1000*7c478bd9Sstevel@tonic-gate 	 * operation will cause it to be brought it.  We use kzero
1001*7c478bd9Sstevel@tonic-gate 	 * instead of bzero so that if the page cannot be read in
1002*7c478bd9Sstevel@tonic-gate 	 * for any reason, the system will not panic.  We need
1003*7c478bd9Sstevel@tonic-gate 	 * to zero out a minimum of the fs given zbytes, but we
1004*7c478bd9Sstevel@tonic-gate 	 * might also have to do more to get the entire last page.
1005*7c478bd9Sstevel@tonic-gate 	 */
1006*7c478bd9Sstevel@tonic-gate 
1007*7c478bd9Sstevel@tonic-gate 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1008*7c478bd9Sstevel@tonic-gate 		panic("pvn_vptrunc zbytes");
1009*7c478bd9Sstevel@tonic-gate 	addr = segmap_getmapflt(segkmap, vp, vplen,
1010*7c478bd9Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1011*7c478bd9Sstevel@tonic-gate 	(void) kzero(addr + (vplen & MAXBOFFSET),
1012*7c478bd9Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1013*7c478bd9Sstevel@tonic-gate 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1014*7c478bd9Sstevel@tonic-gate }
1015*7c478bd9Sstevel@tonic-gate 
1016*7c478bd9Sstevel@tonic-gate /*
1017*7c478bd9Sstevel@tonic-gate  * Handles common work of the VOP_GETPAGE routines when more than
1018*7c478bd9Sstevel@tonic-gate  * one page must be returned by calling a file system specific operation
1019*7c478bd9Sstevel@tonic-gate  * to do most of the work.  Must be called with the vp already locked
1020*7c478bd9Sstevel@tonic-gate  * by the VOP_GETPAGE routine.
1021*7c478bd9Sstevel@tonic-gate  */
1022*7c478bd9Sstevel@tonic-gate int
1023*7c478bd9Sstevel@tonic-gate pvn_getpages(
1024*7c478bd9Sstevel@tonic-gate 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1025*7c478bd9Sstevel@tonic-gate 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1026*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
1027*7c478bd9Sstevel@tonic-gate 	u_offset_t off,
1028*7c478bd9Sstevel@tonic-gate 	size_t len,
1029*7c478bd9Sstevel@tonic-gate 	uint_t *protp,
1030*7c478bd9Sstevel@tonic-gate 	page_t *pl[],
1031*7c478bd9Sstevel@tonic-gate 	size_t plsz,
1032*7c478bd9Sstevel@tonic-gate 	struct seg *seg,
1033*7c478bd9Sstevel@tonic-gate 	caddr_t addr,
1034*7c478bd9Sstevel@tonic-gate 	enum seg_rw rw,
1035*7c478bd9Sstevel@tonic-gate 	struct cred *cred)
1036*7c478bd9Sstevel@tonic-gate {
1037*7c478bd9Sstevel@tonic-gate 	page_t **ppp;
1038*7c478bd9Sstevel@tonic-gate 	u_offset_t o, eoff;
1039*7c478bd9Sstevel@tonic-gate 	size_t sz, xlen;
1040*7c478bd9Sstevel@tonic-gate 	int err;
1041*7c478bd9Sstevel@tonic-gate 
1042*7c478bd9Sstevel@tonic-gate 	ASSERT(plsz >= len);		/* insure that we have enough space */
1043*7c478bd9Sstevel@tonic-gate 
1044*7c478bd9Sstevel@tonic-gate 	/*
1045*7c478bd9Sstevel@tonic-gate 	 * Loop one page at a time and let getapage function fill
1046*7c478bd9Sstevel@tonic-gate 	 * in the next page in array.  We only allow one page to be
1047*7c478bd9Sstevel@tonic-gate 	 * returned at a time (except for the last page) so that we
1048*7c478bd9Sstevel@tonic-gate 	 * don't have any problems with duplicates and other such
1049*7c478bd9Sstevel@tonic-gate 	 * painful problems.  This is a very simple minded algorithm,
1050*7c478bd9Sstevel@tonic-gate 	 * but it does the job correctly.  We hope that the cost of a
1051*7c478bd9Sstevel@tonic-gate 	 * getapage call for a resident page that we might have been
1052*7c478bd9Sstevel@tonic-gate 	 * able to get from an earlier call doesn't cost too much.
1053*7c478bd9Sstevel@tonic-gate 	 */
1054*7c478bd9Sstevel@tonic-gate 	ppp = pl;
1055*7c478bd9Sstevel@tonic-gate 	sz = PAGESIZE;
1056*7c478bd9Sstevel@tonic-gate 	eoff = off + len;
1057*7c478bd9Sstevel@tonic-gate 	xlen = len;
1058*7c478bd9Sstevel@tonic-gate 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1059*7c478bd9Sstevel@tonic-gate 	    xlen -= PAGESIZE) {
1060*7c478bd9Sstevel@tonic-gate 		if (o + PAGESIZE >= eoff) {
1061*7c478bd9Sstevel@tonic-gate 			/*
1062*7c478bd9Sstevel@tonic-gate 			 * Last time through - allow the all of
1063*7c478bd9Sstevel@tonic-gate 			 * what's left of the pl[] array to be used.
1064*7c478bd9Sstevel@tonic-gate 			 */
1065*7c478bd9Sstevel@tonic-gate 			sz = plsz - (o - off);
1066*7c478bd9Sstevel@tonic-gate 		}
1067*7c478bd9Sstevel@tonic-gate 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1068*7c478bd9Sstevel@tonic-gate 		    rw, cred);
1069*7c478bd9Sstevel@tonic-gate 		if (err) {
1070*7c478bd9Sstevel@tonic-gate 			/*
1071*7c478bd9Sstevel@tonic-gate 			 * Release any pages we already got.
1072*7c478bd9Sstevel@tonic-gate 			 */
1073*7c478bd9Sstevel@tonic-gate 			if (o > off && pl != NULL) {
1074*7c478bd9Sstevel@tonic-gate 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1075*7c478bd9Sstevel@tonic-gate 					(void) page_release(*ppp, 1);
1076*7c478bd9Sstevel@tonic-gate 			}
1077*7c478bd9Sstevel@tonic-gate 			break;
1078*7c478bd9Sstevel@tonic-gate 		}
1079*7c478bd9Sstevel@tonic-gate 		if (pl != NULL)
1080*7c478bd9Sstevel@tonic-gate 			ppp++;
1081*7c478bd9Sstevel@tonic-gate 	}
1082*7c478bd9Sstevel@tonic-gate 	return (err);
1083*7c478bd9Sstevel@tonic-gate }
1084*7c478bd9Sstevel@tonic-gate 
1085*7c478bd9Sstevel@tonic-gate /*
1086*7c478bd9Sstevel@tonic-gate  * Initialize the page list array.
1087*7c478bd9Sstevel@tonic-gate  */
1088*7c478bd9Sstevel@tonic-gate void
1089*7c478bd9Sstevel@tonic-gate pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1090*7c478bd9Sstevel@tonic-gate     u_offset_t off, size_t io_len, enum seg_rw rw)
1091*7c478bd9Sstevel@tonic-gate {
1092*7c478bd9Sstevel@tonic-gate 	ssize_t sz;
1093*7c478bd9Sstevel@tonic-gate 	page_t *ppcur, **ppp;
1094*7c478bd9Sstevel@tonic-gate 
1095*7c478bd9Sstevel@tonic-gate 	if (plsz >= io_len) {
1096*7c478bd9Sstevel@tonic-gate 		/*
1097*7c478bd9Sstevel@tonic-gate 		 * Everything fits, set up to load
1098*7c478bd9Sstevel@tonic-gate 		 * all the pages.
1099*7c478bd9Sstevel@tonic-gate 		 */
1100*7c478bd9Sstevel@tonic-gate 		sz = io_len;
1101*7c478bd9Sstevel@tonic-gate 	} else {
1102*7c478bd9Sstevel@tonic-gate 		/*
1103*7c478bd9Sstevel@tonic-gate 		 * Set up to load plsz worth
1104*7c478bd9Sstevel@tonic-gate 		 * starting at the needed page.
1105*7c478bd9Sstevel@tonic-gate 		 */
1106*7c478bd9Sstevel@tonic-gate 		while (pp->p_offset != off) {
1107*7c478bd9Sstevel@tonic-gate 			/* XXX - Do we need this assert? */
1108*7c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_next->p_offset !=
1109*7c478bd9Sstevel@tonic-gate 			    pp->p_offset);
1110*7c478bd9Sstevel@tonic-gate 			/*
1111*7c478bd9Sstevel@tonic-gate 			 * Remove page from the i/o list,
1112*7c478bd9Sstevel@tonic-gate 			 * release the i/o and the page lock.
1113*7c478bd9Sstevel@tonic-gate 			 */
1114*7c478bd9Sstevel@tonic-gate 			ppcur = pp;
1115*7c478bd9Sstevel@tonic-gate 			page_sub(&pp, ppcur);
1116*7c478bd9Sstevel@tonic-gate 			page_io_unlock(ppcur);
1117*7c478bd9Sstevel@tonic-gate 			(void) page_release(ppcur, 1);
1118*7c478bd9Sstevel@tonic-gate 		}
1119*7c478bd9Sstevel@tonic-gate 		sz = plsz;
1120*7c478bd9Sstevel@tonic-gate 	}
1121*7c478bd9Sstevel@tonic-gate 
1122*7c478bd9Sstevel@tonic-gate 	/*
1123*7c478bd9Sstevel@tonic-gate 	 * Initialize the page list array.
1124*7c478bd9Sstevel@tonic-gate 	 */
1125*7c478bd9Sstevel@tonic-gate 	ppp = pl;
1126*7c478bd9Sstevel@tonic-gate 	do {
1127*7c478bd9Sstevel@tonic-gate 		ppcur = pp;
1128*7c478bd9Sstevel@tonic-gate 		*ppp++ = ppcur;
1129*7c478bd9Sstevel@tonic-gate 		page_sub(&pp, ppcur);
1130*7c478bd9Sstevel@tonic-gate 		page_io_unlock(ppcur);
1131*7c478bd9Sstevel@tonic-gate 		if (rw != S_CREATE)
1132*7c478bd9Sstevel@tonic-gate 			page_downgrade(ppcur);
1133*7c478bd9Sstevel@tonic-gate 		sz -= PAGESIZE;
1134*7c478bd9Sstevel@tonic-gate 	} while (sz > 0 && pp != NULL);
1135*7c478bd9Sstevel@tonic-gate 	*ppp = NULL;		/* terminate list */
1136*7c478bd9Sstevel@tonic-gate 
1137*7c478bd9Sstevel@tonic-gate 	/*
1138*7c478bd9Sstevel@tonic-gate 	 * Now free the remaining pages that weren't
1139*7c478bd9Sstevel@tonic-gate 	 * loaded in the page list.
1140*7c478bd9Sstevel@tonic-gate 	 */
1141*7c478bd9Sstevel@tonic-gate 	while (pp != NULL) {
1142*7c478bd9Sstevel@tonic-gate 		ppcur = pp;
1143*7c478bd9Sstevel@tonic-gate 		page_sub(&pp, ppcur);
1144*7c478bd9Sstevel@tonic-gate 		page_io_unlock(ppcur);
1145*7c478bd9Sstevel@tonic-gate 		(void) page_release(ppcur, 1);
1146*7c478bd9Sstevel@tonic-gate 	}
1147*7c478bd9Sstevel@tonic-gate }
1148