xref: /illumos-gate/usr/src/uts/common/vm/vm_pvn.c (revision a71e32b6)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*a71e32b6Sstans  * Common Development and Distribution License (the "License").
6*a71e32b6Sstans  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*a71e32b6Sstans  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
277c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
287c478bd9Sstevel@tonic-gate 
297c478bd9Sstevel@tonic-gate /*
307c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
317c478bd9Sstevel@tonic-gate  * The Regents of the University of California
327c478bd9Sstevel@tonic-gate  * All Rights Reserved
337c478bd9Sstevel@tonic-gate  *
347c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
357c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
367c478bd9Sstevel@tonic-gate  * contributors.
377c478bd9Sstevel@tonic-gate  */
387c478bd9Sstevel@tonic-gate 
397c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
407c478bd9Sstevel@tonic-gate 
417c478bd9Sstevel@tonic-gate /*
427c478bd9Sstevel@tonic-gate  * VM - paged vnode.
437c478bd9Sstevel@tonic-gate  *
447c478bd9Sstevel@tonic-gate  * This file supplies vm support for the vnode operations that deal with pages.
457c478bd9Sstevel@tonic-gate  */
467c478bd9Sstevel@tonic-gate #include <sys/types.h>
477c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
487c478bd9Sstevel@tonic-gate #include <sys/param.h>
497c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
507c478bd9Sstevel@tonic-gate #include <sys/systm.h>
517c478bd9Sstevel@tonic-gate #include <sys/time.h>
527c478bd9Sstevel@tonic-gate #include <sys/buf.h>
537c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
547c478bd9Sstevel@tonic-gate #include <sys/uio.h>
557c478bd9Sstevel@tonic-gate #include <sys/vmmeter.h>
567c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>
577c478bd9Sstevel@tonic-gate #include <sys/mman.h>
587c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
597c478bd9Sstevel@tonic-gate #include <sys/cred.h>
607c478bd9Sstevel@tonic-gate #include <sys/user.h>
617c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
627c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
637c478bd9Sstevel@tonic-gate #include <sys/debug.h>
647c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
657c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
667c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
677c478bd9Sstevel@tonic-gate 
687c478bd9Sstevel@tonic-gate #include <vm/hat.h>
697c478bd9Sstevel@tonic-gate #include <vm/as.h>
707c478bd9Sstevel@tonic-gate #include <vm/seg.h>
717c478bd9Sstevel@tonic-gate #include <vm/rm.h>
727c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
737c478bd9Sstevel@tonic-gate #include <vm/page.h>
747c478bd9Sstevel@tonic-gate #include <vm/seg_map.h>
757c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
767c478bd9Sstevel@tonic-gate #include <sys/fs/swapnode.h>
777c478bd9Sstevel@tonic-gate 
787c478bd9Sstevel@tonic-gate int pvn_nofodklust = 0;
797c478bd9Sstevel@tonic-gate int pvn_write_noklust = 0;
807c478bd9Sstevel@tonic-gate 
817c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
827c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
837c478bd9Sstevel@tonic-gate 					/* support for vmodsort for testing */
847c478bd9Sstevel@tonic-gate 
857c478bd9Sstevel@tonic-gate static struct kmem_cache *marker_cache = NULL;
867c478bd9Sstevel@tonic-gate 
877c478bd9Sstevel@tonic-gate /*
887c478bd9Sstevel@tonic-gate  * Find the largest contiguous block which contains `addr' for file offset
897c478bd9Sstevel@tonic-gate  * `offset' in it while living within the file system block sizes (`vp_off'
907c478bd9Sstevel@tonic-gate  * and `vp_len') and the address space limits for which no pages currently
917c478bd9Sstevel@tonic-gate  * exist and which map to consecutive file offsets.
927c478bd9Sstevel@tonic-gate  */
937c478bd9Sstevel@tonic-gate page_t *
947c478bd9Sstevel@tonic-gate pvn_read_kluster(
957c478bd9Sstevel@tonic-gate 	struct vnode *vp,
967c478bd9Sstevel@tonic-gate 	u_offset_t off,
977c478bd9Sstevel@tonic-gate 	struct seg *seg,
987c478bd9Sstevel@tonic-gate 	caddr_t addr,
997c478bd9Sstevel@tonic-gate 	u_offset_t *offp,			/* return values */
1007c478bd9Sstevel@tonic-gate 	size_t *lenp,				/* return values */
1017c478bd9Sstevel@tonic-gate 	u_offset_t vp_off,
1027c478bd9Sstevel@tonic-gate 	size_t vp_len,
1037c478bd9Sstevel@tonic-gate 	int isra)
1047c478bd9Sstevel@tonic-gate {
1057c478bd9Sstevel@tonic-gate 	ssize_t deltaf, deltab;
1067c478bd9Sstevel@tonic-gate 	page_t *pp;
1077c478bd9Sstevel@tonic-gate 	page_t *plist = NULL;
1087c478bd9Sstevel@tonic-gate 	spgcnt_t pagesavail;
1097c478bd9Sstevel@tonic-gate 	u_offset_t vp_end;
1107c478bd9Sstevel@tonic-gate 
1117c478bd9Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_off + vp_len);
1127c478bd9Sstevel@tonic-gate 
1137c478bd9Sstevel@tonic-gate 	/*
1147c478bd9Sstevel@tonic-gate 	 * We only want to do klustering/read ahead if there
1157c478bd9Sstevel@tonic-gate 	 * is more than minfree pages currently available.
1167c478bd9Sstevel@tonic-gate 	 */
1177c478bd9Sstevel@tonic-gate 	pagesavail = freemem - minfree;
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate 	if (pagesavail <= 0)
1207c478bd9Sstevel@tonic-gate 		if (isra)
1217c478bd9Sstevel@tonic-gate 			return ((page_t *)NULL);    /* ra case - give up */
1227c478bd9Sstevel@tonic-gate 		else
1237c478bd9Sstevel@tonic-gate 			pagesavail = 1;		    /* must return a page */
1247c478bd9Sstevel@tonic-gate 
1257c478bd9Sstevel@tonic-gate 	/* We calculate in pages instead of bytes due to 32-bit overflows */
1267c478bd9Sstevel@tonic-gate 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
1277c478bd9Sstevel@tonic-gate 		/*
1287c478bd9Sstevel@tonic-gate 		 * Don't have enough free memory for the
1297c478bd9Sstevel@tonic-gate 		 * max request, try sizing down vp request.
1307c478bd9Sstevel@tonic-gate 		 */
1317c478bd9Sstevel@tonic-gate 		deltab = (ssize_t)(off - vp_off);
1327c478bd9Sstevel@tonic-gate 		vp_len -= deltab;
1337c478bd9Sstevel@tonic-gate 		vp_off += deltab;
1347c478bd9Sstevel@tonic-gate 		if (pagesavail < btopr(vp_len)) {
1357c478bd9Sstevel@tonic-gate 			/*
1367c478bd9Sstevel@tonic-gate 			 * Still not enough memory, just settle for
1377c478bd9Sstevel@tonic-gate 			 * pagesavail which is at least 1.
1387c478bd9Sstevel@tonic-gate 			 */
1397c478bd9Sstevel@tonic-gate 			vp_len = ptob(pagesavail);
1407c478bd9Sstevel@tonic-gate 		}
1417c478bd9Sstevel@tonic-gate 	}
1427c478bd9Sstevel@tonic-gate 
1437c478bd9Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
1447c478bd9Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_end);
1457c478bd9Sstevel@tonic-gate 
1467c478bd9Sstevel@tonic-gate 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
1477c478bd9Sstevel@tonic-gate 		return ((page_t *)NULL);	/* segment driver says no */
1487c478bd9Sstevel@tonic-gate 
1497c478bd9Sstevel@tonic-gate 	if ((plist = page_create_va(vp, off,
1507c478bd9Sstevel@tonic-gate 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
1517c478bd9Sstevel@tonic-gate 		return ((page_t *)NULL);
1527c478bd9Sstevel@tonic-gate 
1537c478bd9Sstevel@tonic-gate 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
1547c478bd9Sstevel@tonic-gate 		*offp = off;
1557c478bd9Sstevel@tonic-gate 		*lenp = MIN(vp_len, PAGESIZE);
1567c478bd9Sstevel@tonic-gate 	} else {
1577c478bd9Sstevel@tonic-gate 		/*
1587c478bd9Sstevel@tonic-gate 		 * Scan back from front by incrementing "deltab" and
1597c478bd9Sstevel@tonic-gate 		 * comparing "off" with "vp_off + deltab" to avoid
1607c478bd9Sstevel@tonic-gate 		 * "signed" versus "unsigned" conversion problems.
1617c478bd9Sstevel@tonic-gate 		 */
1627c478bd9Sstevel@tonic-gate 		for (deltab = PAGESIZE; off >= vp_off + deltab;
1637c478bd9Sstevel@tonic-gate 		    deltab += PAGESIZE) {
1647c478bd9Sstevel@tonic-gate 			/*
1657c478bd9Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
1667c478bd9Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
1677c478bd9Sstevel@tonic-gate 			 */
1687c478bd9Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, -deltab))
1697c478bd9Sstevel@tonic-gate 				break;		/* page not eligible */
1707c478bd9Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off - deltab,
1717c478bd9Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
1727c478bd9Sstevel@tonic-gate 			    == NULL)
1737c478bd9Sstevel@tonic-gate 				break;		/* already have the page */
1747c478bd9Sstevel@tonic-gate 			/*
1757c478bd9Sstevel@tonic-gate 			 * Add page to front of page list.
1767c478bd9Sstevel@tonic-gate 			 */
1777c478bd9Sstevel@tonic-gate 			page_add(&plist, pp);
1787c478bd9Sstevel@tonic-gate 		}
1797c478bd9Sstevel@tonic-gate 		deltab -= PAGESIZE;
1807c478bd9Sstevel@tonic-gate 
1817c478bd9Sstevel@tonic-gate 		/* scan forward from front */
1827c478bd9Sstevel@tonic-gate 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
1837c478bd9Sstevel@tonic-gate 		    deltaf += PAGESIZE) {
1847c478bd9Sstevel@tonic-gate 			/*
1857c478bd9Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
1867c478bd9Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
1877c478bd9Sstevel@tonic-gate 			 */
1887c478bd9Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, deltaf))
1897c478bd9Sstevel@tonic-gate 				break;		/* page not file extension */
1907c478bd9Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off + deltaf,
1917c478bd9Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
1927c478bd9Sstevel@tonic-gate 			    == NULL)
1937c478bd9Sstevel@tonic-gate 				break;		/* already have page */
1947c478bd9Sstevel@tonic-gate 
1957c478bd9Sstevel@tonic-gate 			/*
1967c478bd9Sstevel@tonic-gate 			 * Add page to end of page list.
1977c478bd9Sstevel@tonic-gate 			 */
1987c478bd9Sstevel@tonic-gate 			page_add(&plist, pp);
1997c478bd9Sstevel@tonic-gate 			plist = plist->p_next;
2007c478bd9Sstevel@tonic-gate 		}
2017c478bd9Sstevel@tonic-gate 		*offp = off = off - deltab;
2027c478bd9Sstevel@tonic-gate 		*lenp = deltab + deltaf;
2037c478bd9Sstevel@tonic-gate 		ASSERT(off >= vp_off);
2047c478bd9Sstevel@tonic-gate 
2057c478bd9Sstevel@tonic-gate 		/*
2067c478bd9Sstevel@tonic-gate 		 * If we ended up getting more than was actually
2077c478bd9Sstevel@tonic-gate 		 * requested, retract the returned length to only
2087c478bd9Sstevel@tonic-gate 		 * reflect what was requested.  This might happen
2097c478bd9Sstevel@tonic-gate 		 * if we were allowed to kluster pages across a
2107c478bd9Sstevel@tonic-gate 		 * span of (say) 5 frags, and frag size is less
2117c478bd9Sstevel@tonic-gate 		 * than PAGESIZE.  We need a whole number of
2127c478bd9Sstevel@tonic-gate 		 * pages to contain those frags, but the returned
2137c478bd9Sstevel@tonic-gate 		 * size should only allow the returned range to
2147c478bd9Sstevel@tonic-gate 		 * extend as far as the end of the frags.
2157c478bd9Sstevel@tonic-gate 		 */
2167c478bd9Sstevel@tonic-gate 		if ((vp_off + vp_len) < (off + *lenp)) {
2177c478bd9Sstevel@tonic-gate 			ASSERT(vp_end > off);
2187c478bd9Sstevel@tonic-gate 			*lenp = vp_end - off;
2197c478bd9Sstevel@tonic-gate 		}
2207c478bd9Sstevel@tonic-gate 	}
2217c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
2227c478bd9Sstevel@tonic-gate 		"pvn_read_kluster:seg %p addr %x isra %x",
2237c478bd9Sstevel@tonic-gate 		seg, addr, isra);
2247c478bd9Sstevel@tonic-gate 	return (plist);
2257c478bd9Sstevel@tonic-gate }
2267c478bd9Sstevel@tonic-gate 
2277c478bd9Sstevel@tonic-gate /*
2287c478bd9Sstevel@tonic-gate  * Handle pages for this vnode on either side of the page "pp"
2297c478bd9Sstevel@tonic-gate  * which has been locked by the caller.  This routine will also
2307c478bd9Sstevel@tonic-gate  * do klustering in the range [vp_off, vp_off + vp_len] up
2317c478bd9Sstevel@tonic-gate  * until a page which is not found.  The offset and length
2327c478bd9Sstevel@tonic-gate  * of pages included is returned in "*offp" and "*lenp".
2337c478bd9Sstevel@tonic-gate  *
2347c478bd9Sstevel@tonic-gate  * Returns a list of dirty locked pages all ready to be
2357c478bd9Sstevel@tonic-gate  * written back.
2367c478bd9Sstevel@tonic-gate  */
2377c478bd9Sstevel@tonic-gate page_t *
2387c478bd9Sstevel@tonic-gate pvn_write_kluster(
2397c478bd9Sstevel@tonic-gate 	struct vnode *vp,
2407c478bd9Sstevel@tonic-gate 	page_t *pp,
2417c478bd9Sstevel@tonic-gate 	u_offset_t *offp,		/* return values */
2427c478bd9Sstevel@tonic-gate 	size_t *lenp,			/* return values */
2437c478bd9Sstevel@tonic-gate 	u_offset_t vp_off,
2447c478bd9Sstevel@tonic-gate 	size_t vp_len,
2457c478bd9Sstevel@tonic-gate 	int flags)
2467c478bd9Sstevel@tonic-gate {
2477c478bd9Sstevel@tonic-gate 	u_offset_t off;
2487c478bd9Sstevel@tonic-gate 	page_t *dirty;
2497c478bd9Sstevel@tonic-gate 	size_t deltab, deltaf;
2507c478bd9Sstevel@tonic-gate 	se_t se;
2517c478bd9Sstevel@tonic-gate 	u_offset_t vp_end;
2527c478bd9Sstevel@tonic-gate 
2537c478bd9Sstevel@tonic-gate 	off = pp->p_offset;
2547c478bd9Sstevel@tonic-gate 
2557c478bd9Sstevel@tonic-gate 	/*
2567c478bd9Sstevel@tonic-gate 	 * Kustering should not be done if we are invalidating
2577c478bd9Sstevel@tonic-gate 	 * pages since we could destroy pages that belong to
2587c478bd9Sstevel@tonic-gate 	 * some other process if this is a swap vnode.
2597c478bd9Sstevel@tonic-gate 	 */
2607c478bd9Sstevel@tonic-gate 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
2617c478bd9Sstevel@tonic-gate 		*offp = off;
2627c478bd9Sstevel@tonic-gate 		*lenp = PAGESIZE;
2637c478bd9Sstevel@tonic-gate 		return (pp);
2647c478bd9Sstevel@tonic-gate 	}
2657c478bd9Sstevel@tonic-gate 
2667c478bd9Sstevel@tonic-gate 	if (flags & (B_FREE | B_INVAL))
2677c478bd9Sstevel@tonic-gate 		se = SE_EXCL;
2687c478bd9Sstevel@tonic-gate 	else
2697c478bd9Sstevel@tonic-gate 		se = SE_SHARED;
2707c478bd9Sstevel@tonic-gate 
2717c478bd9Sstevel@tonic-gate 	dirty = pp;
2727c478bd9Sstevel@tonic-gate 	/*
2737c478bd9Sstevel@tonic-gate 	 * Scan backwards looking for pages to kluster by incrementing
2747c478bd9Sstevel@tonic-gate 	 * "deltab" and comparing "off" with "vp_off + deltab" to
2757c478bd9Sstevel@tonic-gate 	 * avoid "signed" versus "unsigned" conversion problems.
2767c478bd9Sstevel@tonic-gate 	 */
2777c478bd9Sstevel@tonic-gate 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
2787c478bd9Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off - deltab, se);
2797c478bd9Sstevel@tonic-gate 		if (pp == NULL)
2807c478bd9Sstevel@tonic-gate 			break;		/* page not found */
2817c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2827c478bd9Sstevel@tonic-gate 			break;
2837c478bd9Sstevel@tonic-gate 		page_add(&dirty, pp);
2847c478bd9Sstevel@tonic-gate 	}
2857c478bd9Sstevel@tonic-gate 	deltab -= PAGESIZE;
2867c478bd9Sstevel@tonic-gate 
2877c478bd9Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
2887c478bd9Sstevel@tonic-gate 	/* now scan forwards looking for pages to kluster */
2897c478bd9Sstevel@tonic-gate 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
2907c478bd9Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off + deltaf, se);
2917c478bd9Sstevel@tonic-gate 		if (pp == NULL)
2927c478bd9Sstevel@tonic-gate 			break;		/* page not found */
2937c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2947c478bd9Sstevel@tonic-gate 			break;
2957c478bd9Sstevel@tonic-gate 		page_add(&dirty, pp);
2967c478bd9Sstevel@tonic-gate 		dirty = dirty->p_next;
2977c478bd9Sstevel@tonic-gate 	}
2987c478bd9Sstevel@tonic-gate 
2997c478bd9Sstevel@tonic-gate 	*offp = off - deltab;
3007c478bd9Sstevel@tonic-gate 	*lenp = deltab + deltaf;
3017c478bd9Sstevel@tonic-gate 	return (dirty);
3027c478bd9Sstevel@tonic-gate }
3037c478bd9Sstevel@tonic-gate 
3047c478bd9Sstevel@tonic-gate /*
3057c478bd9Sstevel@tonic-gate  * Generic entry point used to release the "shared/exclusive" lock
3067c478bd9Sstevel@tonic-gate  * and the "p_iolock" on pages after i/o is complete.
3077c478bd9Sstevel@tonic-gate  */
3087c478bd9Sstevel@tonic-gate void
3097c478bd9Sstevel@tonic-gate pvn_io_done(page_t *plist)
3107c478bd9Sstevel@tonic-gate {
3117c478bd9Sstevel@tonic-gate 	page_t *pp;
3127c478bd9Sstevel@tonic-gate 
3137c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
3147c478bd9Sstevel@tonic-gate 		pp = plist;
3157c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
3167c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
3177c478bd9Sstevel@tonic-gate 		page_unlock(pp);
3187c478bd9Sstevel@tonic-gate 	}
3197c478bd9Sstevel@tonic-gate }
3207c478bd9Sstevel@tonic-gate 
3217c478bd9Sstevel@tonic-gate /*
3227c478bd9Sstevel@tonic-gate  * Entry point to be used by file system getpage subr's and
3237c478bd9Sstevel@tonic-gate  * other such routines which either want to unlock pages (B_ASYNC
3247c478bd9Sstevel@tonic-gate  * request) or destroy a list of pages if an error occurred.
3257c478bd9Sstevel@tonic-gate  */
3267c478bd9Sstevel@tonic-gate void
3277c478bd9Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags)
3287c478bd9Sstevel@tonic-gate {
3297c478bd9Sstevel@tonic-gate 	page_t *pp;
3307c478bd9Sstevel@tonic-gate 
3317c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
3327c478bd9Sstevel@tonic-gate 		pp = plist;
3337c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
3347c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
3357c478bd9Sstevel@tonic-gate 		if (flags & B_ERROR) {
3367c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
3377c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
3387c478bd9Sstevel@tonic-gate 		} else {
3397c478bd9Sstevel@tonic-gate 			(void) page_release(pp, 0);
3407c478bd9Sstevel@tonic-gate 		}
3417c478bd9Sstevel@tonic-gate 	}
3427c478bd9Sstevel@tonic-gate }
3437c478bd9Sstevel@tonic-gate 
3447c478bd9Sstevel@tonic-gate /*
3457c478bd9Sstevel@tonic-gate  * Automagic pageout.
3467c478bd9Sstevel@tonic-gate  * When memory gets tight, start freeing pages popping out of the
3477c478bd9Sstevel@tonic-gate  * write queue.
3487c478bd9Sstevel@tonic-gate  */
3497c478bd9Sstevel@tonic-gate int	write_free = 1;
3507c478bd9Sstevel@tonic-gate pgcnt_t	pages_before_pager = 200;	/* LMXXX */
3517c478bd9Sstevel@tonic-gate 
3527c478bd9Sstevel@tonic-gate /*
3537c478bd9Sstevel@tonic-gate  * Routine to be called when page-out's complete.
3547c478bd9Sstevel@tonic-gate  * The caller, typically VOP_PUTPAGE, has to explicity call this routine
3557c478bd9Sstevel@tonic-gate  * after waiting for i/o to complete (biowait) to free the list of
3567c478bd9Sstevel@tonic-gate  * pages associated with the buffer.  These pages must be locked
3577c478bd9Sstevel@tonic-gate  * before i/o is initiated.
3587c478bd9Sstevel@tonic-gate  *
3597c478bd9Sstevel@tonic-gate  * If a write error occurs, the pages are marked as modified
3607c478bd9Sstevel@tonic-gate  * so the write will be re-tried later.
3617c478bd9Sstevel@tonic-gate  */
3627c478bd9Sstevel@tonic-gate 
3637c478bd9Sstevel@tonic-gate void
3647c478bd9Sstevel@tonic-gate pvn_write_done(page_t *plist, int flags)
3657c478bd9Sstevel@tonic-gate {
3667c478bd9Sstevel@tonic-gate 	int dfree = 0;
3677c478bd9Sstevel@tonic-gate 	int pgrec = 0;
3687c478bd9Sstevel@tonic-gate 	int pgout = 0;
3697c478bd9Sstevel@tonic-gate 	int pgpgout = 0;
3707c478bd9Sstevel@tonic-gate 	int anonpgout = 0;
3717c478bd9Sstevel@tonic-gate 	int anonfree = 0;
3727c478bd9Sstevel@tonic-gate 	int fspgout = 0;
3737c478bd9Sstevel@tonic-gate 	int fsfree = 0;
3747c478bd9Sstevel@tonic-gate 	int execpgout = 0;
3757c478bd9Sstevel@tonic-gate 	int execfree = 0;
3767c478bd9Sstevel@tonic-gate 	page_t *pp;
3777c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
3787c478bd9Sstevel@tonic-gate 	struct vnode *vp = NULL;	/* for probe */
3797c478bd9Sstevel@tonic-gate 	uint_t ppattr;
380*a71e32b6Sstans 	kmutex_t *vphm = NULL;
3817c478bd9Sstevel@tonic-gate 
3827c478bd9Sstevel@tonic-gate 	ASSERT((flags & B_READ) == 0);
3837c478bd9Sstevel@tonic-gate 
3847c478bd9Sstevel@tonic-gate 	/*
3857c478bd9Sstevel@tonic-gate 	 * If we are about to start paging anyway, start freeing pages.
3867c478bd9Sstevel@tonic-gate 	 */
3877c478bd9Sstevel@tonic-gate 	if (write_free && freemem < lotsfree + pages_before_pager &&
3887c478bd9Sstevel@tonic-gate 	    (flags & B_ERROR) == 0) {
3897c478bd9Sstevel@tonic-gate 		flags |= B_FREE;
3907c478bd9Sstevel@tonic-gate 	}
3917c478bd9Sstevel@tonic-gate 
3927c478bd9Sstevel@tonic-gate 	/*
3937c478bd9Sstevel@tonic-gate 	 * Handle each page involved in the i/o operation.
3947c478bd9Sstevel@tonic-gate 	 */
3957c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
3967c478bd9Sstevel@tonic-gate 		pp = plist;
3977c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
3987c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
3997c478bd9Sstevel@tonic-gate 
4007c478bd9Sstevel@tonic-gate 		/* Kernel probe support */
4017c478bd9Sstevel@tonic-gate 		if (vp == NULL)
4027c478bd9Sstevel@tonic-gate 			vp = pp->p_vnode;
4037c478bd9Sstevel@tonic-gate 
404*a71e32b6Sstans 		if (IS_VMODSORT(vp)) {
405*a71e32b6Sstans 			/*
406*a71e32b6Sstans 			 * Move page to the top of the v_page list.
407*a71e32b6Sstans 			 * Skip pages modified during IO.
408*a71e32b6Sstans 			 */
409*a71e32b6Sstans 			vphm = page_vnode_mutex(vp);
410*a71e32b6Sstans 			mutex_enter(vphm);
411*a71e32b6Sstans 			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
412*a71e32b6Sstans 				page_vpsub(&vp->v_pages, pp);
413*a71e32b6Sstans 				page_vpadd(&vp->v_pages, pp);
414*a71e32b6Sstans 			}
415*a71e32b6Sstans 			mutex_exit(vphm);
416*a71e32b6Sstans 		}
417*a71e32b6Sstans 
4187c478bd9Sstevel@tonic-gate 		if (flags & B_ERROR) {
4197c478bd9Sstevel@tonic-gate 			/*
4207c478bd9Sstevel@tonic-gate 			 * Write operation failed.  We don't want
4217c478bd9Sstevel@tonic-gate 			 * to destroy (or free) the page unless B_FORCE
4227c478bd9Sstevel@tonic-gate 			 * is set. We set the mod bit again and release
4237c478bd9Sstevel@tonic-gate 			 * all locks on the page so that it will get written
4247c478bd9Sstevel@tonic-gate 			 * back again later when things are hopefully
4257c478bd9Sstevel@tonic-gate 			 * better again.
4267c478bd9Sstevel@tonic-gate 			 * If B_INVAL and B_FORCE is set we really have
4277c478bd9Sstevel@tonic-gate 			 * to destroy the page.
4287c478bd9Sstevel@tonic-gate 			 */
4297c478bd9Sstevel@tonic-gate 			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
4307c478bd9Sstevel@tonic-gate 				page_io_unlock(pp);
4317c478bd9Sstevel@tonic-gate 				/*LINTED: constant in conditional context*/
4327c478bd9Sstevel@tonic-gate 				VN_DISPOSE(pp, B_INVAL, 0, kcred);
4337c478bd9Sstevel@tonic-gate 			} else {
4347c478bd9Sstevel@tonic-gate 				hat_setmod(pp);
4357c478bd9Sstevel@tonic-gate 				page_io_unlock(pp);
4367c478bd9Sstevel@tonic-gate 				page_unlock(pp);
4377c478bd9Sstevel@tonic-gate 			}
4387c478bd9Sstevel@tonic-gate 		} else if (flags & B_INVAL) {
4397c478bd9Sstevel@tonic-gate 			/*
4407c478bd9Sstevel@tonic-gate 			 * XXX - Failed writes with B_INVAL set are
4417c478bd9Sstevel@tonic-gate 			 * not handled appropriately.
4427c478bd9Sstevel@tonic-gate 			 */
4437c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
4447c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
4457c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
4467c478bd9Sstevel@tonic-gate 		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
4477c478bd9Sstevel@tonic-gate 			/*
4487c478bd9Sstevel@tonic-gate 			 * Update statistics for pages being paged out
4497c478bd9Sstevel@tonic-gate 			 */
4507c478bd9Sstevel@tonic-gate 			if (pp->p_vnode) {
4517c478bd9Sstevel@tonic-gate 				if (IS_SWAPFSVP(pp->p_vnode)) {
4527c478bd9Sstevel@tonic-gate 					anonpgout++;
4537c478bd9Sstevel@tonic-gate 				} else {
4547c478bd9Sstevel@tonic-gate 					if (pp->p_vnode->v_flag & VVMEXEC) {
4557c478bd9Sstevel@tonic-gate 						execpgout++;
4567c478bd9Sstevel@tonic-gate 					} else {
4577c478bd9Sstevel@tonic-gate 						fspgout++;
4587c478bd9Sstevel@tonic-gate 					}
4597c478bd9Sstevel@tonic-gate 				}
4607c478bd9Sstevel@tonic-gate 			}
4617c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
4627c478bd9Sstevel@tonic-gate 			pgout = 1;
4637c478bd9Sstevel@tonic-gate 			pgpgout++;
4647c478bd9Sstevel@tonic-gate 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
4657c478bd9Sstevel@tonic-gate 				"page_ws_out:pp %p", pp);
4667c478bd9Sstevel@tonic-gate 
4677c478bd9Sstevel@tonic-gate 			/*
4687c478bd9Sstevel@tonic-gate 			 * The page_struct_lock need not be acquired to
4697c478bd9Sstevel@tonic-gate 			 * examine "p_lckcnt" and "p_cowcnt" since we'll
4707c478bd9Sstevel@tonic-gate 			 * have an "exclusive" lock if the upgrade succeeds.
4717c478bd9Sstevel@tonic-gate 			 */
4727c478bd9Sstevel@tonic-gate 			if (page_tryupgrade(pp) &&
4737c478bd9Sstevel@tonic-gate 			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
4747c478bd9Sstevel@tonic-gate 				/*
4757c478bd9Sstevel@tonic-gate 				 * Check if someone has reclaimed the
4767c478bd9Sstevel@tonic-gate 				 * page.  If ref and mod are not set, no
4777c478bd9Sstevel@tonic-gate 				 * one is using it so we can free it.
4787c478bd9Sstevel@tonic-gate 				 * The rest of the system is careful
4797c478bd9Sstevel@tonic-gate 				 * to use the NOSYNC flag to unload
4807c478bd9Sstevel@tonic-gate 				 * translations set up for i/o w/o
4817c478bd9Sstevel@tonic-gate 				 * affecting ref and mod bits.
4827c478bd9Sstevel@tonic-gate 				 *
4837c478bd9Sstevel@tonic-gate 				 * Obtain a copy of the real hardware
4847c478bd9Sstevel@tonic-gate 				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
4857c478bd9Sstevel@tonic-gate 				 * to avoid having to flush the cache.
4867c478bd9Sstevel@tonic-gate 				 */
4877c478bd9Sstevel@tonic-gate 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
4887c478bd9Sstevel@tonic-gate 					HAT_SYNC_STOPON_MOD);
4897c478bd9Sstevel@tonic-gate 			ck_refmod:
4907c478bd9Sstevel@tonic-gate 				if (!(ppattr & (P_REF | P_MOD))) {
4917c478bd9Sstevel@tonic-gate 					if (hat_page_is_mapped(pp)) {
4927c478bd9Sstevel@tonic-gate 						/*
4937c478bd9Sstevel@tonic-gate 						 * Doesn't look like the page
4947c478bd9Sstevel@tonic-gate 						 * was modified so now we
4957c478bd9Sstevel@tonic-gate 						 * really have to unload the
4967c478bd9Sstevel@tonic-gate 						 * translations.  Meanwhile
4977c478bd9Sstevel@tonic-gate 						 * another CPU could've
4987c478bd9Sstevel@tonic-gate 						 * modified it so we have to
4997c478bd9Sstevel@tonic-gate 						 * check again.  We don't loop
5007c478bd9Sstevel@tonic-gate 						 * forever here because now
5017c478bd9Sstevel@tonic-gate 						 * the translations are gone
5027c478bd9Sstevel@tonic-gate 						 * and no one can get a new one
5037c478bd9Sstevel@tonic-gate 						 * since we have the "exclusive"
5047c478bd9Sstevel@tonic-gate 						 * lock on the page.
5057c478bd9Sstevel@tonic-gate 						 */
5067c478bd9Sstevel@tonic-gate 						(void) hat_pageunload(pp,
5077c478bd9Sstevel@tonic-gate 							HAT_FORCE_PGUNLOAD);
5087c478bd9Sstevel@tonic-gate 						ppattr = hat_page_getattr(pp,
5097c478bd9Sstevel@tonic-gate 							P_REF | P_MOD);
5107c478bd9Sstevel@tonic-gate 						goto ck_refmod;
5117c478bd9Sstevel@tonic-gate 					}
5127c478bd9Sstevel@tonic-gate 					/*
5137c478bd9Sstevel@tonic-gate 					 * Update statistics for pages being
5147c478bd9Sstevel@tonic-gate 					 * freed
5157c478bd9Sstevel@tonic-gate 					 */
5167c478bd9Sstevel@tonic-gate 					if (pp->p_vnode) {
5177c478bd9Sstevel@tonic-gate 						if (IS_SWAPFSVP(pp->p_vnode)) {
5187c478bd9Sstevel@tonic-gate 							anonfree++;
5197c478bd9Sstevel@tonic-gate 						} else {
5207c478bd9Sstevel@tonic-gate 							if (pp->p_vnode->v_flag
5217c478bd9Sstevel@tonic-gate 							    & VVMEXEC) {
5227c478bd9Sstevel@tonic-gate 								execfree++;
5237c478bd9Sstevel@tonic-gate 							} else {
5247c478bd9Sstevel@tonic-gate 								fsfree++;
5257c478bd9Sstevel@tonic-gate 							}
5267c478bd9Sstevel@tonic-gate 						}
5277c478bd9Sstevel@tonic-gate 					}
5287c478bd9Sstevel@tonic-gate 					/*LINTED: constant in conditional ctx*/
5297c478bd9Sstevel@tonic-gate 					VN_DISPOSE(pp, B_FREE,
5307c478bd9Sstevel@tonic-gate 						(flags & B_DONTNEED), kcred);
5317c478bd9Sstevel@tonic-gate 					dfree++;
5327c478bd9Sstevel@tonic-gate 				} else {
5337c478bd9Sstevel@tonic-gate 					page_unlock(pp);
5347c478bd9Sstevel@tonic-gate 					pgrec++;
5357c478bd9Sstevel@tonic-gate 					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
5367c478bd9Sstevel@tonic-gate 					    "page_ws_free:pp %p", pp);
5377c478bd9Sstevel@tonic-gate 				}
5387c478bd9Sstevel@tonic-gate 			} else {
5397c478bd9Sstevel@tonic-gate 				/*
5407c478bd9Sstevel@tonic-gate 				 * Page is either `locked' in memory
5417c478bd9Sstevel@tonic-gate 				 * or was reclaimed and now has a
5427c478bd9Sstevel@tonic-gate 				 * "shared" lock, so release it.
5437c478bd9Sstevel@tonic-gate 				 */
5447c478bd9Sstevel@tonic-gate 				page_unlock(pp);
5457c478bd9Sstevel@tonic-gate 			}
5467c478bd9Sstevel@tonic-gate 		} else {
5477c478bd9Sstevel@tonic-gate 			/*
5487c478bd9Sstevel@tonic-gate 			 * Neither B_FREE nor B_INVAL nor B_ERROR.
5497c478bd9Sstevel@tonic-gate 			 * Just release locks.
5507c478bd9Sstevel@tonic-gate 			 */
5517c478bd9Sstevel@tonic-gate 			page_io_unlock(pp);
5527c478bd9Sstevel@tonic-gate 			page_unlock(pp);
5537c478bd9Sstevel@tonic-gate 		}
5547c478bd9Sstevel@tonic-gate 	}
5557c478bd9Sstevel@tonic-gate 
5567c478bd9Sstevel@tonic-gate 	CPU_STATS_ENTER_K();
5577c478bd9Sstevel@tonic-gate 	cpup = CPU;		/* get cpup now that CPU cannot change */
5587c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
5597c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
5607c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
5617c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
5627c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
5637c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
5647c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
5657c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
5667c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
5677c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
5687c478bd9Sstevel@tonic-gate 	CPU_STATS_EXIT_K();
5697c478bd9Sstevel@tonic-gate 
5707c478bd9Sstevel@tonic-gate 	/* Kernel probe */
5717c478bd9Sstevel@tonic-gate 	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
5727c478bd9Sstevel@tonic-gate 		tnf_opaque,	vnode,			vp,
5737c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_pageout,		pgpgout,
5747c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_freed,		dfree,
5757c478bd9Sstevel@tonic-gate 		tnf_ulong,	pages_reclaimed,	pgrec);
5767c478bd9Sstevel@tonic-gate }
5777c478bd9Sstevel@tonic-gate 
5787c478bd9Sstevel@tonic-gate /*
5797c478bd9Sstevel@tonic-gate  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
5807c478bd9Sstevel@tonic-gate  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
5817c478bd9Sstevel@tonic-gate  * operation and is only to be considered if it doesn't involve any
5827c478bd9Sstevel@tonic-gate  * waiting here.  B_TRUNC indicates that the file is being truncated
5837c478bd9Sstevel@tonic-gate  * and so no i/o needs to be done. B_FORCE indicates that the page
5847c478bd9Sstevel@tonic-gate  * must be destroyed so don't try wrting it out.
5857c478bd9Sstevel@tonic-gate  *
5867c478bd9Sstevel@tonic-gate  * The caller must ensure that the page is locked.  Returns 1, if
5877c478bd9Sstevel@tonic-gate  * the page should be written back (the "iolock" is held in this
5887c478bd9Sstevel@tonic-gate  * case), or 0 if the page has been dealt with or has been
5897c478bd9Sstevel@tonic-gate  * unlocked.
5907c478bd9Sstevel@tonic-gate  */
5917c478bd9Sstevel@tonic-gate int
5927c478bd9Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags)
5937c478bd9Sstevel@tonic-gate {
5947c478bd9Sstevel@tonic-gate 	ASSERT((flags & (B_INVAL | B_FREE)) ?
5957c478bd9Sstevel@tonic-gate 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
5967c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp) == 0);
5977c478bd9Sstevel@tonic-gate 
5987c478bd9Sstevel@tonic-gate 	/*
5997c478bd9Sstevel@tonic-gate 	 * If trying to invalidate or free a logically `locked' page,
6007c478bd9Sstevel@tonic-gate 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
6017c478bd9Sstevel@tonic-gate 	 * p_cowcnt as the page is exclusively locked.
6027c478bd9Sstevel@tonic-gate 	 */
6037c478bd9Sstevel@tonic-gate 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
6047c478bd9Sstevel@tonic-gate 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
6057c478bd9Sstevel@tonic-gate 		page_unlock(pp);
6067c478bd9Sstevel@tonic-gate 		return (0);
6077c478bd9Sstevel@tonic-gate 	}
6087c478bd9Sstevel@tonic-gate 
6097c478bd9Sstevel@tonic-gate 	/*
6107c478bd9Sstevel@tonic-gate 	 * Now acquire the i/o lock so we can add it to the dirty
6117c478bd9Sstevel@tonic-gate 	 * list (if necessary).  We avoid blocking on the i/o lock
6127c478bd9Sstevel@tonic-gate 	 * in the following cases:
6137c478bd9Sstevel@tonic-gate 	 *
6147c478bd9Sstevel@tonic-gate 	 *	If B_DELWRI is set, which implies that this request is
6157c478bd9Sstevel@tonic-gate 	 *	due to a klustering operartion.
6167c478bd9Sstevel@tonic-gate 	 *
6177c478bd9Sstevel@tonic-gate 	 *	If this is an async (B_ASYNC) operation and we are not doing
6187c478bd9Sstevel@tonic-gate 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
6197c478bd9Sstevel@tonic-gate 	 *	that the the page is written out].
6207c478bd9Sstevel@tonic-gate 	 */
6217c478bd9Sstevel@tonic-gate 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
6227c478bd9Sstevel@tonic-gate 		if (!page_io_trylock(pp)) {
6237c478bd9Sstevel@tonic-gate 			page_unlock(pp);
6247c478bd9Sstevel@tonic-gate 			return (0);
6257c478bd9Sstevel@tonic-gate 		}
6267c478bd9Sstevel@tonic-gate 	} else {
6277c478bd9Sstevel@tonic-gate 		page_io_lock(pp);
6287c478bd9Sstevel@tonic-gate 	}
6297c478bd9Sstevel@tonic-gate 
6307c478bd9Sstevel@tonic-gate 	/*
6317c478bd9Sstevel@tonic-gate 	 * If we want to free or invalidate the page then
6327c478bd9Sstevel@tonic-gate 	 * we need to unload it so that anyone who wants
6337c478bd9Sstevel@tonic-gate 	 * it will have to take a minor fault to get it.
6347c478bd9Sstevel@tonic-gate 	 * Otherwise, we're just writing the page back so we
6357c478bd9Sstevel@tonic-gate 	 * need to sync up the hardwre and software mod bit to
6367c478bd9Sstevel@tonic-gate 	 * detect any future modifications.  We clear the
6377c478bd9Sstevel@tonic-gate 	 * software mod bit when we put the page on the dirty
6387c478bd9Sstevel@tonic-gate 	 * list.
6397c478bd9Sstevel@tonic-gate 	 */
6407c478bd9Sstevel@tonic-gate 	if (flags & (B_INVAL | B_FREE)) {
6417c478bd9Sstevel@tonic-gate 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6427c478bd9Sstevel@tonic-gate 	} else {
6437c478bd9Sstevel@tonic-gate 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
6447c478bd9Sstevel@tonic-gate 	}
6457c478bd9Sstevel@tonic-gate 
6467c478bd9Sstevel@tonic-gate 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
6477c478bd9Sstevel@tonic-gate 		/*
6487c478bd9Sstevel@tonic-gate 		 * Don't need to add it to the
6497c478bd9Sstevel@tonic-gate 		 * list after all.
6507c478bd9Sstevel@tonic-gate 		 */
6517c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
6527c478bd9Sstevel@tonic-gate 		if (flags & B_INVAL) {
6537c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
6547c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
6557c478bd9Sstevel@tonic-gate 		} else if (flags & B_FREE) {
6567c478bd9Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
6577c478bd9Sstevel@tonic-gate 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
6587c478bd9Sstevel@tonic-gate 		} else {
6597c478bd9Sstevel@tonic-gate 			/*
6607c478bd9Sstevel@tonic-gate 			 * This is advisory path for the callers
6617c478bd9Sstevel@tonic-gate 			 * of VOP_PUTPAGE() who prefer freeing the
6627c478bd9Sstevel@tonic-gate 			 * page _only_ if no one else is accessing it.
6637c478bd9Sstevel@tonic-gate 			 * E.g. segmap_release()
6647c478bd9Sstevel@tonic-gate 			 *
6657c478bd9Sstevel@tonic-gate 			 * The above hat_ismod() check is useless because:
6667c478bd9Sstevel@tonic-gate 			 * (1) we may not be holding SE_EXCL lock;
6677c478bd9Sstevel@tonic-gate 			 * (2) we've not unloaded _all_ translations
6687c478bd9Sstevel@tonic-gate 			 *
6697c478bd9Sstevel@tonic-gate 			 * Let page_release() do the heavy-lifting.
6707c478bd9Sstevel@tonic-gate 			 */
6717c478bd9Sstevel@tonic-gate 			(void) page_release(pp, 1);
6727c478bd9Sstevel@tonic-gate 		}
6737c478bd9Sstevel@tonic-gate 		return (0);
6747c478bd9Sstevel@tonic-gate 	}
6757c478bd9Sstevel@tonic-gate 
6767c478bd9Sstevel@tonic-gate 	/*
6777c478bd9Sstevel@tonic-gate 	 * Page is dirty, get it ready for the write back
6787c478bd9Sstevel@tonic-gate 	 * and add page to the dirty list.
6797c478bd9Sstevel@tonic-gate 	 */
6807c478bd9Sstevel@tonic-gate 	hat_clrrefmod(pp);
6817c478bd9Sstevel@tonic-gate 
6827c478bd9Sstevel@tonic-gate 	/*
6837c478bd9Sstevel@tonic-gate 	 * If we're going to free the page when we're done
6847c478bd9Sstevel@tonic-gate 	 * then we can let others try to use it starting now.
6857c478bd9Sstevel@tonic-gate 	 * We'll detect the fact that they used it when the
6867c478bd9Sstevel@tonic-gate 	 * i/o is done and avoid freeing the page.
6877c478bd9Sstevel@tonic-gate 	 */
6887c478bd9Sstevel@tonic-gate 	if (flags & B_FREE)
6897c478bd9Sstevel@tonic-gate 		page_downgrade(pp);
6907c478bd9Sstevel@tonic-gate 
6917c478bd9Sstevel@tonic-gate 
6927c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
6937c478bd9Sstevel@tonic-gate 
6947c478bd9Sstevel@tonic-gate 	return (1);
6957c478bd9Sstevel@tonic-gate }
6967c478bd9Sstevel@tonic-gate 
6977c478bd9Sstevel@tonic-gate 
6987c478bd9Sstevel@tonic-gate /*ARGSUSED*/
6997c478bd9Sstevel@tonic-gate static int
7007c478bd9Sstevel@tonic-gate marker_constructor(void *buf, void *cdrarg, int kmflags)
7017c478bd9Sstevel@tonic-gate {
7027c478bd9Sstevel@tonic-gate 	page_t *mark = buf;
7037c478bd9Sstevel@tonic-gate 	bzero(mark, sizeof (page_t));
7047c478bd9Sstevel@tonic-gate 	return (0);
7057c478bd9Sstevel@tonic-gate }
7067c478bd9Sstevel@tonic-gate 
7077c478bd9Sstevel@tonic-gate void
7087c478bd9Sstevel@tonic-gate pvn_init()
7097c478bd9Sstevel@tonic-gate {
7107c478bd9Sstevel@tonic-gate 	if (pvn_vmodsort_disable == 0)
7117c478bd9Sstevel@tonic-gate 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
7127c478bd9Sstevel@tonic-gate 	marker_cache = kmem_cache_create("marker_cache",
7137c478bd9Sstevel@tonic-gate 	    sizeof (page_t), 0, marker_constructor,
7147c478bd9Sstevel@tonic-gate 	    NULL, NULL, NULL, NULL, 0);
7157c478bd9Sstevel@tonic-gate }
7167c478bd9Sstevel@tonic-gate 
7177c478bd9Sstevel@tonic-gate 
7187c478bd9Sstevel@tonic-gate /*
7197c478bd9Sstevel@tonic-gate  * Process a vnode's page list for all pages whose offset is >= off.
7207c478bd9Sstevel@tonic-gate  * Pages are to either be free'd, invalidated, or written back to disk.
7217c478bd9Sstevel@tonic-gate  *
7227c478bd9Sstevel@tonic-gate  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
7237c478bd9Sstevel@tonic-gate  * is specified, otherwise they are "shared" locked.
7247c478bd9Sstevel@tonic-gate  *
7257c478bd9Sstevel@tonic-gate  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
7267c478bd9Sstevel@tonic-gate  *
7277c478bd9Sstevel@tonic-gate  * Special marker page_t's are inserted in the list in order
7287c478bd9Sstevel@tonic-gate  * to keep track of where we are in the list when locks are dropped.
7297c478bd9Sstevel@tonic-gate  *
7307c478bd9Sstevel@tonic-gate  * Note the list is circular and insertions can happen only at the
7317c478bd9Sstevel@tonic-gate  * head and tail of the list. The algorithm ensures visiting all pages
7327c478bd9Sstevel@tonic-gate  * on the list in the following way:
7337c478bd9Sstevel@tonic-gate  *
7347c478bd9Sstevel@tonic-gate  *    Drop two marker pages at the end of the list.
7357c478bd9Sstevel@tonic-gate  *
7367c478bd9Sstevel@tonic-gate  *    Move one marker page backwards towards the start of the list until
7377c478bd9Sstevel@tonic-gate  *    it is at the list head, processing the pages passed along the way.
7387c478bd9Sstevel@tonic-gate  *
7397c478bd9Sstevel@tonic-gate  *    Due to race conditions when the vphm mutex is dropped, additional pages
7407c478bd9Sstevel@tonic-gate  *    can be added to either end of the list, so we'll continue to move
7417c478bd9Sstevel@tonic-gate  *    the marker and process pages until it is up against the end marker.
7427c478bd9Sstevel@tonic-gate  *
7437c478bd9Sstevel@tonic-gate  * There is one special exit condition. If we are processing a VMODSORT
7447c478bd9Sstevel@tonic-gate  * vnode and only writing back modified pages, we can stop as soon as
7457c478bd9Sstevel@tonic-gate  * we run into an unmodified page.  This makes fsync(3) operations fast.
7467c478bd9Sstevel@tonic-gate  */
7477c478bd9Sstevel@tonic-gate int
7487c478bd9Sstevel@tonic-gate pvn_vplist_dirty(
7497c478bd9Sstevel@tonic-gate 	vnode_t		*vp,
7507c478bd9Sstevel@tonic-gate 	u_offset_t	off,
7517c478bd9Sstevel@tonic-gate 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
7527c478bd9Sstevel@tonic-gate 			size_t *, int, cred_t *),
7537c478bd9Sstevel@tonic-gate 	int		flags,
7547c478bd9Sstevel@tonic-gate 	cred_t		*cred)
7557c478bd9Sstevel@tonic-gate {
7567c478bd9Sstevel@tonic-gate 	page_t		*pp;
7577c478bd9Sstevel@tonic-gate 	page_t		*mark;		/* marker page that moves toward head */
7587c478bd9Sstevel@tonic-gate 	page_t		*end;		/* marker page at end of list */
7597c478bd9Sstevel@tonic-gate 	int		err = 0;
7607c478bd9Sstevel@tonic-gate 	int		error;
7617c478bd9Sstevel@tonic-gate 	kmutex_t	*vphm;
7627c478bd9Sstevel@tonic-gate 	se_t		se;
7637c478bd9Sstevel@tonic-gate 	page_t		**where_to_move;
7647c478bd9Sstevel@tonic-gate 
7657c478bd9Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
7667c478bd9Sstevel@tonic-gate 
7677c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
7687c478bd9Sstevel@tonic-gate 		return (0);
7697c478bd9Sstevel@tonic-gate 
7707c478bd9Sstevel@tonic-gate 
7717c478bd9Sstevel@tonic-gate 	/*
7727c478bd9Sstevel@tonic-gate 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
7737c478bd9Sstevel@tonic-gate 	 *
7747c478bd9Sstevel@tonic-gate 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
7757c478bd9Sstevel@tonic-gate 	 * from getting blocked while flushing pages to a dead NFS server.
7767c478bd9Sstevel@tonic-gate 	 */
7777c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
7787c478bd9Sstevel@tonic-gate 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
7797c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
7807c478bd9Sstevel@tonic-gate 		return (EAGAIN);
7817c478bd9Sstevel@tonic-gate 	}
7827c478bd9Sstevel@tonic-gate 
7837c478bd9Sstevel@tonic-gate 	while (vp->v_flag & VVMLOCK)
7847c478bd9Sstevel@tonic-gate 		cv_wait(&vp->v_cv, &vp->v_lock);
7857c478bd9Sstevel@tonic-gate 
7867c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL) {
7877c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
7887c478bd9Sstevel@tonic-gate 		return (0);
7897c478bd9Sstevel@tonic-gate 	}
7907c478bd9Sstevel@tonic-gate 
7917c478bd9Sstevel@tonic-gate 	vp->v_flag |= VVMLOCK;
7927c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
7937c478bd9Sstevel@tonic-gate 
7947c478bd9Sstevel@tonic-gate 
7957c478bd9Sstevel@tonic-gate 	/*
7967c478bd9Sstevel@tonic-gate 	 * Set up the marker pages used to walk the list
7977c478bd9Sstevel@tonic-gate 	 */
7987c478bd9Sstevel@tonic-gate 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
7997c478bd9Sstevel@tonic-gate 	end->p_vnode = vp;
8007c478bd9Sstevel@tonic-gate 	end->p_offset = (u_offset_t)-2;
8017c478bd9Sstevel@tonic-gate 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
8027c478bd9Sstevel@tonic-gate 	mark->p_vnode = vp;
8037c478bd9Sstevel@tonic-gate 	mark->p_offset = (u_offset_t)-1;
8047c478bd9Sstevel@tonic-gate 
8057c478bd9Sstevel@tonic-gate 	/*
8067c478bd9Sstevel@tonic-gate 	 * Grab the lock protecting the vnode's page list
8077c478bd9Sstevel@tonic-gate 	 * note that this lock is dropped at times in the loop.
8087c478bd9Sstevel@tonic-gate 	 */
8097c478bd9Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
8107c478bd9Sstevel@tonic-gate 	mutex_enter(vphm);
8117c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
8127c478bd9Sstevel@tonic-gate 		goto leave;
8137c478bd9Sstevel@tonic-gate 
8147c478bd9Sstevel@tonic-gate 	/*
8157c478bd9Sstevel@tonic-gate 	 * insert the markers and loop through the list of pages
8167c478bd9Sstevel@tonic-gate 	 */
8177c478bd9Sstevel@tonic-gate 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
8187c478bd9Sstevel@tonic-gate 	page_vpadd(&mark->p_vpnext, end);
8197c478bd9Sstevel@tonic-gate 	for (;;) {
8207c478bd9Sstevel@tonic-gate 
8217c478bd9Sstevel@tonic-gate 		/*
8227c478bd9Sstevel@tonic-gate 		 * If only doing an async write back, then we can
8237c478bd9Sstevel@tonic-gate 		 * stop as soon as we get to start of the list.
8247c478bd9Sstevel@tonic-gate 		 */
8257c478bd9Sstevel@tonic-gate 		if (flags == B_ASYNC && vp->v_pages == mark)
8267c478bd9Sstevel@tonic-gate 			break;
8277c478bd9Sstevel@tonic-gate 
8287c478bd9Sstevel@tonic-gate 		/*
8297c478bd9Sstevel@tonic-gate 		 * otherwise stop when we've gone through all the pages
8307c478bd9Sstevel@tonic-gate 		 */
8317c478bd9Sstevel@tonic-gate 		if (mark->p_vpprev == end)
8327c478bd9Sstevel@tonic-gate 			break;
8337c478bd9Sstevel@tonic-gate 
8347c478bd9Sstevel@tonic-gate 		pp = mark->p_vpprev;
8357c478bd9Sstevel@tonic-gate 		if (vp->v_pages == pp)
8367c478bd9Sstevel@tonic-gate 			where_to_move = &vp->v_pages;
8377c478bd9Sstevel@tonic-gate 		else
8387c478bd9Sstevel@tonic-gate 			where_to_move = &pp->p_vpprev->p_vpnext;
8397c478bd9Sstevel@tonic-gate 
8407c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
8417c478bd9Sstevel@tonic-gate 
8427c478bd9Sstevel@tonic-gate 		/*
8437c478bd9Sstevel@tonic-gate 		 * Skip this page if the offset is out of the desired range.
8447c478bd9Sstevel@tonic-gate 		 * Just move the marker and continue.
8457c478bd9Sstevel@tonic-gate 		 */
8467c478bd9Sstevel@tonic-gate 		if (pp->p_offset < off) {
8477c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
8487c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
8497c478bd9Sstevel@tonic-gate 			continue;
8507c478bd9Sstevel@tonic-gate 		}
8517c478bd9Sstevel@tonic-gate 
8527c478bd9Sstevel@tonic-gate 		/*
8537c478bd9Sstevel@tonic-gate 		 * If just flushing dirty pages to disk and this vnode
8547c478bd9Sstevel@tonic-gate 		 * is using a sorted list of pages, we can stop processing
8557c478bd9Sstevel@tonic-gate 		 * as soon as we find an unmodified page. Since all the
8567c478bd9Sstevel@tonic-gate 		 * modified pages are visited first.
8577c478bd9Sstevel@tonic-gate 		 */
8587c478bd9Sstevel@tonic-gate 		if (IS_VMODSORT(vp) &&
859*a71e32b6Sstans 		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
860*a71e32b6Sstans 			if (!hat_ismod(pp) && !page_io_locked(pp)) {
8617c478bd9Sstevel@tonic-gate #ifdef  DEBUG
862*a71e32b6Sstans 				/*
863*a71e32b6Sstans 				 * For debug kernels examine what should be
864*a71e32b6Sstans 				 * all the remaining clean pages, asserting
865*a71e32b6Sstans 				 * that they are not modified.
866*a71e32b6Sstans 				 */
867*a71e32b6Sstans 				page_t	*chk = pp;
868*a71e32b6Sstans 				int	attr;
869*a71e32b6Sstans 
870*a71e32b6Sstans 				page_vpsub(&vp->v_pages, mark);
871*a71e32b6Sstans 				page_vpadd(where_to_move, mark);
872*a71e32b6Sstans 				do {
873*a71e32b6Sstans 					chk = chk->p_vpprev;
874*a71e32b6Sstans 					ASSERT(chk != end);
875*a71e32b6Sstans 					if (chk == mark)
876*a71e32b6Sstans 						continue;
877*a71e32b6Sstans 					attr = hat_page_getattr(chk, P_MOD |
878*a71e32b6Sstans 					    P_REF);
879*a71e32b6Sstans 					if ((attr & P_MOD) == 0)
880*a71e32b6Sstans 						continue;
881*a71e32b6Sstans 					panic("v_pages list not all clean: "
882*a71e32b6Sstans 					    "page_t*=%p vnode=%p off=%lx "
883*a71e32b6Sstans 					    "attr=0x%x last clean page_t*=%p\n",
884*a71e32b6Sstans 					    (void *)chk, (void *)chk->p_vnode,
885*a71e32b6Sstans 					    (long)chk->p_offset, attr,
886*a71e32b6Sstans 					    (void *)pp);
887*a71e32b6Sstans 				} while (chk != vp->v_pages);
8887c478bd9Sstevel@tonic-gate #endif
889*a71e32b6Sstans 				break;
890*a71e32b6Sstans 			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
891*a71e32b6Sstans 				/*
892*a71e32b6Sstans 				 * Couldn't get io lock, wait until IO is done.
893*a71e32b6Sstans 				 * Block only for sync IO since we don't want
894*a71e32b6Sstans 				 * to block async IO.
895*a71e32b6Sstans 				 */
896*a71e32b6Sstans 				mutex_exit(vphm);
897*a71e32b6Sstans 				page_io_wait(pp);
898*a71e32b6Sstans 				mutex_enter(vphm);
899*a71e32b6Sstans 				continue;
900*a71e32b6Sstans 			}
9017c478bd9Sstevel@tonic-gate 		}
9027c478bd9Sstevel@tonic-gate 
9037c478bd9Sstevel@tonic-gate 		/*
9047c478bd9Sstevel@tonic-gate 		 * If we are supposed to invalidate or free this
9057c478bd9Sstevel@tonic-gate 		 * page, then we need an exclusive lock.
9067c478bd9Sstevel@tonic-gate 		 */
9077c478bd9Sstevel@tonic-gate 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
9087c478bd9Sstevel@tonic-gate 
9097c478bd9Sstevel@tonic-gate 		/*
9107c478bd9Sstevel@tonic-gate 		 * We must acquire the page lock for all synchronous
9117c478bd9Sstevel@tonic-gate 		 * operations (invalidate, free and write).
9127c478bd9Sstevel@tonic-gate 		 */
9137c478bd9Sstevel@tonic-gate 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
9147c478bd9Sstevel@tonic-gate 			/*
9157c478bd9Sstevel@tonic-gate 			 * If the page_lock() drops the mutex
9167c478bd9Sstevel@tonic-gate 			 * we must retry the loop.
9177c478bd9Sstevel@tonic-gate 			 */
9187c478bd9Sstevel@tonic-gate 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
9197c478bd9Sstevel@tonic-gate 				continue;
9207c478bd9Sstevel@tonic-gate 
9217c478bd9Sstevel@tonic-gate 			/*
9227c478bd9Sstevel@tonic-gate 			 * It's ok to move the marker page now.
9237c478bd9Sstevel@tonic-gate 			 */
9247c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
9257c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
9267c478bd9Sstevel@tonic-gate 		} else {
9277c478bd9Sstevel@tonic-gate 
9287c478bd9Sstevel@tonic-gate 			/*
9297c478bd9Sstevel@tonic-gate 			 * update the marker page for all remaining cases
9307c478bd9Sstevel@tonic-gate 			 */
9317c478bd9Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
9327c478bd9Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
9337c478bd9Sstevel@tonic-gate 
9347c478bd9Sstevel@tonic-gate 			/*
9357c478bd9Sstevel@tonic-gate 			 * For write backs, If we can't lock the page, it's
9367c478bd9Sstevel@tonic-gate 			 * invalid or in the process of being destroyed.  Skip
9377c478bd9Sstevel@tonic-gate 			 * it, assuming someone else is writing it.
9387c478bd9Sstevel@tonic-gate 			 */
9397c478bd9Sstevel@tonic-gate 			if (!page_trylock(pp, se))
9407c478bd9Sstevel@tonic-gate 				continue;
9417c478bd9Sstevel@tonic-gate 		}
9427c478bd9Sstevel@tonic-gate 
9437c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
9447c478bd9Sstevel@tonic-gate 
9457c478bd9Sstevel@tonic-gate 		/*
9467c478bd9Sstevel@tonic-gate 		 * Successfully locked the page, now figure out what to
9477c478bd9Sstevel@tonic-gate 		 * do with it. Free pages are easily dealt with, invalidate
9487c478bd9Sstevel@tonic-gate 		 * if desired or just go on to the next page.
9497c478bd9Sstevel@tonic-gate 		 */
9507c478bd9Sstevel@tonic-gate 		if (PP_ISFREE(pp)) {
9517c478bd9Sstevel@tonic-gate 			if ((flags & B_INVAL) == 0) {
9527c478bd9Sstevel@tonic-gate 				page_unlock(pp);
9537c478bd9Sstevel@tonic-gate 				continue;
9547c478bd9Sstevel@tonic-gate 			}
9557c478bd9Sstevel@tonic-gate 
9567c478bd9Sstevel@tonic-gate 			/*
9577c478bd9Sstevel@tonic-gate 			 * Invalidate (destroy) the page.
9587c478bd9Sstevel@tonic-gate 			 */
9597c478bd9Sstevel@tonic-gate 			mutex_exit(vphm);
9607c478bd9Sstevel@tonic-gate 			page_destroy_free(pp);
9617c478bd9Sstevel@tonic-gate 			mutex_enter(vphm);
9627c478bd9Sstevel@tonic-gate 			continue;
9637c478bd9Sstevel@tonic-gate 		}
9647c478bd9Sstevel@tonic-gate 
9657c478bd9Sstevel@tonic-gate 		/*
9667c478bd9Sstevel@tonic-gate 		 * pvn_getdirty() figures out what do do with a dirty page.
9677c478bd9Sstevel@tonic-gate 		 * If the page is dirty, the putapage() routine will write it
9687c478bd9Sstevel@tonic-gate 		 * and will kluster any other adjacent dirty pages it can.
9697c478bd9Sstevel@tonic-gate 		 *
9707c478bd9Sstevel@tonic-gate 		 * pvn_getdirty() and `(*putapage)' unlock the page.
9717c478bd9Sstevel@tonic-gate 		 */
9727c478bd9Sstevel@tonic-gate 		mutex_exit(vphm);
9737c478bd9Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags)) {
9747c478bd9Sstevel@tonic-gate 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
9757c478bd9Sstevel@tonic-gate 			if (!err)
9767c478bd9Sstevel@tonic-gate 				err = error;
9777c478bd9Sstevel@tonic-gate 		}
9787c478bd9Sstevel@tonic-gate 		mutex_enter(vphm);
9797c478bd9Sstevel@tonic-gate 	}
9807c478bd9Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, mark);
9817c478bd9Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, end);
9827c478bd9Sstevel@tonic-gate 
9837c478bd9Sstevel@tonic-gate leave:
9847c478bd9Sstevel@tonic-gate 	/*
9857c478bd9Sstevel@tonic-gate 	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
9867c478bd9Sstevel@tonic-gate 	 */
9877c478bd9Sstevel@tonic-gate 	mutex_exit(vphm);
9887c478bd9Sstevel@tonic-gate 	kmem_cache_free(marker_cache, mark);
9897c478bd9Sstevel@tonic-gate 	kmem_cache_free(marker_cache, end);
9907c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
9917c478bd9Sstevel@tonic-gate 	vp->v_flag &= ~VVMLOCK;
9927c478bd9Sstevel@tonic-gate 	cv_broadcast(&vp->v_cv);
9937c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
9947c478bd9Sstevel@tonic-gate 	return (err);
9957c478bd9Sstevel@tonic-gate }
9967c478bd9Sstevel@tonic-gate 
9977c478bd9Sstevel@tonic-gate /*
9987c478bd9Sstevel@tonic-gate  * Zero out zbytes worth of data. Caller should be aware that this
9997c478bd9Sstevel@tonic-gate  * routine may enter back into the fs layer (xxx_getpage). Locks
10007c478bd9Sstevel@tonic-gate  * that the xxx_getpage routine may need should not be held while
10017c478bd9Sstevel@tonic-gate  * calling this.
10027c478bd9Sstevel@tonic-gate  */
10037c478bd9Sstevel@tonic-gate void
10047c478bd9Sstevel@tonic-gate pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
10057c478bd9Sstevel@tonic-gate {
10067c478bd9Sstevel@tonic-gate 	caddr_t addr;
10077c478bd9Sstevel@tonic-gate 
10087c478bd9Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
10097c478bd9Sstevel@tonic-gate 
10107c478bd9Sstevel@tonic-gate 	if (vp->v_pages == NULL)
10117c478bd9Sstevel@tonic-gate 		return;
10127c478bd9Sstevel@tonic-gate 
10137c478bd9Sstevel@tonic-gate 	/*
10147c478bd9Sstevel@tonic-gate 	 * zbytes may be zero but there still may be some portion of
10157c478bd9Sstevel@tonic-gate 	 * a page which needs clearing (since zbytes is a function
10167c478bd9Sstevel@tonic-gate 	 * of filesystem block size, not pagesize.)
10177c478bd9Sstevel@tonic-gate 	 */
10187c478bd9Sstevel@tonic-gate 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
10197c478bd9Sstevel@tonic-gate 		return;
10207c478bd9Sstevel@tonic-gate 
10217c478bd9Sstevel@tonic-gate 	/*
10227c478bd9Sstevel@tonic-gate 	 * We get the last page and handle the partial
10237c478bd9Sstevel@tonic-gate 	 * zeroing via kernel mappings.  This will make the page
10247c478bd9Sstevel@tonic-gate 	 * dirty so that we know that when this page is written
10257c478bd9Sstevel@tonic-gate 	 * back, the zeroed information will go out with it.  If
10267c478bd9Sstevel@tonic-gate 	 * the page is not currently in memory, then the kzero
10277c478bd9Sstevel@tonic-gate 	 * operation will cause it to be brought it.  We use kzero
10287c478bd9Sstevel@tonic-gate 	 * instead of bzero so that if the page cannot be read in
10297c478bd9Sstevel@tonic-gate 	 * for any reason, the system will not panic.  We need
10307c478bd9Sstevel@tonic-gate 	 * to zero out a minimum of the fs given zbytes, but we
10317c478bd9Sstevel@tonic-gate 	 * might also have to do more to get the entire last page.
10327c478bd9Sstevel@tonic-gate 	 */
10337c478bd9Sstevel@tonic-gate 
10347c478bd9Sstevel@tonic-gate 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
10357c478bd9Sstevel@tonic-gate 		panic("pvn_vptrunc zbytes");
10367c478bd9Sstevel@tonic-gate 	addr = segmap_getmapflt(segkmap, vp, vplen,
10377c478bd9Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
10387c478bd9Sstevel@tonic-gate 	(void) kzero(addr + (vplen & MAXBOFFSET),
10397c478bd9Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
10407c478bd9Sstevel@tonic-gate 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
10417c478bd9Sstevel@tonic-gate }
10427c478bd9Sstevel@tonic-gate 
10437c478bd9Sstevel@tonic-gate /*
10447c478bd9Sstevel@tonic-gate  * Handles common work of the VOP_GETPAGE routines when more than
10457c478bd9Sstevel@tonic-gate  * one page must be returned by calling a file system specific operation
10467c478bd9Sstevel@tonic-gate  * to do most of the work.  Must be called with the vp already locked
10477c478bd9Sstevel@tonic-gate  * by the VOP_GETPAGE routine.
10487c478bd9Sstevel@tonic-gate  */
10497c478bd9Sstevel@tonic-gate int
10507c478bd9Sstevel@tonic-gate pvn_getpages(
10517c478bd9Sstevel@tonic-gate 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
10527c478bd9Sstevel@tonic-gate 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
10537c478bd9Sstevel@tonic-gate 	struct vnode *vp,
10547c478bd9Sstevel@tonic-gate 	u_offset_t off,
10557c478bd9Sstevel@tonic-gate 	size_t len,
10567c478bd9Sstevel@tonic-gate 	uint_t *protp,
10577c478bd9Sstevel@tonic-gate 	page_t *pl[],
10587c478bd9Sstevel@tonic-gate 	size_t plsz,
10597c478bd9Sstevel@tonic-gate 	struct seg *seg,
10607c478bd9Sstevel@tonic-gate 	caddr_t addr,
10617c478bd9Sstevel@tonic-gate 	enum seg_rw rw,
10627c478bd9Sstevel@tonic-gate 	struct cred *cred)
10637c478bd9Sstevel@tonic-gate {
10647c478bd9Sstevel@tonic-gate 	page_t **ppp;
10657c478bd9Sstevel@tonic-gate 	u_offset_t o, eoff;
10667c478bd9Sstevel@tonic-gate 	size_t sz, xlen;
10677c478bd9Sstevel@tonic-gate 	int err;
10687c478bd9Sstevel@tonic-gate 
10697c478bd9Sstevel@tonic-gate 	ASSERT(plsz >= len);		/* insure that we have enough space */
10707c478bd9Sstevel@tonic-gate 
10717c478bd9Sstevel@tonic-gate 	/*
10727c478bd9Sstevel@tonic-gate 	 * Loop one page at a time and let getapage function fill
10737c478bd9Sstevel@tonic-gate 	 * in the next page in array.  We only allow one page to be
10747c478bd9Sstevel@tonic-gate 	 * returned at a time (except for the last page) so that we
10757c478bd9Sstevel@tonic-gate 	 * don't have any problems with duplicates and other such
10767c478bd9Sstevel@tonic-gate 	 * painful problems.  This is a very simple minded algorithm,
10777c478bd9Sstevel@tonic-gate 	 * but it does the job correctly.  We hope that the cost of a
10787c478bd9Sstevel@tonic-gate 	 * getapage call for a resident page that we might have been
10797c478bd9Sstevel@tonic-gate 	 * able to get from an earlier call doesn't cost too much.
10807c478bd9Sstevel@tonic-gate 	 */
10817c478bd9Sstevel@tonic-gate 	ppp = pl;
10827c478bd9Sstevel@tonic-gate 	sz = PAGESIZE;
10837c478bd9Sstevel@tonic-gate 	eoff = off + len;
10847c478bd9Sstevel@tonic-gate 	xlen = len;
10857c478bd9Sstevel@tonic-gate 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
10867c478bd9Sstevel@tonic-gate 	    xlen -= PAGESIZE) {
10877c478bd9Sstevel@tonic-gate 		if (o + PAGESIZE >= eoff) {
10887c478bd9Sstevel@tonic-gate 			/*
10897c478bd9Sstevel@tonic-gate 			 * Last time through - allow the all of
10907c478bd9Sstevel@tonic-gate 			 * what's left of the pl[] array to be used.
10917c478bd9Sstevel@tonic-gate 			 */
10927c478bd9Sstevel@tonic-gate 			sz = plsz - (o - off);
10937c478bd9Sstevel@tonic-gate 		}
10947c478bd9Sstevel@tonic-gate 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
10957c478bd9Sstevel@tonic-gate 		    rw, cred);
10967c478bd9Sstevel@tonic-gate 		if (err) {
10977c478bd9Sstevel@tonic-gate 			/*
10987c478bd9Sstevel@tonic-gate 			 * Release any pages we already got.
10997c478bd9Sstevel@tonic-gate 			 */
11007c478bd9Sstevel@tonic-gate 			if (o > off && pl != NULL) {
11017c478bd9Sstevel@tonic-gate 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
11027c478bd9Sstevel@tonic-gate 					(void) page_release(*ppp, 1);
11037c478bd9Sstevel@tonic-gate 			}
11047c478bd9Sstevel@tonic-gate 			break;
11057c478bd9Sstevel@tonic-gate 		}
11067c478bd9Sstevel@tonic-gate 		if (pl != NULL)
11077c478bd9Sstevel@tonic-gate 			ppp++;
11087c478bd9Sstevel@tonic-gate 	}
11097c478bd9Sstevel@tonic-gate 	return (err);
11107c478bd9Sstevel@tonic-gate }
11117c478bd9Sstevel@tonic-gate 
11127c478bd9Sstevel@tonic-gate /*
11137c478bd9Sstevel@tonic-gate  * Initialize the page list array.
11147c478bd9Sstevel@tonic-gate  */
11157c478bd9Sstevel@tonic-gate void
11167c478bd9Sstevel@tonic-gate pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
11177c478bd9Sstevel@tonic-gate     u_offset_t off, size_t io_len, enum seg_rw rw)
11187c478bd9Sstevel@tonic-gate {
11197c478bd9Sstevel@tonic-gate 	ssize_t sz;
11207c478bd9Sstevel@tonic-gate 	page_t *ppcur, **ppp;
11217c478bd9Sstevel@tonic-gate 
11227c478bd9Sstevel@tonic-gate 	if (plsz >= io_len) {
11237c478bd9Sstevel@tonic-gate 		/*
11247c478bd9Sstevel@tonic-gate 		 * Everything fits, set up to load
11257c478bd9Sstevel@tonic-gate 		 * all the pages.
11267c478bd9Sstevel@tonic-gate 		 */
11277c478bd9Sstevel@tonic-gate 		sz = io_len;
11287c478bd9Sstevel@tonic-gate 	} else {
11297c478bd9Sstevel@tonic-gate 		/*
11307c478bd9Sstevel@tonic-gate 		 * Set up to load plsz worth
11317c478bd9Sstevel@tonic-gate 		 * starting at the needed page.
11327c478bd9Sstevel@tonic-gate 		 */
11337c478bd9Sstevel@tonic-gate 		while (pp->p_offset != off) {
11347c478bd9Sstevel@tonic-gate 			/* XXX - Do we need this assert? */
11357c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_next->p_offset !=
11367c478bd9Sstevel@tonic-gate 			    pp->p_offset);
11377c478bd9Sstevel@tonic-gate 			/*
11387c478bd9Sstevel@tonic-gate 			 * Remove page from the i/o list,
11397c478bd9Sstevel@tonic-gate 			 * release the i/o and the page lock.
11407c478bd9Sstevel@tonic-gate 			 */
11417c478bd9Sstevel@tonic-gate 			ppcur = pp;
11427c478bd9Sstevel@tonic-gate 			page_sub(&pp, ppcur);
11437c478bd9Sstevel@tonic-gate 			page_io_unlock(ppcur);
11447c478bd9Sstevel@tonic-gate 			(void) page_release(ppcur, 1);
11457c478bd9Sstevel@tonic-gate 		}
11467c478bd9Sstevel@tonic-gate 		sz = plsz;
11477c478bd9Sstevel@tonic-gate 	}
11487c478bd9Sstevel@tonic-gate 
11497c478bd9Sstevel@tonic-gate 	/*
11507c478bd9Sstevel@tonic-gate 	 * Initialize the page list array.
11517c478bd9Sstevel@tonic-gate 	 */
11527c478bd9Sstevel@tonic-gate 	ppp = pl;
11537c478bd9Sstevel@tonic-gate 	do {
11547c478bd9Sstevel@tonic-gate 		ppcur = pp;
11557c478bd9Sstevel@tonic-gate 		*ppp++ = ppcur;
11567c478bd9Sstevel@tonic-gate 		page_sub(&pp, ppcur);
11577c478bd9Sstevel@tonic-gate 		page_io_unlock(ppcur);
11587c478bd9Sstevel@tonic-gate 		if (rw != S_CREATE)
11597c478bd9Sstevel@tonic-gate 			page_downgrade(ppcur);
11607c478bd9Sstevel@tonic-gate 		sz -= PAGESIZE;
11617c478bd9Sstevel@tonic-gate 	} while (sz > 0 && pp != NULL);
11627c478bd9Sstevel@tonic-gate 	*ppp = NULL;		/* terminate list */
11637c478bd9Sstevel@tonic-gate 
11647c478bd9Sstevel@tonic-gate 	/*
11657c478bd9Sstevel@tonic-gate 	 * Now free the remaining pages that weren't
11667c478bd9Sstevel@tonic-gate 	 * loaded in the page list.
11677c478bd9Sstevel@tonic-gate 	 */
11687c478bd9Sstevel@tonic-gate 	while (pp != NULL) {
11697c478bd9Sstevel@tonic-gate 		ppcur = pp;
11707c478bd9Sstevel@tonic-gate 		page_sub(&pp, ppcur);
11717c478bd9Sstevel@tonic-gate 		page_io_unlock(ppcur);
11727c478bd9Sstevel@tonic-gate 		(void) page_release(ppcur, 1);
11737c478bd9Sstevel@tonic-gate 	}
11747c478bd9Sstevel@tonic-gate }
1175