17c478bdstevel@tonic-gate/*
27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
5a71e32bstans * Common Development and Distribution License (the "License").
6a71e32bstans * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
217c478bdstevel@tonic-gate/*
2244c4f64John Levon * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23ca41123Josef 'Jeff' Sipek * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
247c478bdstevel@tonic-gate */
257c478bdstevel@tonic-gate
267c478bdstevel@tonic-gate/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
277c478bdstevel@tonic-gate/*	  All Rights Reserved  	*/
287c478bdstevel@tonic-gate
297c478bdstevel@tonic-gate/*
307c478bdstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988
317c478bdstevel@tonic-gate * The Regents of the University of California
327c478bdstevel@tonic-gate * All Rights Reserved
337c478bdstevel@tonic-gate *
347c478bdstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from
357c478bdstevel@tonic-gate * software developed by the University of California, Berkeley, and its
367c478bdstevel@tonic-gate * contributors.
377c478bdstevel@tonic-gate */
387c478bdstevel@tonic-gate
397c478bdstevel@tonic-gate/*
407c478bdstevel@tonic-gate * VM - paged vnode.
417c478bdstevel@tonic-gate *
427c478bdstevel@tonic-gate * This file supplies vm support for the vnode operations that deal with pages.
437c478bdstevel@tonic-gate */
447c478bdstevel@tonic-gate#include <sys/types.h>
457c478bdstevel@tonic-gate#include <sys/t_lock.h>
467c478bdstevel@tonic-gate#include <sys/param.h>
477c478bdstevel@tonic-gate#include <sys/sysmacros.h>
487c478bdstevel@tonic-gate#include <sys/systm.h>
497c478bdstevel@tonic-gate#include <sys/time.h>
507c478bdstevel@tonic-gate#include <sys/buf.h>
517c478bdstevel@tonic-gate#include <sys/vnode.h>
527c478bdstevel@tonic-gate#include <sys/uio.h>
537c478bdstevel@tonic-gate#include <sys/vmsystm.h>
547c478bdstevel@tonic-gate#include <sys/mman.h>
557c478bdstevel@tonic-gate#include <sys/vfs.h>
567c478bdstevel@tonic-gate#include <sys/cred.h>
577c478bdstevel@tonic-gate#include <sys/user.h>
587c478bdstevel@tonic-gate#include <sys/kmem.h>
597c478bdstevel@tonic-gate#include <sys/cmn_err.h>
607c478bdstevel@tonic-gate#include <sys/debug.h>
617c478bdstevel@tonic-gate#include <sys/cpuvar.h>
627c478bdstevel@tonic-gate#include <sys/vtrace.h>
637c478bdstevel@tonic-gate#include <sys/tnf_probe.h>
647c478bdstevel@tonic-gate
657c478bdstevel@tonic-gate#include <vm/hat.h>
667c478bdstevel@tonic-gate#include <vm/as.h>
677c478bdstevel@tonic-gate#include <vm/seg.h>
687c478bdstevel@tonic-gate#include <vm/rm.h>
697c478bdstevel@tonic-gate#include <vm/pvn.h>
707c478bdstevel@tonic-gate#include <vm/page.h>
717c478bdstevel@tonic-gate#include <vm/seg_map.h>
727c478bdstevel@tonic-gate#include <vm/seg_kmem.h>
737c478bdstevel@tonic-gate#include <sys/fs/swapnode.h>
747c478bdstevel@tonic-gate
757c478bdstevel@tonic-gateint pvn_nofodklust = 0;
767c478bdstevel@tonic-gateint pvn_write_noklust = 0;
777c478bdstevel@tonic-gate
787c478bdstevel@tonic-gateuint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
797c478bdstevel@tonic-gateuint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
807c478bdstevel@tonic-gate					/* support for vmodsort for testing */
817c478bdstevel@tonic-gate
827c478bdstevel@tonic-gatestatic struct kmem_cache *marker_cache = NULL;
837c478bdstevel@tonic-gate
847c478bdstevel@tonic-gate/*
857c478bdstevel@tonic-gate * Find the largest contiguous block which contains `addr' for file offset
867c478bdstevel@tonic-gate * `offset' in it while living within the file system block sizes (`vp_off'
877c478bdstevel@tonic-gate * and `vp_len') and the address space limits for which no pages currently
887c478bdstevel@tonic-gate * exist and which map to consecutive file offsets.
897c478bdstevel@tonic-gate */
907c478bdstevel@tonic-gatepage_t *
917c478bdstevel@tonic-gatepvn_read_kluster(
927c478bdstevel@tonic-gate	struct vnode *vp,
937c478bdstevel@tonic-gate	u_offset_t off,
947c478bdstevel@tonic-gate	struct seg *seg,
957c478bdstevel@tonic-gate	caddr_t addr,
967c478bdstevel@tonic-gate	u_offset_t *offp,			/* return values */
977c478bdstevel@tonic-gate	size_t *lenp,				/* return values */
987c478bdstevel@tonic-gate	u_offset_t vp_off,
997c478bdstevel@tonic-gate	size_t vp_len,
1007c478bdstevel@tonic-gate	int isra)
1017c478bdstevel@tonic-gate{
1027c478bdstevel@tonic-gate	ssize_t deltaf, deltab;
1037c478bdstevel@tonic-gate	page_t *pp;
1047c478bdstevel@tonic-gate	page_t *plist = NULL;
1057c478bdstevel@tonic-gate	spgcnt_t pagesavail;
1067c478bdstevel@tonic-gate	u_offset_t vp_end;
1077c478bdstevel@tonic-gate
1087c478bdstevel@tonic-gate	ASSERT(off >= vp_off && off < vp_off + vp_len);
1097c478bdstevel@tonic-gate
1107c478bdstevel@tonic-gate	/*
1117c478bdstevel@tonic-gate	 * We only want to do klustering/read ahead if there
1127c478bdstevel@tonic-gate	 * is more than minfree pages currently available.
1137c478bdstevel@tonic-gate	 */
1147c478bdstevel@tonic-gate	pagesavail = freemem - minfree;
1157c478bdstevel@tonic-gate
1167c478bdstevel@tonic-gate	if (pagesavail <= 0)
1177c478bdstevel@tonic-gate		if (isra)
1187c478bdstevel@tonic-gate			return ((page_t *)NULL);    /* ra case - give up */
1197c478bdstevel@tonic-gate		else
1207c478bdstevel@tonic-gate			pagesavail = 1;		    /* must return a page */
1217c478bdstevel@tonic-gate
1227c478bdstevel@tonic-gate	/* We calculate in pages instead of bytes due to 32-bit overflows */
1237c478bdstevel@tonic-gate	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
1247c478bdstevel@tonic-gate		/*
1257c478bdstevel@tonic-gate		 * Don't have enough free memory for the
1267c478bdstevel@tonic-gate		 * max request, try sizing down vp request.
1277c478bdstevel@tonic-gate		 */
1287c478bdstevel@tonic-gate		deltab = (ssize_t)(off - vp_off);
1297c478bdstevel@tonic-gate		vp_len -= deltab;
1307c478bdstevel@tonic-gate		vp_off += deltab;
1317c478bdstevel@tonic-gate		if (pagesavail < btopr(vp_len)) {
1327c478bdstevel@tonic-gate			/*
1337c478bdstevel@tonic-gate			 * Still not enough memory, just settle for
1347c478bdstevel@tonic-gate			 * pagesavail which is at least 1.
1357c478bdstevel@tonic-gate			 */
1367c478bdstevel@tonic-gate			vp_len = ptob(pagesavail);
1377c478bdstevel@tonic-gate		}
1387c478bdstevel@tonic-gate	}
1397c478bdstevel@tonic-gate
1407c478bdstevel@tonic-gate	vp_end = vp_off + vp_len;
1417c478bdstevel@tonic-gate	ASSERT(off >= vp_off && off < vp_end);
1427c478bdstevel@tonic-gate
1437c478bdstevel@tonic-gate	if (isra && SEGOP_KLUSTER(seg, addr, 0))
1447c478bdstevel@tonic-gate		return ((page_t *)NULL);	/* segment driver says no */
1457c478bdstevel@tonic-gate
1467c478bdstevel@tonic-gate	if ((plist = page_create_va(vp, off,
1477c478bdstevel@tonic-gate	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
1487c478bdstevel@tonic-gate		return ((page_t *)NULL);
1497c478bdstevel@tonic-gate
1507c478bdstevel@tonic-gate	if (vp_len <= PAGESIZE || pvn_nofodklust) {
1517c478bdstevel@tonic-gate		*offp = off;
1527c478bdstevel@tonic-gate		*lenp = MIN(vp_len, PAGESIZE);
1537c478bdstevel@tonic-gate	} else {
1547c478bdstevel@tonic-gate		/*
1557c478bdstevel@tonic-gate		 * Scan back from front by incrementing "deltab" and
1567c478bdstevel@tonic-gate		 * comparing "off" with "vp_off + deltab" to avoid
1577c478bdstevel@tonic-gate		 * "signed" versus "unsigned" conversion problems.
1587c478bdstevel@tonic-gate		 */
1597c478bdstevel@tonic-gate		for (deltab = PAGESIZE; off >= vp_off + deltab;
1607c478bdstevel@tonic-gate		    deltab += PAGESIZE) {
1617c478bdstevel@tonic-gate			/*
1627c478bdstevel@tonic-gate			 * Call back to the segment driver to verify that
1637c478bdstevel@tonic-gate			 * the klustering/read ahead operation makes sense.
1647c478bdstevel@tonic-gate			 */
1657c478bdstevel@tonic-gate			if (SEGOP_KLUSTER(seg, addr, -deltab))
1667c478bdstevel@tonic-gate				break;		/* page not eligible */
1677c478bdstevel@tonic-gate			if ((pp = page_create_va(vp, off - deltab,
1687c478bdstevel@tonic-gate			    PAGESIZE, PG_EXCL, seg, addr - deltab))
1697c478bdstevel@tonic-gate			    == NULL)
1707c478bdstevel@tonic-gate				break;		/* already have the page */
1717c478bdstevel@tonic-gate			/*
1727c478bdstevel@tonic-gate			 * Add page to front of page list.
1737c478bdstevel@tonic-gate			 */
1747c478bdstevel@tonic-gate			page_add(&plist, pp);
1757c478bdstevel@tonic-gate		}
1767c478bdstevel@tonic-gate		deltab -= PAGESIZE;
1777c478bdstevel@tonic-gate
1787c478bdstevel@tonic-gate		/* scan forward from front */
1797c478bdstevel@tonic-gate		for (deltaf = PAGESIZE; off + deltaf < vp_end;
1807c478bdstevel@tonic-gate		    deltaf += PAGESIZE) {
1817c478bdstevel@tonic-gate			/*
1827c478bdstevel@tonic-gate			 * Call back to the segment driver to verify that
1837c478bdstevel@tonic-gate			 * the klustering/read ahead operation makes sense.
1847c478bdstevel@tonic-gate			 */
1857c478bdstevel@tonic-gate			if (SEGOP_KLUSTER(seg, addr, deltaf))
1867c478bdstevel@tonic-gate				break;		/* page not file extension */
1877c478bdstevel@tonic-gate			if ((pp = page_create_va(vp, off + deltaf,
1887c478bdstevel@tonic-gate			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
1897c478bdstevel@tonic-gate			    == NULL)
1907c478bdstevel@tonic-gate				break;		/* already have page */
1917c478bdstevel@tonic-gate
1927c478bdstevel@tonic-gate			/*
1937c478bdstevel@tonic-gate			 * Add page to end of page list.
1947c478bdstevel@tonic-gate			 */
1957c478bdstevel@tonic-gate			page_add(&plist, pp);
1967c478bdstevel@tonic-gate			plist = plist->p_next;
1977c478bdstevel@tonic-gate		}
1987c478bdstevel@tonic-gate		*offp = off = off - deltab;
1997c478bdstevel@tonic-gate		*lenp = deltab + deltaf;
2007c478bdstevel@tonic-gate		ASSERT(off >= vp_off);
2017c478bdstevel@tonic-gate
2027c478bdstevel@tonic-gate		/*
2037c478bdstevel@tonic-gate		 * If we ended up getting more than was actually
2047c478bdstevel@tonic-gate		 * requested, retract the returned length to only
2057c478bdstevel@tonic-gate		 * reflect what was requested.  This might happen
2067c478bdstevel@tonic-gate		 * if we were allowed to kluster pages across a
2077c478bdstevel@tonic-gate		 * span of (say) 5 frags, and frag size is less
2087c478bdstevel@tonic-gate		 * than PAGESIZE.  We need a whole number of
2097c478bdstevel@tonic-gate		 * pages to contain those frags, but the returned
2107c478bdstevel@tonic-gate		 * size should only allow the returned range to
2117c478bdstevel@tonic-gate		 * extend as far as the end of the frags.
2127c478bdstevel@tonic-gate		 */
2137c478bdstevel@tonic-gate		if ((vp_off + vp_len) < (off + *lenp)) {
2147c478bdstevel@tonic-gate			ASSERT(vp_end > off);
2157c478bdstevel@tonic-gate			*lenp = vp_end - off;
2167c478bdstevel@tonic-gate		}
2177c478bdstevel@tonic-gate	}
2187c478bdstevel@tonic-gate	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
219f8bbc57Pavel Filipensky	    "pvn_read_kluster:seg %p addr %x isra %x",
220f8bbc57Pavel Filipensky	    seg, addr, isra);
2217c478bdstevel@tonic-gate	return (plist);
2227c478bdstevel@tonic-gate}
2237c478bdstevel@tonic-gate
2247c478bdstevel@tonic-gate/*
2257c478bdstevel@tonic-gate * Handle pages for this vnode on either side of the page "pp"
2267c478bdstevel@tonic-gate * which has been locked by the caller.  This routine will also
2277c478bdstevel@tonic-gate * do klustering in the range [vp_off, vp_off + vp_len] up
2287c478bdstevel@tonic-gate * until a page which is not found.  The offset and length
2297c478bdstevel@tonic-gate * of pages included is returned in "*offp" and "*lenp".
2307c478bdstevel@tonic-gate *
2317c478bdstevel@tonic-gate * Returns a list of dirty locked pages all ready to be
2327c478bdstevel@tonic-gate * written back.
2337c478bdstevel@tonic-gate */
2347c478bdstevel@tonic-gatepage_t *
2357c478bdstevel@tonic-gatepvn_write_kluster(
2367c478bdstevel@tonic-gate	struct vnode *vp,
2377c478bdstevel@tonic-gate	page_t *pp,
2387c478bdstevel@tonic-gate	u_offset_t *offp,		/* return values */
2397c478bdstevel@tonic-gate	size_t *lenp,			/* return values */
2407c478bdstevel@tonic-gate	u_offset_t vp_off,
2417c478bdstevel@tonic-gate	size_t vp_len,
2427c478bdstevel@tonic-gate	int flags)
2437c478bdstevel@tonic-gate{
2447c478bdstevel@tonic-gate	u_offset_t off;
2457c478bdstevel@tonic-gate	page_t *dirty;
2467c478bdstevel@tonic-gate	size_t deltab, deltaf;
2477c478bdstevel@tonic-gate	se_t se;
2487c478bdstevel@tonic-gate	u_offset_t vp_end;
2497c478bdstevel@tonic-gate
2507c478bdstevel@tonic-gate	off = pp->p_offset;
2517c478bdstevel@tonic-gate
2527c478bdstevel@tonic-gate	/*
2537c478bdstevel@tonic-gate	 * Kustering should not be done if we are invalidating
2547c478bdstevel@tonic-gate	 * pages since we could destroy pages that belong to
2557c478bdstevel@tonic-gate	 * some other process if this is a swap vnode.
2567c478bdstevel@tonic-gate	 */
2577c478bdstevel@tonic-gate	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
2587c478bdstevel@tonic-gate		*offp = off;
2597c478bdstevel@tonic-gate		*lenp = PAGESIZE;
2607c478bdstevel@tonic-gate		return (pp);
2617c478bdstevel@tonic-gate	}
2627c478bdstevel@tonic-gate
2637c478bdstevel@tonic-gate	if (flags & (B_FREE | B_INVAL))
2647c478bdstevel@tonic-gate		se = SE_EXCL;
2657c478bdstevel@tonic-gate	else
2667c478bdstevel@tonic-gate		se = SE_SHARED;
2677c478bdstevel@tonic-gate
2687c478bdstevel@tonic-gate	dirty = pp;
2697c478bdstevel@tonic-gate	/*
2707c478bdstevel@tonic-gate	 * Scan backwards looking for pages to kluster by incrementing
2717c478bdstevel@tonic-gate	 * "deltab" and comparing "off" with "vp_off + deltab" to
2727c478bdstevel@tonic-gate	 * avoid "signed" versus "unsigned" conversion problems.
2737c478bdstevel@tonic-gate	 */
2747c478bdstevel@tonic-gate	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
2757c478bdstevel@tonic-gate		pp = page_lookup_nowait(vp, off - deltab, se);
2767c478bdstevel@tonic-gate		if (pp == NULL)
2777c478bdstevel@tonic-gate			break;		/* page not found */
2787c478bdstevel@tonic-gate		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2797c478bdstevel@tonic-gate			break;
2807c478bdstevel@tonic-gate		page_add(&dirty, pp);
2817c478bdstevel@tonic-gate	}
2827c478bdstevel@tonic-gate	deltab -= PAGESIZE;
2837c478bdstevel@tonic-gate
2847c478bdstevel@tonic-gate	vp_end = vp_off + vp_len;
2857c478bdstevel@tonic-gate	/* now scan forwards looking for pages to kluster */
2867c478bdstevel@tonic-gate	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
2877c478bdstevel@tonic-gate		pp = page_lookup_nowait(vp, off + deltaf, se);
2887c478bdstevel@tonic-gate		if (pp == NULL)
2897c478bdstevel@tonic-gate			break;		/* page not found */
2907c478bdstevel@tonic-gate		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2917c478bdstevel@tonic-gate			break;
2927c478bdstevel@tonic-gate		page_add(&dirty, pp);
2937c478bdstevel@tonic-gate		dirty = dirty->p_next;
2947c478bdstevel@tonic-gate	}
2957c478bdstevel@tonic-gate
2967c478bdstevel@tonic-gate	*offp = off - deltab;
2977c478bdstevel@tonic-gate	*lenp = deltab + deltaf;
2987c478bdstevel@tonic-gate	return (dirty);
2997c478bdstevel@tonic-gate}
3007c478bdstevel@tonic-gate
3017c478bdstevel@tonic-gate/*
3027c478bdstevel@tonic-gate * Generic entry point used to release the "shared/exclusive" lock
3037c478bdstevel@tonic-gate * and the "p_iolock" on pages after i/o is complete.
3047c478bdstevel@tonic-gate */
3057c478bdstevel@tonic-gatevoid
3067c478bdstevel@tonic-gatepvn_io_done(page_t *plist)
3077c478bdstevel@tonic-gate{
3087c478bdstevel@tonic-gate	page_t *pp;
3097c478bdstevel@tonic-gate
3107c478bdstevel@tonic-gate	while (plist != NULL) {
3117c478bdstevel@tonic-gate		pp = plist;
3127c478bdstevel@tonic-gate		page_sub(&plist, pp);
3137c478bdstevel@tonic-gate		page_io_unlock(pp);
3147c478bdstevel@tonic-gate		page_unlock(pp);
3157c478bdstevel@tonic-gate	}
3167c478bdstevel@tonic-gate}
3177c478bdstevel@tonic-gate
3187c478bdstevel@tonic-gate/*
3197c478bdstevel@tonic-gate * Entry point to be used by file system getpage subr's and
3207c478bdstevel@tonic-gate * other such routines which either want to unlock pages (B_ASYNC
3217c478bdstevel@tonic-gate * request) or destroy a list of pages if an error occurred.
3227c478bdstevel@tonic-gate */
3237c478bdstevel@tonic-gatevoid
3247c478bdstevel@tonic-gatepvn_read_done(page_t *plist, int flags)
3257c478bdstevel@tonic-gate{
3267c478bdstevel@tonic-gate	page_t *pp;
3277c478bdstevel@tonic-gate
3287c478bdstevel@tonic-gate	while (plist != NULL) {
3297c478bdstevel@tonic-gate		pp = plist;
3307c478bdstevel@tonic-gate		page_sub(&plist, pp);
3317c478bdstevel@tonic-gate		page_io_unlock(pp);
3327c478bdstevel@tonic-gate		if (flags & B_ERROR) {
3337c478bdstevel@tonic-gate			/*LINTED: constant in conditional context*/
3347c478bdstevel@tonic-gate			VN_DISPOSE(pp, B_INVAL, 0, kcred);
3357c478bdstevel@tonic-gate		} else {
3367c478bdstevel@tonic-gate			(void) page_release(pp, 0);
3377c478bdstevel@tonic-gate		}
3387c478bdstevel@tonic-gate	}
3397c478bdstevel@tonic-gate}
3407c478bdstevel@tonic-gate
3417c478bdstevel@tonic-gate/*
3427c478bdstevel@tonic-gate * Automagic pageout.
3437c478bdstevel@tonic-gate * When memory gets tight, start freeing pages popping out of the
3447c478bdstevel@tonic-gate * write queue.
3457c478bdstevel@tonic-gate */
3467c478bdstevel@tonic-gateint	write_free = 1;
3477c478bdstevel@tonic-gatepgcnt_t	pages_before_pager = 200;	/* LMXXX */
3487c478bdstevel@tonic-gate
3497c478bdstevel@tonic-gate/*
3507c478bdstevel@tonic-gate * Routine to be called when page-out's complete.
3517c478bdstevel@tonic-gate * The caller, typically VOP_PUTPAGE, has to explicity call this routine
3527c478bdstevel@tonic-gate * after waiting for i/o to complete (biowait) to free the list of
3537c478bdstevel@tonic-gate * pages associated with the buffer.  These pages must be locked
3547c478bdstevel@tonic-gate * before i/o is initiated.
3557c478bdstevel@tonic-gate *
3567c478bdstevel@tonic-gate * If a write error occurs, the pages are marked as modified
3577c478bdstevel@tonic-gate * so the write will be re-tried later.
3587c478bdstevel@tonic-gate */
3597c478bdstevel@tonic-gate
3607c478bdstevel@tonic-gatevoid
3617c478bdstevel@tonic-gatepvn_write_done(page_t *plist, int flags)
3627c478bdstevel@tonic-gate{
3637c478bdstevel@tonic-gate	int dfree = 0;
3647c478bdstevel@tonic-gate	int pgrec = 0;
3657c478bdstevel@tonic-gate	int pgout = 0;
3667c478bdstevel@tonic-gate	int pgpgout = 0;
3677c478bdstevel@tonic-gate	int anonpgout = 0;
3687c478bdstevel@tonic-gate	int anonfree = 0;
3697c478bdstevel@tonic-gate	int fspgout = 0;
3707c478bdstevel@tonic-gate	int fsfree = 0;
3717c478bdstevel@tonic-gate	int execpgout = 0;
3727c478bdstevel@tonic-gate	int execfree = 0;
3737c478bdstevel@tonic-gate	page_t *pp;
3747c478bdstevel@tonic-gate	struct cpu *cpup;
3757c478bdstevel@tonic-gate	struct vnode *vp = NULL;	/* for probe */
3767c478bdstevel@tonic-gate	uint_t ppattr;
377a71e32bstans	kmutex_t *vphm = NULL;
3787c478bdstevel@tonic-gate
3797c478bdstevel@tonic-gate	ASSERT((flags & B_READ) == 0);
3807c478bdstevel@tonic-gate
3817c478bdstevel@tonic-gate	/*
3827c478bdstevel@tonic-gate	 * If we are about to start paging anyway, start freeing pages.
3837c478bdstevel@tonic-gate	 */
3847c478bdstevel@tonic-gate	if (write_free && freemem < lotsfree + pages_before_pager &&
3857c478bdstevel@tonic-gate	    (flags & B_ERROR) == 0) {
3867c478bdstevel@tonic-gate		flags |= B_FREE;
3877c478bdstevel@tonic-gate	}
3887c478bdstevel@tonic-gate
3897c478bdstevel@tonic-gate	/*
3907c478bdstevel@tonic-gate	 * Handle each page involved in the i/o operation.
3917c478bdstevel@tonic-gate	 */
3927c478bdstevel@tonic-gate	while (plist != NULL) {
3937c478bdstevel@tonic-gate		pp = plist;
3947c478bdstevel@tonic-gate		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
3957c478bdstevel@tonic-gate		page_sub(&plist, pp);
3967c478bdstevel@tonic-gate
3977c478bdstevel@tonic-gate		/* Kernel probe support */
3987c478bdstevel@tonic-gate		if (vp == NULL)
3997c478bdstevel@tonic-gate			vp = pp->p_vnode;
4007c478bdstevel@tonic-gate
401d961597qiao		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
402a71e32bstans			/*
403a71e32bstans			 * Move page to the top of the v_page list.
404a71e32bstans			 * Skip pages modified during IO.
405a71e32bstans			 */
406a71e32bstans			vphm = page_vnode_mutex(vp);
407a71e32bstans			mutex_enter(vphm);
408a71e32bstans			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
409a71e32bstans				page_vpsub(&vp->v_pages, pp);
410a71e32bstans				page_vpadd(&vp->v_pages, pp);
411a71e32bstans			}
412a71e32bstans			mutex_exit(vphm);
413a71e32bstans		}
414a71e32bstans
4157c478bdstevel@tonic-gate		if (flags & B_ERROR) {
4167c478bdstevel@tonic-gate			/*
4177c478bdstevel@tonic-gate			 * Write operation failed.  We don't want
4187c478bdstevel@tonic-gate			 * to destroy (or free) the page unless B_FORCE
4197c478bdstevel@tonic-gate			 * is set. We set the mod bit again and release
4207c478bdstevel@tonic-gate			 * all locks on the page so that it will get written
4217c478bdstevel@tonic-gate			 * back again later when things are hopefully
4227c478bdstevel@tonic-gate			 * better again.
4237c478bdstevel@tonic-gate			 * If B_INVAL and B_FORCE is set we really have
4247c478bdstevel@tonic-gate			 * to destroy the page.
4257c478bdstevel@tonic-gate			 */
4267c478bdstevel@tonic-gate			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
4277c478bdstevel@tonic-gate				page_io_unlock(pp);
4287c478bdstevel@tonic-gate				/*LINTED: constant in conditional context*/
4297c478bdstevel@tonic-gate				VN_DISPOSE(pp, B_INVAL, 0, kcred);
4307c478bdstevel@tonic-gate			} else {
431d961597qiao				hat_setmod_only(pp);
4327c478bdstevel@tonic-gate				page_io_unlock(pp);
4337c478bdstevel@tonic-gate				page_unlock(pp);
4347c478bdstevel@tonic-gate			}
4357c478bdstevel@tonic-gate		} else if (flags & B_INVAL) {
4367c478bdstevel@tonic-gate			/*
4377c478bdstevel@tonic-gate			 * XXX - Failed writes with B_INVAL set are
4387c478bdstevel@tonic-gate			 * not handled appropriately.
4397c478bdstevel@tonic-gate			 */
4407c478bdstevel@tonic-gate			page_io_unlock(pp);
4417c478bdstevel@tonic-gate			/*LINTED: constant in conditional context*/
4427c478bdstevel@tonic-gate			VN_DISPOSE(pp, B_INVAL, 0, kcred);
4437c478bdstevel@tonic-gate		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
4447c478bdstevel@tonic-gate			/*
4457c478bdstevel@tonic-gate			 * Update statistics for pages being paged out
4467c478bdstevel@tonic-gate			 */
4477c478bdstevel@tonic-gate			if (pp->p_vnode) {
4487c478bdstevel@tonic-gate				if (IS_SWAPFSVP(pp->p_vnode)) {
4497c478bdstevel@tonic-gate					anonpgout++;
4507c478bdstevel@tonic-gate				} else {
4517c478bdstevel@tonic-gate					if (pp->p_vnode->v_flag & VVMEXEC) {
4527c478bdstevel@tonic-gate						execpgout++;
4537c478bdstevel@tonic-gate					} else {
4547c478bdstevel@tonic-gate						fspgout++;
4557c478bdstevel@tonic-gate					}
4567c478bdstevel@tonic-gate				}
4577c478bdstevel@tonic-gate			}
4587c478bdstevel@tonic-gate			page_io_unlock(pp);
4597c478bdstevel@tonic-gate			pgout = 1;
4607c478bdstevel@tonic-gate			pgpgout++;
4617c478bdstevel@tonic-gate			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
462f8bbc57Pavel Filipensky			    "page_ws_out:pp %p", pp);
4637c478bdstevel@tonic-gate
4647c478bdstevel@tonic-gate			/*
4657c478bdstevel@tonic-gate			 * The page_struct_lock need not be acquired to
4667c478bdstevel@tonic-gate			 * examine "p_lckcnt" and "p_cowcnt" since we'll
4677c478bdstevel@tonic-gate			 * have an "exclusive" lock if the upgrade succeeds.
4687c478bdstevel@tonic-gate			 */
4697c478bdstevel@tonic-gate			if (page_tryupgrade(pp) &&
4707c478bdstevel@tonic-gate			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
4717c478bdstevel@tonic-gate				/*
4727c478bdstevel@tonic-gate				 * Check if someone has reclaimed the
4737c478bdstevel@tonic-gate				 * page.  If ref and mod are not set, no
4747c478bdstevel@tonic-gate				 * one is using it so we can free it.
4757c478bdstevel@tonic-gate				 * The rest of the system is careful
4767c478bdstevel@tonic-gate				 * to use the NOSYNC flag to unload
4777c478bdstevel@tonic-gate				 * translations set up for i/o w/o
4787c478bdstevel@tonic-gate				 * affecting ref and mod bits.
4797c478bdstevel@tonic-gate				 *
4807c478bdstevel@tonic-gate				 * Obtain a copy of the real hardware
4817c478bdstevel@tonic-gate				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
4827c478bdstevel@tonic-gate				 * to avoid having to flush the cache.
4837c478bdstevel@tonic-gate				 */
4847c478bdstevel@tonic-gate				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
485f8bbc57Pavel Filipensky				    HAT_SYNC_STOPON_MOD);
4867c478bdstevel@tonic-gate			ck_refmod:
4877c478bdstevel@tonic-gate				if (!(ppattr & (P_REF | P_MOD))) {
4887c478bdstevel@tonic-gate					if (hat_page_is_mapped(pp)) {
4897c478bdstevel@tonic-gate						/*
4907c478bdstevel@tonic-gate						 * Doesn't look like the page
4917c478bdstevel@tonic-gate						 * was modified so now we
4927c478bdstevel@tonic-gate						 * really have to unload the
4937c478bdstevel@tonic-gate						 * translations.  Meanwhile
4947c478bdstevel@tonic-gate						 * another CPU could've
4957c478bdstevel@tonic-gate						 * modified it so we have to
4967c478bdstevel@tonic-gate						 * check again.  We don't loop
4977c478bdstevel@tonic-gate						 * forever here because now
4987c478bdstevel@tonic-gate						 * the translations are gone
4997c478bdstevel@tonic-gate						 * and no one can get a new one
5007c478bdstevel@tonic-gate						 * since we have the "exclusive"
5017c478bdstevel@tonic-gate						 * lock on the page.
5027c478bdstevel@tonic-gate						 */
5037c478bdstevel@tonic-gate						(void) hat_pageunload(pp,
504f8bbc57Pavel Filipensky						    HAT_FORCE_PGUNLOAD);
5057c478bdstevel@tonic-gate						ppattr = hat_page_getattr(pp,
506f8bbc57Pavel Filipensky						    P_REF | P_MOD);
5077c478bdstevel@tonic-gate						goto ck_refmod;
5087c478bdstevel@tonic-gate					}
5097c478bdstevel@tonic-gate					/*
5107c478bdstevel@tonic-gate					 * Update statistics for pages being
5117c478bdstevel@tonic-gate					 * freed
5127c478bdstevel@tonic-gate					 */
5137c478bdstevel@tonic-gate					if (pp->p_vnode) {
5147c478bdstevel@tonic-gate						if (IS_SWAPFSVP(pp->p_vnode)) {
5157c478bdstevel@tonic-gate							anonfree++;
5167c478bdstevel@tonic-gate						} else {
5177c478bdstevel@tonic-gate							if (pp->p_vnode->v_flag
5187c478bdstevel@tonic-gate							    & VVMEXEC) {
5197c478bdstevel@tonic-gate								execfree++;
5207c478bdstevel@tonic-gate							} else {
5217c478bdstevel@tonic-gate								fsfree++;
5227c478bdstevel@tonic-gate							}
5237c478bdstevel@tonic-gate						}
5247c478bdstevel@tonic-gate					}
5257c478bdstevel@tonic-gate					/*LINTED: constant in conditional ctx*/
5267c478bdstevel@tonic-gate					VN_DISPOSE(pp, B_FREE,
527f8bbc57Pavel Filipensky					    (flags & B_DONTNEED), kcred);
5287c478bdstevel@tonic-gate					dfree++;
5297c478bdstevel@tonic-gate				} else {
5307c478bdstevel@tonic-gate					page_unlock(pp);
5317c478bdstevel@tonic-gate					pgrec++;
5327c478bdstevel@tonic-gate					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
5337c478bdstevel@tonic-gate					    "page_ws_free:pp %p", pp);
5347c478bdstevel@tonic-gate				}
5357c478bdstevel@tonic-gate			} else {
5367c478bdstevel@tonic-gate				/*
5377c478bdstevel@tonic-gate				 * Page is either `locked' in memory
5387c478bdstevel@tonic-gate				 * or was reclaimed and now has a
5397c478bdstevel@tonic-gate				 * "shared" lock, so release it.
5407c478bdstevel@tonic-gate				 */
5417c478bdstevel@tonic-gate				page_unlock(pp);
5427c478bdstevel@tonic-gate			}
5437c478bdstevel@tonic-gate		} else {
5447c478bdstevel@tonic-gate			/*
5457c478bdstevel@tonic-gate			 * Neither B_FREE nor B_INVAL nor B_ERROR.
5467c478bdstevel@tonic-gate			 * Just release locks.
5477c478bdstevel@tonic-gate			 */
5487c478bdstevel@tonic-gate			page_io_unlock(pp);
5497c478bdstevel@tonic-gate			page_unlock(pp);
5507c478bdstevel@tonic-gate		}
5517c478bdstevel@tonic-gate	}
5527c478bdstevel@tonic-gate
5537c478bdstevel@tonic-gate	CPU_STATS_ENTER_K();
5547c478bdstevel@tonic-gate	cpup = CPU;		/* get cpup now that CPU cannot change */
5557c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
5567c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
5577c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
5587c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
5597c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
5607c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
5617c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
5627c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
5637c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
5647c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
5657c478bdstevel@tonic-gate	CPU_STATS_EXIT_K();
5667c478bdstevel@tonic-gate
5677c478bdstevel@tonic-gate	/* Kernel probe */
5687c478bdstevel@tonic-gate	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
569f8bbc57Pavel Filipensky	    tnf_opaque,	vnode,			vp,
570f8bbc57Pavel Filipensky	    tnf_ulong,	pages_pageout,		pgpgout,
571f8bbc57Pavel Filipensky	    tnf_ulong,	pages_freed,		dfree,
572f8bbc57Pavel Filipensky	    tnf_ulong,	pages_reclaimed,	pgrec);
5737c478bdstevel@tonic-gate}
5747c478bdstevel@tonic-gate
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1, if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
5887c478bdstevel@tonic-gateint
5897c478bdstevel@tonic-gatepvn_getdirty(page_t *pp, int flags)
5907c478bdstevel@tonic-gate{
5917c478bdstevel@tonic-gate	ASSERT((flags & (B_INVAL | B_FREE)) ?
5927c478bdstevel@tonic-gate	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
5937c478bdstevel@tonic-gate	ASSERT(PP_ISFREE(pp) == 0);
5947c478bdstevel@tonic-gate
5957c478bdstevel@tonic-gate	/*
5967c478bdstevel@tonic-gate	 * If trying to invalidate or free a logically `locked' page,
5977c478bdstevel@tonic-gate	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
5987c478bdstevel@tonic-gate	 * p_cowcnt as the page is exclusively locked.
5997c478bdstevel@tonic-gate	 */
6007c478bdstevel@tonic-gate	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
6017c478bdstevel@tonic-gate	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
6027c478bdstevel@tonic-gate		page_unlock(pp);
6037c478bdstevel@tonic-gate		return (0);
6047c478bdstevel@tonic-gate	}
6057c478bdstevel@tonic-gate
6067c478bdstevel@tonic-gate	/*
6077c478bdstevel@tonic-gate	 * Now acquire the i/o lock so we can add it to the dirty
6087c478bdstevel@tonic-gate	 * list (if necessary).  We avoid blocking on the i/o lock
6097c478bdstevel@tonic-gate	 * in the following cases:
6107c478bdstevel@tonic-gate	 *
6117c478bdstevel@tonic-gate	 *	If B_DELWRI is set, which implies that this request is
6127c478bdstevel@tonic-gate	 *	due to a klustering operartion.
6137c478bdstevel@tonic-gate	 *
6147c478bdstevel@tonic-gate	 *	If this is an async (B_ASYNC) operation and we are not doing
6157c478bdstevel@tonic-gate	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
6167c478bdstevel@tonic-gate	 *	that the the page is written out].
6177c478bdstevel@tonic-gate	 */
6187c478bdstevel@tonic-gate	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
6197c478bdstevel@tonic-gate		if (!page_io_trylock(pp)) {
6207c478bdstevel@tonic-gate			page_unlock(pp);
6217c478bdstevel@tonic-gate			return (0);
6227c478bdstevel@tonic-gate		}
6237c478bdstevel@tonic-gate	} else {
6247c478bdstevel@tonic-gate		page_io_lock(pp);
6257c478bdstevel@tonic-gate	}
6267c478bdstevel@tonic-gate
6277c478bdstevel@tonic-gate	/*
6287c478bdstevel@tonic-gate	 * If we want to free or invalidate the page then
6297c478bdstevel@tonic-gate	 * we need to unload it so that anyone who wants
6307c478bdstevel@tonic-gate	 * it will have to take a minor fault to get it.
6317c478bdstevel@tonic-gate	 * Otherwise, we're just writing the page back so we
6327c478bdstevel@tonic-gate	 * need to sync up the hardwre and software mod bit to
6337c478bdstevel@tonic-gate	 * detect any future modifications.  We clear the
6347c478bdstevel@tonic-gate	 * software mod bit when we put the page on the dirty
6357c478bdstevel@tonic-gate	 * list.
6367c478bdstevel@tonic-gate	 */
6377c478bdstevel@tonic-gate	if (flags & (B_INVAL | B_FREE)) {
6387c478bdstevel@tonic-gate		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6397c478bdstevel@tonic-gate	} else {
6407c478bdstevel@tonic-gate		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
6417c478bdstevel@tonic-gate	}
6427c478bdstevel@tonic-gate
6437c478bdstevel@tonic-gate	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
6447c478bdstevel@tonic-gate		/*
6457c478bdstevel@tonic-gate		 * Don't need to add it to the
6467c478bdstevel@tonic-gate		 * list after all.
6477c478bdstevel@tonic-gate		 */
6487c478bdstevel@tonic-gate		page_io_unlock(pp);
6497c478bdstevel@tonic-gate		if (flags & B_INVAL) {
6507c478bdstevel@tonic-gate			/*LINTED: constant in conditional context*/
6517c478bdstevel@tonic-gate			VN_DISPOSE(pp, B_INVAL, 0, kcred);
6527c478bdstevel@tonic-gate		} else if (flags & B_FREE) {
6537c478bdstevel@tonic-gate			/*LINTED: constant in conditional context*/
6547c478bdstevel@tonic-gate			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
6557c478bdstevel@tonic-gate		} else {
6567c478bdstevel@tonic-gate			/*
6577c478bdstevel@tonic-gate			 * This is advisory path for the callers
6587c478bdstevel@tonic-gate			 * of VOP_PUTPAGE() who prefer freeing the
6597c478bdstevel@tonic-gate			 * page _only_ if no one else is accessing it.
6607c478bdstevel@tonic-gate			 * E.g. segmap_release()
6617c478bdstevel@tonic-gate			 *
6627c478bdstevel@tonic-gate			 * The above hat_ismod() check is useless because:
6637c478bdstevel@tonic-gate			 * (1) we may not be holding SE_EXCL lock;
6647c478bdstevel@tonic-gate			 * (2) we've not unloaded _all_ translations
6657c478bdstevel@tonic-gate			 *
6667c478bdstevel@tonic-gate			 * Let page_release() do the heavy-lifting.
6677c478bdstevel@tonic-gate			 */
6687c478bdstevel@tonic-gate			(void) page_release(pp, 1);
6697c478bdstevel@tonic-gate		}
6707c478bdstevel@tonic-gate		return (0);
6717c478bdstevel@tonic-gate	}
6727c478bdstevel@tonic-gate
6737c478bdstevel@tonic-gate	/*
6747c478bdstevel@tonic-gate	 * Page is dirty, get it ready for the write back
6757c478bdstevel@tonic-gate	 * and add page to the dirty list.
6767c478bdstevel@tonic-gate	 */
6777c478bdstevel@tonic-gate	hat_clrrefmod(pp);
6787c478bdstevel@tonic-gate
6797c478bdstevel@tonic-gate	/*
6807c478bdstevel@tonic-gate	 * If we're going to free the page when we're done
6817c478bdstevel@tonic-gate	 * then we can let others try to use it starting now.
6827c478bdstevel@tonic-gate	 * We'll detect the fact that they used it when the
6837c478bdstevel@tonic-gate	 * i/o is done and avoid freeing the page.
6847c478bdstevel@tonic-gate	 */
6857c478bdstevel@tonic-gate	if (flags & B_FREE)
6867c478bdstevel@tonic-gate		page_downgrade(pp);
6877c478bdstevel@tonic-gate
6887c478bdstevel@tonic-gate
6897c478bdstevel@tonic-gate	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
6907c478bdstevel@tonic-gate
6917c478bdstevel@tonic-gate	return (1);
6927c478bdstevel@tonic-gate}
6937c478bdstevel@tonic-gate
6947c478bdstevel@tonic-gate
6957c478bdstevel@tonic-gate/*ARGSUSED*/
6967c478bdstevel@tonic-gatestatic int
6977c478bdstevel@tonic-gatemarker_constructor(void *buf, void *cdrarg, int kmflags)
6987c478bdstevel@tonic-gate{
6997c478bdstevel@tonic-gate	page_t *mark = buf;
7007c478bdstevel@tonic-gate	bzero(mark, sizeof (page_t));
701f8bbc57Pavel Filipensky	mark->p_hash = PVN_VPLIST_HASH_TAG;
7027c478bdstevel@tonic-gate	return (0);
7037c478bdstevel@tonic-gate}
7047c478bdstevel@tonic-gate
7057c478bdstevel@tonic-gatevoid
7067c478bdstevel@tonic-gatepvn_init()
7077c478bdstevel@tonic-gate{
7087c478bdstevel@tonic-gate	if (pvn_vmodsort_disable == 0)
7097c478bdstevel@tonic-gate		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
7107c478bdstevel@tonic-gate	marker_cache = kmem_cache_create("marker_cache",
7117c478bdstevel@tonic-gate	    sizeof (page_t), 0, marker_constructor,
7127c478bdstevel@tonic-gate	    NULL, NULL, NULL, NULL, 0);
7137c478bdstevel@tonic-gate}
7147c478bdstevel@tonic-gate
7157c478bdstevel@tonic-gate
7167c478bdstevel@tonic-gate/*
7177c478bdstevel@tonic-gate * Process a vnode's page list for all pages whose offset is >= off.
7187c478bdstevel@tonic-gate * Pages are to either be free'd, invalidated, or written back to disk.
7197c478bdstevel@tonic-gate *
7207c478bdstevel@tonic-gate * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
7217c478bdstevel@tonic-gate * is specified, otherwise they are "shared" locked.
7227c478bdstevel@tonic-gate *
7237c478bdstevel@tonic-gate * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
7247c478bdstevel@tonic-gate *
7257c478bdstevel@tonic-gate * Special marker page_t's are inserted in the list in order
7267c478bdstevel@tonic-gate * to keep track of where we are in the list when locks are dropped.
7277c478bdstevel@tonic-gate *
7287c478bdstevel@tonic-gate * Note the list is circular and insertions can happen only at the
7297c478bdstevel@tonic-gate * head and tail of the list. The algorithm ensures visiting all pages
7307c478bdstevel@tonic-gate * on the list in the following way:
7317c478bdstevel@tonic-gate *
7327c478bdstevel@tonic-gate *    Drop two marker pages at the end of the list.
7337c478bdstevel@tonic-gate *
7347c478bdstevel@tonic-gate *    Move one marker page backwards towards the start of the list until
7357c478bdstevel@tonic-gate *    it is at the list head, processing the pages passed along the way.
7367c478bdstevel@tonic-gate *
7377c478bdstevel@tonic-gate *    Due to race conditions when the vphm mutex is dropped, additional pages
7387c478bdstevel@tonic-gate *    can be added to either end of the list, so we'll continue to move
7397c478bdstevel@tonic-gate *    the marker and process pages until it is up against the end marker.
7407c478bdstevel@tonic-gate *
7417c478bdstevel@tonic-gate * There is one special exit condition. If we are processing a VMODSORT
7427c478bdstevel@tonic-gate * vnode and only writing back modified pages, we can stop as soon as
7437c478bdstevel@tonic-gate * we run into an unmodified page.  This makes fsync(3) operations fast.
7447c478bdstevel@tonic-gate */
7457c478bdstevel@tonic-gateint
7467c478bdstevel@tonic-gatepvn_vplist_dirty(
7477c478bdstevel@tonic-gate	vnode_t		*vp,
7487c478bdstevel@tonic-gate	u_offset_t	off,
7497c478bdstevel@tonic-gate	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
7507c478bdstevel@tonic-gate			size_t *, int, cred_t *),
7517c478bdstevel@tonic-gate	int		flags,
7527c478bdstevel@tonic-gate	cred_t		*cred)
7537c478bdstevel@tonic-gate{
7547c478bdstevel@tonic-gate	page_t		*pp;
7557c478bdstevel@tonic-gate	page_t		*mark;		/* marker page that moves toward head */
7567c478bdstevel@tonic-gate	page_t		*end;		/* marker page at end of list */
7577c478bdstevel@tonic-gate	int		err = 0;
7587c478bdstevel@tonic-gate	int		error;
7597c478bdstevel@tonic-gate	kmutex_t	*vphm;
7607c478bdstevel@tonic-gate	se_t		se;
7617c478bdstevel@tonic-gate	page_t		**where_to_move;
7627c478bdstevel@tonic-gate
7637c478bdstevel@tonic-gate	ASSERT(vp->v_type != VCHR);
7647c478bdstevel@tonic-gate
7657c478bdstevel@tonic-gate	if (vp->v_pages == NULL)
7667c478bdstevel@tonic-gate		return (0);
7677c478bdstevel@tonic-gate
7687c478bdstevel@tonic-gate
7697c478bdstevel@tonic-gate	/*
7707c478bdstevel@tonic-gate	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
7717c478bdstevel@tonic-gate	 *
7727c478bdstevel@tonic-gate	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
7737c478bdstevel@tonic-gate	 * from getting blocked while flushing pages to a dead NFS server.
7747c478bdstevel@tonic-gate	 */
7757c478bdstevel@tonic-gate	mutex_enter(&vp->v_lock);
7767c478bdstevel@tonic-gate	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
7777c478bdstevel@tonic-gate		mutex_exit(&vp->v_lock);
7787c478bdstevel@tonic-gate		return (EAGAIN);
7797c478bdstevel@tonic-gate	}
7807c478bdstevel@tonic-gate
7817c478bdstevel@tonic-gate	while (vp->v_flag & VVMLOCK)
7827c478bdstevel@tonic-gate		cv_wait(&vp->v_cv, &vp->v_lock);
7837c478bdstevel@tonic-gate
7847c478bdstevel@tonic-gate	if (vp->v_pages == NULL) {
7857c478bdstevel@tonic-gate		mutex_exit(&vp->v_lock);
7867c478bdstevel@tonic-gate		return (0);
7877c478bdstevel@tonic-gate	}
7887c478bdstevel@tonic-gate
7897c478bdstevel@tonic-gate	vp->v_flag |= VVMLOCK;
7907c478bdstevel@tonic-gate	mutex_exit(&vp->v_lock);
7917c478bdstevel@tonic-gate
7927c478bdstevel@tonic-gate
7937c478bdstevel@tonic-gate	/*
7947c478bdstevel@tonic-gate	 * Set up the marker pages used to walk the list
7957c478bdstevel@tonic-gate	 */
7967c478bdstevel@tonic-gate	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
7977c478bdstevel@tonic-gate	end->p_vnode = vp;
7987c478bdstevel@tonic-gate	end->p_offset = (u_offset_t)-2;
7997c478bdstevel@tonic-gate	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
8007c478bdstevel@tonic-gate	mark->p_vnode = vp;
8017c478bdstevel@tonic-gate	mark->p_offset = (u_offset_t)-1;
8027c478bdstevel@tonic-gate
8037c478bdstevel@tonic-gate	/*
8047c478bdstevel@tonic-gate	 * Grab the lock protecting the vnode's page list
8057c478bdstevel@tonic-gate	 * note that this lock is dropped at times in the loop.
8067c478bdstevel@tonic-gate	 */
8077c478bdstevel@tonic-gate	vphm = page_vnode_mutex(vp);
8087c478bdstevel@tonic-gate	mutex_enter(vphm);
8097c478bdstevel@tonic-gate	if (vp->v_pages == NULL)
8107c478bdstevel@tonic-gate		goto leave;
8117c478bdstevel@tonic-gate
8127c478bdstevel@tonic-gate	/*
8137c478bdstevel@tonic-gate	 * insert the markers and loop through the list of pages
8147c478bdstevel@tonic-gate	 */
8157c478bdstevel@tonic-gate	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
8167c478bdstevel@tonic-gate	page_vpadd(&mark->p_vpnext, end);
8177c478bdstevel@tonic-gate	for (;;) {
8187c478bdstevel@tonic-gate
8197c478bdstevel@tonic-gate		/*
8207c478bdstevel@tonic-gate		 * If only doing an async write back, then we can
8217c478bdstevel@tonic-gate		 * stop as soon as we get to start of the list.
8227c478bdstevel@tonic-gate		 */
8237c478bdstevel@tonic-gate		if (flags == B_ASYNC && vp->v_pages == mark)
8247c478bdstevel@tonic-gate			break;
8257c478bdstevel@tonic-gate
8267c478bdstevel@tonic-gate		/*
8277c478bdstevel@tonic-gate		 * otherwise stop when we've gone through all the pages
8287c478bdstevel@tonic-gate		 */
8297c478bdstevel@tonic-gate		if (mark->p_vpprev == end)
8307c478bdstevel@tonic-gate			break;
8317c478bdstevel@tonic-gate
8327c478bdstevel@tonic-gate		pp = mark->p_vpprev;
8337c478bdstevel@tonic-gate		if (vp->v_pages == pp)
8347c478bdstevel@tonic-gate			where_to_move = &vp->v_pages;
8357c478bdstevel@tonic-gate		else
8367c478bdstevel@tonic-gate			where_to_move = &pp->p_vpprev->p_vpnext;
8377c478bdstevel@tonic-gate
8387c478bdstevel@tonic-gate		ASSERT(pp->p_vnode == vp);
8397c478bdstevel@tonic-gate
8407c478bdstevel@tonic-gate		/*
8417c478bdstevel@tonic-gate		 * If just flushing dirty pages to disk and this vnode
8427c478bdstevel@tonic-gate		 * is using a sorted list of pages, we can stop processing
8437c478bdstevel@tonic-gate		 * as soon as we find an unmodified page. Since all the
8447c478bdstevel@tonic-gate		 * modified pages are visited first.
8457c478bdstevel@tonic-gate		 */
8467c478bdstevel@tonic-gate		if (IS_VMODSORT(vp) &&
847a71e32bstans		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
848a71e32bstans			if (!hat_ismod(pp) && !page_io_locked(pp)) {
8497c478bdstevel@tonic-gate#ifdef  DEBUG
850a71e32bstans				/*
851a71e32bstans				 * For debug kernels examine what should be
852a71e32bstans				 * all the remaining clean pages, asserting
853a71e32bstans				 * that they are not modified.
854a71e32bstans				 */
855a71e32bstans				page_t	*chk = pp;
856a71e32bstans				int	attr;
857a71e32bstans
858a71e32bstans				page_vpsub(&vp->v_pages, mark);
859a71e32bstans				page_vpadd(where_to_move, mark);
860a71e32bstans				do {
861a71e32bstans					chk = chk->p_vpprev;
862a71e32bstans					ASSERT(chk != end);
863a71e32bstans					if (chk == mark)
864a71e32bstans						continue;
865a71e32bstans					attr = hat_page_getattr(chk, P_MOD |
866a71e32bstans					    P_REF);
867a71e32bstans					if ((attr & P_MOD) == 0)
868a71e32bstans						continue;
869a71e32bstans					panic("v_pages list not all clean: "
870a71e32bstans					    "page_t*=%p vnode=%p off=%lx "
871a71e32bstans					    "attr=0x%x last clean page_t*=%p\n",
872a71e32bstans					    (void *)chk, (void *)chk->p_vnode,
873a71e32bstans					    (long)chk->p_offset, attr,
874a71e32bstans					    (void *)pp);
875a71e32bstans				} while (chk != vp->v_pages);
8767c478bdstevel@tonic-gate#endif
877a71e32bstans				break;
878a71e32bstans			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
879a71e32bstans				/*
880a71e32bstans				 * Couldn't get io lock, wait until IO is done.
881a71e32bstans				 * Block only for sync IO since we don't want
882a71e32bstans				 * to block async IO.
883a71e32bstans				 */
884a71e32bstans				mutex_exit(vphm);
885a71e32bstans				page_io_wait(pp);
886a71e32bstans				mutex_enter(vphm);
887a71e32bstans				continue;
888a71e32bstans			}
8897c478bdstevel@tonic-gate		}
8907c478bdstevel@tonic-gate
8917c478bdstevel@tonic-gate		/*
892adbe22ePeter Telford		 * Skip this page if the offset is out of the desired range.
893adbe22ePeter Telford		 * Just move the marker and continue.
894adbe22ePeter Telford		 */
895adbe22ePeter Telford		if (pp->p_offset < off) {
896adbe22ePeter Telford			page_vpsub(&vp->v_pages, mark);
897adbe22ePeter Telford			page_vpadd(where_to_move, mark);
898adbe22ePeter Telford			continue;
899adbe22ePeter Telford		}
900adbe22ePeter Telford
901adbe22ePeter Telford		/*
9027c478bdstevel@tonic-gate		 * If we are supposed to invalidate or free this
9037c478bdstevel@tonic-gate		 * page, then we need an exclusive lock.
9047c478bdstevel@tonic-gate		 */
9057c478bdstevel@tonic-gate		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
9067c478bdstevel@tonic-gate
9077c478bdstevel@tonic-gate		/*
9087c478bdstevel@tonic-gate		 * We must acquire the page lock for all synchronous
9097c478bdstevel@tonic-gate		 * operations (invalidate, free and write).
9107c478bdstevel@tonic-gate		 */
9117c478bdstevel@tonic-gate		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
9127c478bdstevel@tonic-gate			/*
9137c478bdstevel@tonic-gate			 * If the page_lock() drops the mutex
9147c478bdstevel@tonic-gate			 * we must retry the loop.
9157c478bdstevel@tonic-gate			 */
9167c478bdstevel@tonic-gate			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
9177c478bdstevel@tonic-gate				continue;
9187c478bdstevel@tonic-gate
9197c478bdstevel@tonic-gate			/*
9207c478bdstevel@tonic-gate			 * It's ok to move the marker page now.
9217c478bdstevel@tonic-gate			 */
9227c478bdstevel@tonic-gate			page_vpsub(&vp->v_pages, mark);
9237c478bdstevel@tonic-gate			page_vpadd(where_to_move, mark);
9247c478bdstevel@tonic-gate		} else {
9257c478bdstevel@tonic-gate
9267c478bdstevel@tonic-gate			/*
9277c478bdstevel@tonic-gate			 * update the marker page for all remaining cases
9287c478bdstevel@tonic-gate			 */
9297c478bdstevel@tonic-gate			page_vpsub(&vp->v_pages, mark);
9307c478bdstevel@tonic-gate			page_vpadd(where_to_move, mark);
9317c478bdstevel@tonic-gate
9327c478bdstevel@tonic-gate			/*
9337c478bdstevel@tonic-gate			 * For write backs, If we can't lock the page, it's
9347c478bdstevel@tonic-gate			 * invalid or in the process of being destroyed.  Skip
9357c478bdstevel@tonic-gate			 * it, assuming someone else is writing it.
9367c478bdstevel@tonic-gate			 */
9377c478bdstevel@tonic-gate			if (!page_trylock(pp, se))
9387c478bdstevel@tonic-gate				continue;
9397c478bdstevel@tonic-gate		}
9407c478bdstevel@tonic-gate
9417c478bdstevel@tonic-gate		ASSERT(pp->p_vnode == vp);
9427c478bdstevel@tonic-gate
9437c478bdstevel@tonic-gate		/*
9447c478bdstevel@tonic-gate		 * Successfully locked the page, now figure out what to
9457c478bdstevel@tonic-gate		 * do with it. Free pages are easily dealt with, invalidate
9467c478bdstevel@tonic-gate		 * if desired or just go on to the next page.
9477c478bdstevel@tonic-gate		 */
9487c478bdstevel@tonic-gate		if (PP_ISFREE(pp)) {
9497c478bdstevel@tonic-gate			if ((flags & B_INVAL) == 0) {
9507c478bdstevel@tonic-gate				page_unlock(pp);
9517c478bdstevel@tonic-gate				continue;
9527c478bdstevel@tonic-gate			}
9537c478bdstevel@tonic-gate
9547c478bdstevel@tonic-gate			/*
9557c478bdstevel@tonic-gate			 * Invalidate (destroy) the page.
9567c478bdstevel@tonic-gate			 */
9577c478bdstevel@tonic-gate			mutex_exit(vphm);
9587c478bdstevel@tonic-gate			page_destroy_free(pp);
9597c478bdstevel@tonic-gate			mutex_enter(vphm);
9607c478bdstevel@tonic-gate			continue;
9617c478bdstevel@tonic-gate		}
9627c478bdstevel@tonic-gate
9637c478bdstevel@tonic-gate		/*
9647c478bdstevel@tonic-gate		 * pvn_getdirty() figures out what do do with a dirty page.
9657c478bdstevel@tonic-gate		 * If the page is dirty, the putapage() routine will write it
9667c478bdstevel@tonic-gate		 * and will kluster any other adjacent dirty pages it can.
9677c478bdstevel@tonic-gate		 *
9687c478bdstevel@tonic-gate		 * pvn_getdirty() and `(*putapage)' unlock the page.
9697c478bdstevel@tonic-gate		 */
9707c478bdstevel@tonic-gate		mutex_exit(vphm);
9717c478bdstevel@tonic-gate		if (pvn_getdirty(pp, flags)) {
9727c478bdstevel@tonic-gate			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
9737c478bdstevel@tonic-gate			if (!err)
9747c478bdstevel@tonic-gate				err = error;
9757c478bdstevel@tonic-gate		}
9767c478bdstevel@tonic-gate		mutex_enter(vphm);
9777c478bdstevel@tonic-gate	}
9787c478bdstevel@tonic-gate	page_vpsub(&vp->v_pages, mark);
9797c478bdstevel@tonic-gate	page_vpsub(&vp->v_pages, end);
9807c478bdstevel@tonic-gate
9817c478bdstevel@tonic-gateleave:
9827c478bdstevel@tonic-gate	/*
9837c478bdstevel@tonic-gate	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
9847c478bdstevel@tonic-gate	 */
9857c478bdstevel@tonic-gate	mutex_exit(vphm);
9867c478bdstevel@tonic-gate	kmem_cache_free(marker_cache, mark);
9877c478bdstevel@tonic-gate	kmem_cache_free(marker_cache, end);
9887c478bdstevel@tonic-gate	mutex_enter(&vp->v_lock);
9897c478bdstevel@tonic-gate	vp->v_flag &= ~VVMLOCK;
9907c478bdstevel@tonic-gate	cv_broadcast(&vp->v_cv);
9917c478bdstevel@tonic-gate	mutex_exit(&vp->v_lock);
9927c478bdstevel@tonic-gate	return (err);
9937c478bdstevel@tonic-gate}
9947c478bdstevel@tonic-gate
9957c478bdstevel@tonic-gate/*
996f8bbc57Pavel Filipensky * Walk the vp->v_pages list, for every page call the callback function
997f8bbc57Pavel Filipensky * pointed by *page_check. If page_check returns non-zero, then mark the
998f8bbc57Pavel Filipensky * page as modified and if VMODSORT is set, move it to the end of v_pages
999f8bbc57Pavel Filipensky * list. Moving makes sense only if we have at least two pages - this also
1000f8bbc57Pavel Filipensky * avoids having v_pages temporarily being NULL after calling page_vpsub()
1001f8bbc57Pavel Filipensky * if there was just one page.
1002f8bbc57Pavel Filipensky */
1003f8bbc57Pavel Filipenskyvoid
1004f8bbc57Pavel Filipenskypvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1005f8bbc57Pavel Filipensky{
1006f8bbc57Pavel Filipensky	page_t	*pp, *next, *end;
1007f8bbc57Pavel Filipensky	kmutex_t	*vphm;
1008f8bbc57Pavel Filipensky	int	shuffle;
1009f8bbc57Pavel Filipensky
1010f8bbc57Pavel Filipensky	vphm = page_vnode_mutex(vp);
1011f8bbc57Pavel Filipensky	mutex_enter(vphm);
1012f8bbc57Pavel Filipensky
1013f8bbc57Pavel Filipensky	if (vp->v_pages == NULL) {
1014f8bbc57Pavel Filipensky		mutex_exit(vphm);
1015f8bbc57Pavel Filipensky		return;
1016f8bbc57Pavel Filipensky	}
1017f8bbc57Pavel Filipensky
1018f8bbc57Pavel Filipensky	end = vp->v_pages->p_vpprev;
1019f8bbc57Pavel Filipensky	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1020f8bbc57Pavel Filipensky	pp = vp->v_pages;
1021f8bbc57Pavel Filipensky
1022f8bbc57Pavel Filipensky	for (;;) {
1023f8bbc57Pavel Filipensky		next = pp->p_vpnext;
1024f8bbc57Pavel Filipensky		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1025f8bbc57Pavel Filipensky			/*
1026f8bbc57Pavel Filipensky			 * hat_setmod_only() in contrast to hat_setmod() does
1027f8bbc57Pavel Filipensky			 * not shuffle the pages and does not grab the mutex
1028f8bbc57Pavel Filipensky			 * page_vnode_mutex. Exactly what we need.
1029f8bbc57Pavel Filipensky			 */
1030f8bbc57Pavel Filipensky			hat_setmod_only(pp);
1031f8bbc57Pavel Filipensky			if (shuffle) {
1032f8bbc57Pavel Filipensky				page_vpsub(&vp->v_pages, pp);
1033f8bbc57Pavel Filipensky				ASSERT(vp->v_pages != NULL);
1034f8bbc57Pavel Filipensky				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1035f8bbc57Pavel Filipensky				    pp);
1036f8bbc57Pavel Filipensky			}
1037f8bbc57Pavel Filipensky		}
1038f8bbc57Pavel Filipensky		/* Stop if we have just processed the last page. */
1039f8bbc57Pavel Filipensky		if (pp == end)
1040f8bbc57Pavel Filipensky			break;
1041f8bbc57Pavel Filipensky		pp = next;
1042f8bbc57Pavel Filipensky	}
1043f8bbc57Pavel Filipensky
1044f8bbc57Pavel Filipensky	mutex_exit(vphm);
1045f8bbc57Pavel Filipensky}
1046f8bbc57Pavel Filipensky
1047f8bbc57Pavel Filipensky/*
10487c478bdstevel@tonic-gate * Zero out zbytes worth of data. Caller should be aware that this
10497c478bdstevel@tonic-gate * routine may enter back into the fs layer (xxx_getpage). Locks
10507c478bdstevel@tonic-gate * that the xxx_getpage routine may need should not be held while
10517c478bdstevel@tonic-gate * calling this.
10527c478bdstevel@tonic-gate */
10537c478bdstevel@tonic-gatevoid
10547c478bdstevel@tonic-gatepvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
10557c478bdstevel@tonic-gate{
10567c478bdstevel@tonic-gate	caddr_t addr;
10577c478bdstevel@tonic-gate
10587c478bdstevel@tonic-gate	ASSERT(vp->v_type != VCHR);
10597c478bdstevel@tonic-gate
10607c478bdstevel@tonic-gate	if (vp->v_pages == NULL)
10617c478bdstevel@tonic-gate		return;
10627c478bdstevel@tonic-gate
10637c478bdstevel@tonic-gate	/*
10647c478bdstevel@tonic-gate	 * zbytes may be zero but there still may be some portion of
10657c478bdstevel@tonic-gate	 * a page which needs clearing (since zbytes is a function
10667c478bdstevel@tonic-gate	 * of filesystem block size, not pagesize.)
10677c478bdstevel@tonic-gate	 */
10687c478bdstevel@tonic-gate	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
10697c478bdstevel@tonic-gate		return;
10707c478bdstevel@tonic-gate
10717c478bdstevel@tonic-gate	/*
10727c478bdstevel@tonic-gate	 * We get the last page and handle the partial
10737c478bdstevel@tonic-gate	 * zeroing via kernel mappings.  This will make the page
10747c478bdstevel@tonic-gate	 * dirty so that we know that when this page is written
10757c478bdstevel@tonic-gate	 * back, the zeroed information will go out with it.  If
10767c478bdstevel@tonic-gate	 * the page is not currently in memory, then the kzero
10777c478bdstevel@tonic-gate	 * operation will cause it to be brought it.  We use kzero
10787c478bdstevel@tonic-gate	 * instead of bzero so that if the page cannot be read in
10797c478bdstevel@tonic-gate	 * for any reason, the system will not panic.  We need
10807c478bdstevel@tonic-gate	 * to zero out a minimum of the fs given zbytes, but we
10817c478bdstevel@tonic-gate	 * might also have to do more to get the entire last page.
10827c478bdstevel@tonic-gate	 */
10837c478bdstevel@tonic-gate
10847c478bdstevel@tonic-gate	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
10857c478bdstevel@tonic-gate		panic("pvn_vptrunc zbytes");
10867c478bdstevel@tonic-gate	addr = segmap_getmapflt(segkmap, vp, vplen,
10877c478bdstevel@tonic-gate	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
10887c478bdstevel@tonic-gate	(void) kzero(addr + (vplen & MAXBOFFSET),
10897c478bdstevel@tonic-gate	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
10907c478bdstevel@tonic-gate	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
10917c478bdstevel@tonic-gate}
10927c478bdstevel@tonic-gate
10937c478bdstevel@tonic-gate/*
1094ca41123Josef 'Jeff' Sipek * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095ca41123Josef 'Jeff' Sipek * calling the getpage helper for each.
10967c478bdstevel@tonic-gate */
10977c478bdstevel@tonic-gateint
10987c478bdstevel@tonic-gatepvn_getpages(
10997c478bdstevel@tonic-gate	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
11007c478bdstevel@tonic-gate		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
11017c478bdstevel@tonic-gate	struct vnode *vp,
11027c478bdstevel@tonic-gate	u_offset_t off,
11037c478bdstevel@tonic-gate	size_t len,
11047c478bdstevel@tonic-gate	uint_t *protp,
11057c478bdstevel@tonic-gate	page_t *pl[],
11067c478bdstevel@tonic-gate	size_t plsz,
11077c478bdstevel@tonic-gate	struct seg *seg,
11087c478bdstevel@tonic-gate	caddr_t addr,
11097c478bdstevel@tonic-gate	enum seg_rw rw,
11107c478bdstevel@tonic-gate	struct cred *cred)
11117c478bdstevel@tonic-gate{
11127c478bdstevel@tonic-gate	page_t **ppp;
11137c478bdstevel@tonic-gate	u_offset_t o, eoff;
11147c478bdstevel@tonic-gate	size_t sz, xlen;
11157c478bdstevel@tonic-gate	int err;
11167c478bdstevel@tonic-gate
1117ca41123Josef 'Jeff' Sipek	/* ensure that we have enough space */
1118ca41123Josef 'Jeff' Sipek	ASSERT(pl == NULL || plsz >= len);
11197c478bdstevel@tonic-gate
11207c478bdstevel@tonic-gate	/*
11217c478bdstevel@tonic-gate	 * Loop one page at a time and let getapage function fill
11227c478bdstevel@tonic-gate	 * in the next page in array.  We only allow one page to be
11237c478bdstevel@tonic-gate	 * returned at a time (except for the last page) so that we
11247c478bdstevel@tonic-gate	 * don't have any problems with duplicates and other such
11257c478bdstevel@tonic-gate	 * painful problems.  This is a very simple minded algorithm,
11267c478bdstevel@tonic-gate	 * but it does the job correctly.  We hope that the cost of a
11277c478bdstevel@tonic-gate	 * getapage call for a resident page that we might have been
11287c478bdstevel@tonic-gate	 * able to get from an earlier call doesn't cost too much.
11297c478bdstevel@tonic-gate	 */
11307c478bdstevel@tonic-gate	ppp = pl;
1131ca41123Josef 'Jeff' Sipek	sz = (pl != NULL) ? PAGESIZE : 0;
11327c478bdstevel@tonic-gate	eoff = off + len;
11337c478bdstevel@tonic-gate	xlen = len;
11347c478bdstevel@tonic-gate	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
11357c478bdstevel@tonic-gate	    xlen -= PAGESIZE) {
1136ca41123Josef 'Jeff' Sipek		if (o + PAGESIZE >= eoff && pl != NULL) {
11377c478bdstevel@tonic-gate			/*
11387c478bdstevel@tonic-gate			 * Last time through - allow the all of
11397c478bdstevel@tonic-gate			 * what's left of the pl[] array to be used.
11407c478bdstevel@tonic-gate			 */
11417c478bdstevel@tonic-gate			sz = plsz - (o - off);
11427c478bdstevel@tonic-gate		}
11437c478bdstevel@tonic-gate		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
11447c478bdstevel@tonic-gate		    rw, cred);
11457c478bdstevel@tonic-gate		if (err) {
11467c478bdstevel@tonic-gate			/*
11477c478bdstevel@tonic-gate			 * Release any pages we already got.
11487c478bdstevel@tonic-gate			 */
11497c478bdstevel@tonic-gate			if (o > off && pl != NULL) {
11507c478bdstevel@tonic-gate				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
11517c478bdstevel@tonic-gate					(void) page_release(*ppp, 1);
11527c478bdstevel@tonic-gate			}
11537c478bdstevel@tonic-gate			break;
11547c478bdstevel@tonic-gate		}
11557c478bdstevel@tonic-gate		if (pl != NULL)
11567c478bdstevel@tonic-gate			ppp++;
11577c478bdstevel@tonic-gate	}
11587c478bdstevel@tonic-gate	return (err);
11597c478bdstevel@tonic-gate}
11607c478bdstevel@tonic-gate
11617c478bdstevel@tonic-gate/*
11627c478bdstevel@tonic-gate * Initialize the page list array.
11637c478bdstevel@tonic-gate */
1164081a94baguzovsk/*ARGSUSED*/
11657c478bdstevel@tonic-gatevoid
11667c478bdstevel@tonic-gatepvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
11677c478bdstevel@tonic-gate    u_offset_t off, size_t io_len, enum seg_rw rw)
11687c478bdstevel@tonic-gate{
11697c478bdstevel@tonic-gate	ssize_t sz;
11707c478bdstevel@tonic-gate	page_t *ppcur, **ppp;
11717c478bdstevel@tonic-gate
1172081a94baguzovsk	/*
1173081a94baguzovsk	 * Set up to load plsz worth
1174081a94baguzovsk	 * starting at the needed page.
1175081a94baguzovsk	 */
1176081a94baguzovsk	while (pp != NULL && pp->p_offset != off) {
11777c478bdstevel@tonic-gate		/*
1178081a94baguzovsk		 * Remove page from the i/o list,
1179081a94baguzovsk		 * release the i/o and the page lock.
11807c478bdstevel@tonic-gate		 */
1181081a94baguzovsk		ppcur = pp;
1182081a94baguzovsk		page_sub(&pp, ppcur);
1183081a94baguzovsk		page_io_unlock(ppcur);
1184081a94baguzovsk		(void) page_release(ppcur, 1);
11857c478bdstevel@tonic-gate	}
11867c478bdstevel@tonic-gate
1187081a94baguzovsk	if (pp == NULL) {
1188081a94baguzovsk		pl[0] = NULL;
1189081a94baguzovsk		return;
1190081a94baguzovsk	}
1191081a94baguzovsk
1192081a94baguzovsk	sz = plsz;
1193081a94baguzovsk
11947c478bdstevel@tonic-gate	/*
11957c478bdstevel@tonic-gate	 * Initialize the page list array.
11967c478bdstevel@tonic-gate	 */
11977c478bdstevel@tonic-gate	ppp = pl;
11987c478bdstevel@tonic-gate	do {
11997c478bdstevel@tonic-gate		ppcur = pp;
12007c478bdstevel@tonic-gate		*ppp++ = ppcur;
12017c478bdstevel@tonic-gate		page_sub(&pp, ppcur);
12027c478bdstevel@tonic-gate		page_io_unlock(ppcur);
12037c478bdstevel@tonic-gate		if (rw != S_CREATE)
12047c478bdstevel@tonic-gate			page_downgrade(ppcur);
12057c478bdstevel@tonic-gate		sz -= PAGESIZE;
12067c478bdstevel@tonic-gate	} while (sz > 0 && pp != NULL);
12077c478bdstevel@tonic-gate	*ppp = NULL;		/* terminate list */
12087c478bdstevel@tonic-gate
12097c478bdstevel@tonic-gate	/*
12107c478bdstevel@tonic-gate	 * Now free the remaining pages that weren't
12117c478bdstevel@tonic-gate	 * loaded in the page list.
12127c478bdstevel@tonic-gate	 */
12137c478bdstevel@tonic-gate	while (pp != NULL) {
12147c478bdstevel@tonic-gate		ppcur = pp;
12157c478bdstevel@tonic-gate		page_sub(&pp, ppcur);
12167c478bdstevel@tonic-gate		page_io_unlock(ppcur);
12177c478bdstevel@tonic-gate		(void) page_release(ppcur, 1);
12187c478bdstevel@tonic-gate	}
12197c478bdstevel@tonic-gate}
1220