xref: /illumos-gate/usr/src/uts/common/vm/vm_page.c (revision 727737b4)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
53cff2f43Sstans  * Common Development and Distribution License (the "License").
63cff2f43Sstans  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
2211494be0SStan Studzinski  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23e7c874afSJosef 'Jeff' Sipek  * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
243f11de9dSSara Hartse  * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
250418219cSJerry Jelinek  * Copyright 2018 Joyent, Inc.
267c478bd9Sstevel@tonic-gate  */
277c478bd9Sstevel@tonic-gate 
28*727737b4SJoshua M. Clulow /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T */
29*727737b4SJoshua M. Clulow /* All Rights Reserved */
307c478bd9Sstevel@tonic-gate 
317c478bd9Sstevel@tonic-gate /*
327c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
337c478bd9Sstevel@tonic-gate  * The Regents of the University of California
347c478bd9Sstevel@tonic-gate  * All Rights Reserved
357c478bd9Sstevel@tonic-gate  *
367c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
377c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
387c478bd9Sstevel@tonic-gate  * contributors.
397c478bd9Sstevel@tonic-gate  */
407c478bd9Sstevel@tonic-gate 
417c478bd9Sstevel@tonic-gate /*
427c478bd9Sstevel@tonic-gate  * VM - physical page management.
437c478bd9Sstevel@tonic-gate  */
447c478bd9Sstevel@tonic-gate 
457c478bd9Sstevel@tonic-gate #include <sys/types.h>
467c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
477c478bd9Sstevel@tonic-gate #include <sys/param.h>
487c478bd9Sstevel@tonic-gate #include <sys/systm.h>
497c478bd9Sstevel@tonic-gate #include <sys/errno.h>
507c478bd9Sstevel@tonic-gate #include <sys/time.h>
517c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
527c478bd9Sstevel@tonic-gate #include <sys/vm.h>
537c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
547c478bd9Sstevel@tonic-gate #include <sys/swap.h>
557c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
567c478bd9Sstevel@tonic-gate #include <sys/tuneable.h>
577c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
587c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
597c478bd9Sstevel@tonic-gate #include <sys/callb.h>
607c478bd9Sstevel@tonic-gate #include <sys/debug.h>
617c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
627c478bd9Sstevel@tonic-gate #include <sys/condvar_impl.h>
637c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
647c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
657c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
667c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
677c478bd9Sstevel@tonic-gate #include <sys/strlog.h>
687c478bd9Sstevel@tonic-gate #include <sys/mman.h>
697c478bd9Sstevel@tonic-gate #include <sys/ontrap.h>
707c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
717c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
727c478bd9Sstevel@tonic-gate 
737c478bd9Sstevel@tonic-gate #include <vm/hat.h>
747c478bd9Sstevel@tonic-gate #include <vm/anon.h>
757c478bd9Sstevel@tonic-gate #include <vm/page.h>
767c478bd9Sstevel@tonic-gate #include <vm/seg.h>
777c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
787c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
797c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h>
800209230bSgjelinek #include <sys/vm_usage.h>
817c478bd9Sstevel@tonic-gate #include <fs/fs_subr.h>
82cee1d74bSjfrank #include <sys/ddi.h>
83cee1d74bSjfrank #include <sys/modctl.h>
847c478bd9Sstevel@tonic-gate 
857c478bd9Sstevel@tonic-gate static pgcnt_t max_page_get;	/* max page_get request size in pages */
867c478bd9Sstevel@tonic-gate pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */
877c478bd9Sstevel@tonic-gate 
887c478bd9Sstevel@tonic-gate /*
897c478bd9Sstevel@tonic-gate  * freemem_lock protects all freemem variables:
907c478bd9Sstevel@tonic-gate  * availrmem. This lock also protects the globals which track the
917c478bd9Sstevel@tonic-gate  * availrmem changes for accurate kernel footprint calculation.
927c478bd9Sstevel@tonic-gate  * See below for an explanation of these
937c478bd9Sstevel@tonic-gate  * globals.
947c478bd9Sstevel@tonic-gate  */
957c478bd9Sstevel@tonic-gate kmutex_t freemem_lock;
967c478bd9Sstevel@tonic-gate pgcnt_t availrmem;
977c478bd9Sstevel@tonic-gate pgcnt_t availrmem_initial;
987c478bd9Sstevel@tonic-gate 
997c478bd9Sstevel@tonic-gate /*
1007c478bd9Sstevel@tonic-gate  * These globals track availrmem changes to get a more accurate
1017c478bd9Sstevel@tonic-gate  * estimate of the kernel size. Historically pp_kernel is used for
1027c478bd9Sstevel@tonic-gate  * kernel size and is based on availrmem. But availrmem is adjusted for
1037c478bd9Sstevel@tonic-gate  * locked pages in the system, not just for kernel locked pages.
1047c478bd9Sstevel@tonic-gate  * These new counters will track the pages locked through segvn and
1057c478bd9Sstevel@tonic-gate  * by explicit user locking.
1067c478bd9Sstevel@tonic-gate  *
107da6c28aaSamw  * pages_locked : How many pages are locked because of user specified
1087c478bd9Sstevel@tonic-gate  * locking through mlock or plock.
1097c478bd9Sstevel@tonic-gate  *
1107c478bd9Sstevel@tonic-gate  * pages_useclaim, pages_claimed : These two variables track the
111da6c28aaSamw  * claim adjustments because of the protection changes on a segvn segment.
1127c478bd9Sstevel@tonic-gate  *
1137c478bd9Sstevel@tonic-gate  * All these globals are protected by the same lock which protects availrmem.
1147c478bd9Sstevel@tonic-gate  */
115a98e9dbfSaguzovsk pgcnt_t pages_locked = 0;
116a98e9dbfSaguzovsk pgcnt_t pages_useclaim = 0;
117a98e9dbfSaguzovsk pgcnt_t pages_claimed = 0;
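/*
 * Hedged illustration (added commentary, not original code): a caller
 * accounting for a newly user-locked page typically adjusts availrmem
 * and pages_locked together under freemem_lock, roughly:
 *
 *	mutex_enter(&freemem_lock);
 *	if (availrmem > pages_pp_maximum) {
 *		availrmem--;
 *		pages_locked++;
 *		mutex_exit(&freemem_lock);
 *		return (1);
 *	}
 *	mutex_exit(&freemem_lock);
 *	return (0);
 *
 * The real accounting lives in page_pp_lock() and the segment drivers;
 * the sketch above only illustrates the locking discipline.
 */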
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate 
1207c478bd9Sstevel@tonic-gate /*
1217c478bd9Sstevel@tonic-gate  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
1227c478bd9Sstevel@tonic-gate  */
1237c478bd9Sstevel@tonic-gate static kmutex_t	new_freemem_lock;
1247c478bd9Sstevel@tonic-gate static uint_t	freemem_wait;	/* someone waiting for freemem */
1257c478bd9Sstevel@tonic-gate static kcondvar_t freemem_cv;
1267c478bd9Sstevel@tonic-gate 
1277c478bd9Sstevel@tonic-gate /*
1287c478bd9Sstevel@tonic-gate  * The logical page free list is maintained as two lists, the 'free'
1297c478bd9Sstevel@tonic-gate  * and the 'cache' lists.
1307c478bd9Sstevel@tonic-gate  * The free list contains those pages that should be reused first.
1317c478bd9Sstevel@tonic-gate  *
1327c478bd9Sstevel@tonic-gate  * The implementation of the lists is machine dependent.
133d94ffb28Sjmcp  * page_get_freelist(), page_get_cachelist(),
1347c478bd9Sstevel@tonic-gate  * page_list_sub(), and page_list_add()
1357c478bd9Sstevel@tonic-gate  * form the interface to the machine dependent implementation.
1367c478bd9Sstevel@tonic-gate  *
1377c478bd9Sstevel@tonic-gate  * Pages with p_free set are on the cache list.
1387c478bd9Sstevel@tonic-gate  * Pages with p_free and p_age set are on the free list.
1397c478bd9Sstevel@tonic-gate  *
1407c478bd9Sstevel@tonic-gate  * A page may be locked while on either list.
1417c478bd9Sstevel@tonic-gate  */
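/*
 * Summary of the flag encoding above (added for clarity); PP_ISFREE()
 * and PP_ISAGED() are the existing macros from <vm/page.h> that test
 * p_free and p_age:
 *
 *	!PP_ISFREE(pp)				page is in use (on neither list)
 *	PP_ISFREE(pp) && !PP_ISAGED(pp)		page is on the cache list
 *	PP_ISFREE(pp) && PP_ISAGED(pp)		page is on the free list
 */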
1427c478bd9Sstevel@tonic-gate 
1437c478bd9Sstevel@tonic-gate /*
1447c478bd9Sstevel@tonic-gate  * free list accounting stuff.
1457c478bd9Sstevel@tonic-gate  *
1467c478bd9Sstevel@tonic-gate  *
1477c478bd9Sstevel@tonic-gate  * Spread out the value for the number of pages on the
1487c478bd9Sstevel@tonic-gate  * page free and page cache lists.  If there is just one
1497c478bd9Sstevel@tonic-gate  * value, then it must be under just one lock.
1507c478bd9Sstevel@tonic-gate  * The lock contention and cache traffic are a real bother.
1517c478bd9Sstevel@tonic-gate  *
1527c478bd9Sstevel@tonic-gate  * When we acquire and then drop a single pcf lock
1537c478bd9Sstevel@tonic-gate  * we can start in the middle of the array of pcf structures.
1547c478bd9Sstevel@tonic-gate  * If we acquire more than one pcf lock at a time, we need to
1557c478bd9Sstevel@tonic-gate  * start at the front to avoid deadlocking.
1567c478bd9Sstevel@tonic-gate  *
1577c478bd9Sstevel@tonic-gate  * pcf_count holds the number of pages in each pool.
1587c478bd9Sstevel@tonic-gate  *
1597c478bd9Sstevel@tonic-gate  * pcf_block is set when page_create_get_something() has asked the
1607c478bd9Sstevel@tonic-gate  * PSM page freelist and page cachelist routines without specifying
1617c478bd9Sstevel@tonic-gate  * a color and nothing came back.  This is used to block anything
1627c478bd9Sstevel@tonic-gate  * else from moving pages from one list to the other while the
1637c478bd9Sstevel@tonic-gate  * lists are searched again.  If a page is freed while pcf_block is
1647c478bd9Sstevel@tonic-gate  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
1657c478bd9Sstevel@tonic-gate  * of clearing pcf_block, doing the wakeups, etc.
1667c478bd9Sstevel@tonic-gate  */
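/*
 * Illustrative sketch (added commentary, not original code): a
 * consumer that must hold several pcf locks at once follows the
 * ordering rule above by always starting at pcf[0], e.g. to sum the
 * counts in all buckets:
 *
 *	struct pcf *p;
 *	uint_t i;
 *	pgcnt_t total = 0;
 *
 *	for (i = 0, p = pcf; i < pcf_fanout; i++, p++) {
 *		mutex_enter(&p->pcf_lock);
 *		total += p->pcf_count;
 *	}
 *	for (i = 0, p = pcf; i < pcf_fanout; i++, p++)
 *		mutex_exit(&p->pcf_lock);
 *
 * This mirrors the pattern used when a single-bucket attempt fails and
 * all buckets must be visited.
 */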
1677c478bd9Sstevel@tonic-gate 
16806fb6a36Sdv #define	MAX_PCF_FANOUT NCPU
16906fb6a36Sdv static uint_t pcf_fanout = 1; /* Will get changed at boot time */
17006fb6a36Sdv static uint_t pcf_fanout_mask = 0;
1717c478bd9Sstevel@tonic-gate 
1727c478bd9Sstevel@tonic-gate struct pcf {
1737c478bd9Sstevel@tonic-gate 	kmutex_t	pcf_lock;	/* protects the structure */
174f2b37d75Sfr 	uint_t		pcf_count;	/* page count */
1757c478bd9Sstevel@tonic-gate 	uint_t		pcf_wait;	/* number of waiters */
176*727737b4SJoshua M. Clulow 	uint_t		pcf_block;	/* pcgs flag to page_free() */
177*727737b4SJoshua M. Clulow 	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
17806fb6a36Sdv 	uint_t		pcf_fill[10];	/* to line up on the caches */
1797c478bd9Sstevel@tonic-gate };
1807c478bd9Sstevel@tonic-gate 
18106fb6a36Sdv /*
18206fb6a36Sdv  * The PCF_INDEX hash needs to be dynamic (every so often the hash changes
18306fb6a36Sdv  * which bucket a cpu maps to).  This is done to prevent a drain condition
18406fb6a36Sdv  * from happening.  This drain condition will occur when pcf_count decrement
18506fb6a36Sdv  * occurs on cpu A and the increment of pcf_count always occurs on cpu B.  An
18606fb6a36Sdv  * example of this shows up with device interrupts.  The dma buffer is allocated
18706fb6a36Sdv  * by the cpu requesting the IO, thus the pcf_count is decremented based on that.
18806fb6a36Sdv  * When the memory is returned by the interrupt thread, the pcf_count will be
18906fb6a36Sdv  * incremented based on the cpu servicing the interrupt.
19006fb6a36Sdv  */
19106fb6a36Sdv static struct pcf pcf[MAX_PCF_FANOUT];
19206fb6a36Sdv #define	PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
19306fb6a36Sdv 	(randtick() >> 24)) & (pcf_fanout_mask))
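/*
 * Hedged usage sketch (added commentary, not original code): the fast
 * path picks one bucket with PCF_INDEX() and only falls back to the
 * other buckets if it cannot satisfy the request there, roughly:
 *
 *	struct pcf *p = &pcf[PCF_INDEX()];
 *
 *	mutex_enter(&p->pcf_lock);
 *	if (p->pcf_count >= npages) {
 *		p->pcf_count -= (uint_t)npages;
 *		mutex_exit(&p->pcf_lock);
 *		return (1);
 *	}
 *	mutex_exit(&p->pcf_lock);
 *	return (0);
 *
 * pcf_decrement_bucket(), declared below, implements the complete
 * version of this fast path.
 */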
19406fb6a36Sdv 
19506fb6a36Sdv static int pcf_decrement_bucket(pgcnt_t);
19606fb6a36Sdv static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
1977c478bd9Sstevel@tonic-gate 
1987c478bd9Sstevel@tonic-gate kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
1997c478bd9Sstevel@tonic-gate kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
2007c478bd9Sstevel@tonic-gate kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
2017c478bd9Sstevel@tonic-gate static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */
2027c478bd9Sstevel@tonic-gate 
2037c478bd9Sstevel@tonic-gate #ifdef VM_STATS
2047c478bd9Sstevel@tonic-gate 
2057c478bd9Sstevel@tonic-gate /*
2067c478bd9Sstevel@tonic-gate  * No locks, but so what, they are only statistics.
2077c478bd9Sstevel@tonic-gate  */
2087c478bd9Sstevel@tonic-gate 
2097c478bd9Sstevel@tonic-gate static struct page_tcnt {
2107c478bd9Sstevel@tonic-gate 	int	pc_free_cache;		/* free's into cache list */
2117c478bd9Sstevel@tonic-gate 	int	pc_free_dontneed;	/* free's with dontneed */
2127c478bd9Sstevel@tonic-gate 	int	pc_free_pageout;	/* free's from pageout */
2137c478bd9Sstevel@tonic-gate 	int	pc_free_free;		/* free's into free list */
2147c478bd9Sstevel@tonic-gate 	int	pc_free_pages;		/* free's into large page free list */
2157c478bd9Sstevel@tonic-gate 	int	pc_destroy_pages;	/* large page destroy's */
2167c478bd9Sstevel@tonic-gate 	int	pc_get_cache;		/* get's from cache list */
2177c478bd9Sstevel@tonic-gate 	int	pc_get_free;		/* get's from free list */
2187c478bd9Sstevel@tonic-gate 	int	pc_reclaim;		/* reclaim's */
2197c478bd9Sstevel@tonic-gate 	int	pc_abortfree;		/* abort's of free pages */
2207c478bd9Sstevel@tonic-gate 	int	pc_find_hit;		/* find's that find page */
2217c478bd9Sstevel@tonic-gate 	int	pc_find_miss;		/* find's that don't find page */
2227c478bd9Sstevel@tonic-gate 	int	pc_destroy_free;	/* # of free pages destroyed */
2237c478bd9Sstevel@tonic-gate #define	PC_HASH_CNT	(4*PAGE_HASHAVELEN)
2247c478bd9Sstevel@tonic-gate 	int	pc_find_hashlen[PC_HASH_CNT+1];
2257c478bd9Sstevel@tonic-gate 	int	pc_addclaim_pages;
2267c478bd9Sstevel@tonic-gate 	int	pc_subclaim_pages;
2277c478bd9Sstevel@tonic-gate 	int	pc_free_replacement_page[2];
2287c478bd9Sstevel@tonic-gate 	int	pc_try_demote_pages[6];
2297c478bd9Sstevel@tonic-gate 	int	pc_demote_pages[2];
2307c478bd9Sstevel@tonic-gate } pagecnt;
2317c478bd9Sstevel@tonic-gate 
2327c478bd9Sstevel@tonic-gate uint_t	hashin_count;
2337c478bd9Sstevel@tonic-gate uint_t	hashin_not_held;
2347c478bd9Sstevel@tonic-gate uint_t	hashin_already;
2357c478bd9Sstevel@tonic-gate 
2367c478bd9Sstevel@tonic-gate uint_t	hashout_count;
2377c478bd9Sstevel@tonic-gate uint_t	hashout_not_held;
2387c478bd9Sstevel@tonic-gate 
2397c478bd9Sstevel@tonic-gate uint_t	page_create_count;
2407c478bd9Sstevel@tonic-gate uint_t	page_create_not_enough;
2417c478bd9Sstevel@tonic-gate uint_t	page_create_not_enough_again;
2427c478bd9Sstevel@tonic-gate uint_t	page_create_zero;
2437c478bd9Sstevel@tonic-gate uint_t	page_create_hashout;
2447c478bd9Sstevel@tonic-gate uint_t	page_create_page_lock_failed;
2457c478bd9Sstevel@tonic-gate uint_t	page_create_trylock_failed;
2467c478bd9Sstevel@tonic-gate uint_t	page_create_found_one;
2477c478bd9Sstevel@tonic-gate uint_t	page_create_hashin_failed;
2487c478bd9Sstevel@tonic-gate uint_t	page_create_dropped_phm;
2497c478bd9Sstevel@tonic-gate 
2507c478bd9Sstevel@tonic-gate uint_t	page_create_new;
2517c478bd9Sstevel@tonic-gate uint_t	page_create_exists;
2527c478bd9Sstevel@tonic-gate uint_t	page_create_putbacks;
2537c478bd9Sstevel@tonic-gate uint_t	page_create_overshoot;
2547c478bd9Sstevel@tonic-gate 
2557c478bd9Sstevel@tonic-gate uint_t	page_reclaim_zero;
2567c478bd9Sstevel@tonic-gate uint_t	page_reclaim_zero_locked;
2577c478bd9Sstevel@tonic-gate 
2587c478bd9Sstevel@tonic-gate uint_t	page_rename_exists;
2597c478bd9Sstevel@tonic-gate uint_t	page_rename_count;
2607c478bd9Sstevel@tonic-gate 
2617c478bd9Sstevel@tonic-gate uint_t	page_lookup_cnt[20];
2627c478bd9Sstevel@tonic-gate uint_t	page_lookup_nowait_cnt[10];
2637c478bd9Sstevel@tonic-gate uint_t	page_find_cnt;
2647c478bd9Sstevel@tonic-gate uint_t	page_exists_cnt;
2657c478bd9Sstevel@tonic-gate uint_t	page_exists_forreal_cnt;
2667c478bd9Sstevel@tonic-gate uint_t	page_lookup_dev_cnt;
2677c478bd9Sstevel@tonic-gate uint_t	get_cachelist_cnt;
2687c478bd9Sstevel@tonic-gate uint_t	page_create_cnt[10];
26978b03d3aSkchow uint_t	alloc_pages[9];
2707c478bd9Sstevel@tonic-gate uint_t	page_exphcontg[19];
2717c478bd9Sstevel@tonic-gate uint_t  page_create_large_cnt[10];
2727c478bd9Sstevel@tonic-gate 
273e7c874afSJosef 'Jeff' Sipek #endif
2747c478bd9Sstevel@tonic-gate 
275e7c874afSJosef 'Jeff' Sipek static inline page_t *
276e7c874afSJosef 'Jeff' Sipek page_hash_search(ulong_t index, vnode_t *vnode, u_offset_t off)
277e7c874afSJosef 'Jeff' Sipek {
278e7c874afSJosef 'Jeff' Sipek 	uint_t mylen = 0;
279e7c874afSJosef 'Jeff' Sipek 	page_t *page;
2807c478bd9Sstevel@tonic-gate 
281e7c874afSJosef 'Jeff' Sipek 	for (page = page_hash[index]; page; page = page->p_hash, mylen++)
282e7c874afSJosef 'Jeff' Sipek 		if (page->p_vnode == vnode && page->p_offset == off)
283e7c874afSJosef 'Jeff' Sipek 			break;
2847c478bd9Sstevel@tonic-gate 
285e7c874afSJosef 'Jeff' Sipek #ifdef	VM_STATS
286e7c874afSJosef 'Jeff' Sipek 	if (page != NULL)
287e7c874afSJosef 'Jeff' Sipek 		pagecnt.pc_find_hit++;
288e7c874afSJosef 'Jeff' Sipek 	else
289e7c874afSJosef 'Jeff' Sipek 		pagecnt.pc_find_miss++;
2907c478bd9Sstevel@tonic-gate 
291e7c874afSJosef 'Jeff' Sipek 	pagecnt.pc_find_hashlen[MIN(mylen, PC_HASH_CNT)]++;
292e7c874afSJosef 'Jeff' Sipek #endif
293e7c874afSJosef 'Jeff' Sipek 
294e7c874afSJosef 'Jeff' Sipek 	return (page);
295e7c874afSJosef 'Jeff' Sipek }
2967c478bd9Sstevel@tonic-gate 
2977c478bd9Sstevel@tonic-gate 
2987c478bd9Sstevel@tonic-gate #ifdef DEBUG
2997c478bd9Sstevel@tonic-gate #define	MEMSEG_SEARCH_STATS
3007c478bd9Sstevel@tonic-gate #endif
3017c478bd9Sstevel@tonic-gate 
3027c478bd9Sstevel@tonic-gate #ifdef MEMSEG_SEARCH_STATS
3037c478bd9Sstevel@tonic-gate struct memseg_stats {
3047c478bd9Sstevel@tonic-gate     uint_t nsearch;
3057c478bd9Sstevel@tonic-gate     uint_t nlastwon;
3067c478bd9Sstevel@tonic-gate     uint_t nhashwon;
3077c478bd9Sstevel@tonic-gate     uint_t nnotfound;
3087c478bd9Sstevel@tonic-gate } memseg_stats;
3097c478bd9Sstevel@tonic-gate 
3107c478bd9Sstevel@tonic-gate #define	MEMSEG_STAT_INCR(v) \
3111a5e258fSJosef 'Jeff' Sipek 	atomic_inc_32(&memseg_stats.v)
3127c478bd9Sstevel@tonic-gate #else
3137c478bd9Sstevel@tonic-gate #define	MEMSEG_STAT_INCR(x)
3147c478bd9Sstevel@tonic-gate #endif
3157c478bd9Sstevel@tonic-gate 
3167c478bd9Sstevel@tonic-gate struct memseg *memsegs;		/* list of memory segments */
3177c478bd9Sstevel@tonic-gate 
3182be2af34Smec /*
3192be2af34Smec  * /etc/system tunable to control the large page allocation heuristic.
3202be2af34Smec  *
3212be2af34Smec  * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroups
3222be2af34Smec  * for large page allocation requests.  If a large page is not readily
3232be2af34Smec  * available on the local freelists, we will go through additional effort
3242be2af34Smec  * to create a large page, potentially moving smaller pages around to coalesce
3252be2af34Smec  * larger pages in the local lgroup.
3262be2af34Smec  * Default value of LPAP_DEFAULT will go to remote freelists if large pages
3272be2af34Smec  * are not readily available in the local lgroup.
3282be2af34Smec  */
3292be2af34Smec enum lpap {
3302be2af34Smec 	LPAP_DEFAULT,	/* default large page allocation policy */
3312be2af34Smec 	LPAP_LOCAL	/* local large page allocation policy */
3322be2af34Smec };
3332be2af34Smec 
3342be2af34Smec enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
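/*
 * Example (added for illustration): the policy can be selected at boot
 * from /etc/system; LPAP_LOCAL corresponds to the value 1:
 *
 *	set lpg_alloc_prefer = 1
 */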
3357c478bd9Sstevel@tonic-gate 
3367c478bd9Sstevel@tonic-gate static void page_init_mem_config(void);
3377c478bd9Sstevel@tonic-gate static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
3387c478bd9Sstevel@tonic-gate static void page_do_hashout(page_t *);
3398b464eb8Smec static void page_capture_init();
3408b464eb8Smec int page_capture_take_action(page_t *, uint_t, void *);
3417c478bd9Sstevel@tonic-gate 
3427c478bd9Sstevel@tonic-gate static void page_demote_vp_pages(page_t *);
3437c478bd9Sstevel@tonic-gate 
34406fb6a36Sdv 
34506fb6a36Sdv void
34606fb6a36Sdv pcf_init(void)
34706fb6a36Sdv {
34806fb6a36Sdv 	if (boot_ncpus != -1) {
34906fb6a36Sdv 		pcf_fanout = boot_ncpus;
35006fb6a36Sdv 	} else {
35106fb6a36Sdv 		pcf_fanout = max_ncpus;
35206fb6a36Sdv 	}
35306fb6a36Sdv #ifdef sun4v
35406fb6a36Sdv 	/*
35506fb6a36Sdv 	 * Force at least 4 buckets if possible for sun4v.
35606fb6a36Sdv 	 */
35706fb6a36Sdv 	pcf_fanout = MAX(pcf_fanout, 4);
35806fb6a36Sdv #endif /* sun4v */
35906fb6a36Sdv 
36006fb6a36Sdv 	/*
36106fb6a36Sdv 	 * Round up to the nearest power of 2.
36206fb6a36Sdv 	 */
36306fb6a36Sdv 	pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
36406fb6a36Sdv 	if (!ISP2(pcf_fanout)) {
36506fb6a36Sdv 		pcf_fanout = 1 << highbit(pcf_fanout);
36606fb6a36Sdv 
36706fb6a36Sdv 		if (pcf_fanout > MAX_PCF_FANOUT) {
36806fb6a36Sdv 			pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
36906fb6a36Sdv 		}
37006fb6a36Sdv 	}
37106fb6a36Sdv 	pcf_fanout_mask = pcf_fanout - 1;
37206fb6a36Sdv }
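/*
 * Worked example of the sizing above (added for illustration): with
 * boot_ncpus == 6, pcf_fanout starts at 6, is clamped to at most
 * MAX_PCF_FANOUT, and, not being a power of two, is rounded up to
 * 1 << highbit(6) == 8, leaving pcf_fanout_mask == 7.
 */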
37306fb6a36Sdv 
3747c478bd9Sstevel@tonic-gate /*
3757c478bd9Sstevel@tonic-gate  * vm subsystem related initialization
3767c478bd9Sstevel@tonic-gate  */
3777c478bd9Sstevel@tonic-gate void
3787c478bd9Sstevel@tonic-gate vm_init(void)
3797c478bd9Sstevel@tonic-gate {
3807c478bd9Sstevel@tonic-gate 	boolean_t callb_vm_cpr(void *, int);
3817c478bd9Sstevel@tonic-gate 
3827c478bd9Sstevel@tonic-gate 	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
3837c478bd9Sstevel@tonic-gate 	page_init_mem_config();
384db874c57Selowe 	page_retire_init();
3850209230bSgjelinek 	vm_usage_init();
3868b464eb8Smec 	page_capture_init();
3877c478bd9Sstevel@tonic-gate }
3887c478bd9Sstevel@tonic-gate 
3897c478bd9Sstevel@tonic-gate /*
3907c478bd9Sstevel@tonic-gate  * This function is called at startup and when memory is added or deleted.
3917c478bd9Sstevel@tonic-gate  */
3927c478bd9Sstevel@tonic-gate void
3937c478bd9Sstevel@tonic-gate init_pages_pp_maximum()
3947c478bd9Sstevel@tonic-gate {
3957c478bd9Sstevel@tonic-gate 	static pgcnt_t p_min;
3967c478bd9Sstevel@tonic-gate 	static pgcnt_t pages_pp_maximum_startup;
3977c478bd9Sstevel@tonic-gate 	static pgcnt_t avrmem_delta;
3987c478bd9Sstevel@tonic-gate 	static int init_done;
3997c478bd9Sstevel@tonic-gate 	static int user_set;	/* true if set in /etc/system */
4007c478bd9Sstevel@tonic-gate 
4017c478bd9Sstevel@tonic-gate 	if (init_done == 0) {
4027c478bd9Sstevel@tonic-gate 
4037c478bd9Sstevel@tonic-gate 		/* If the user specified a value, save it */
4047c478bd9Sstevel@tonic-gate 		if (pages_pp_maximum != 0) {
4057c478bd9Sstevel@tonic-gate 			user_set = 1;
4067c478bd9Sstevel@tonic-gate 			pages_pp_maximum_startup = pages_pp_maximum;
4077c478bd9Sstevel@tonic-gate 		}
4087c478bd9Sstevel@tonic-gate 
4097c478bd9Sstevel@tonic-gate 		/*
4107c478bd9Sstevel@tonic-gate 		 * The initial setting of pages_pp_maximum is based
4117c478bd9Sstevel@tonic-gate 		 * on the value of availrmem just after the start-up
4127c478bd9Sstevel@tonic-gate 		 * allocations. To preserve this relationship at run
4137c478bd9Sstevel@tonic-gate 		 * time, use a delta from availrmem_initial.
4147c478bd9Sstevel@tonic-gate 		 */
4157c478bd9Sstevel@tonic-gate 		ASSERT(availrmem_initial >= availrmem);
4167c478bd9Sstevel@tonic-gate 		avrmem_delta = availrmem_initial - availrmem;
4177c478bd9Sstevel@tonic-gate 
4187c478bd9Sstevel@tonic-gate 		/* The allowable floor of pages_pp_maximum */
4197c478bd9Sstevel@tonic-gate 		p_min = tune.t_minarmem + 100;
4207c478bd9Sstevel@tonic-gate 
4217c478bd9Sstevel@tonic-gate 		/* Make sure we don't come through here again. */
4227c478bd9Sstevel@tonic-gate 		init_done = 1;
4237c478bd9Sstevel@tonic-gate 	}
4247c478bd9Sstevel@tonic-gate 	/*
4257c478bd9Sstevel@tonic-gate 	 * Determine pages_pp_maximum, the number of currently available
4267c478bd9Sstevel@tonic-gate 	 * pages (availrmem) that can't be `locked'. If not set by
4277c478bd9Sstevel@tonic-gate 	 * the user, we set it to 4% of the currently available memory
4287c478bd9Sstevel@tonic-gate 	 * plus 4MB.
4297c478bd9Sstevel@tonic-gate 	 * But we also insist that it be greater than tune.t_minarmem;
4307c478bd9Sstevel@tonic-gate 	 * otherwise a process could lock down a lot of memory, get swapped
4317c478bd9Sstevel@tonic-gate 	 * out, and never have enough to get swapped back in.
4327c478bd9Sstevel@tonic-gate 	 */
4337c478bd9Sstevel@tonic-gate 	if (user_set)
4347c478bd9Sstevel@tonic-gate 		pages_pp_maximum = pages_pp_maximum_startup;
4357c478bd9Sstevel@tonic-gate 	else
4367c478bd9Sstevel@tonic-gate 		pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
4377c478bd9Sstevel@tonic-gate 		    + btop(4 * 1024 * 1024);
4387c478bd9Sstevel@tonic-gate 
4397c478bd9Sstevel@tonic-gate 	if (pages_pp_maximum <= p_min) {
4407c478bd9Sstevel@tonic-gate 		pages_pp_maximum = p_min;
4417c478bd9Sstevel@tonic-gate 	}
4427c478bd9Sstevel@tonic-gate }
4437c478bd9Sstevel@tonic-gate 
4440418219cSJerry Jelinek /*
4450418219cSJerry Jelinek  * In the past, we limited the maximum pages that could be gotten to essentially
4460418219cSJerry Jelinek  * 1/2 of the total pages on the system. However, this is too conservative for
4470418219cSJerry Jelinek  * some cases. For example, if we want to host a large virtual machine which
4480418219cSJerry Jelinek  * needs to use a significant portion of the system's memory. In practice,
4490418219cSJerry Jelinek  * allowing more than 1/2 of the total pages is fine, but becomes problematic
4500418219cSJerry Jelinek  * as we approach or exceed 75% of the pages on the system. Thus, we limit the
4510418219cSJerry Jelinek  * maximum to 23/32 of the total pages, which is ~72%.
4520418219cSJerry Jelinek  */
4537c478bd9Sstevel@tonic-gate void
4547c478bd9Sstevel@tonic-gate set_max_page_get(pgcnt_t target_total_pages)
4557c478bd9Sstevel@tonic-gate {
4560418219cSJerry Jelinek 	max_page_get = (target_total_pages >> 5) * 23;
4570418219cSJerry Jelinek 	ASSERT3U(max_page_get, >, 0);
4580418219cSJerry Jelinek }
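/*
 * Worked example (added for illustration): on a system with 1,048,576
 * total pages, max_page_get = (1048576 >> 5) * 23 = 753,664 pages,
 * i.e. 23/32 (~71.9%) of memory, matching the limit described above.
 */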
4590418219cSJerry Jelinek 
4600418219cSJerry Jelinek pgcnt_t
4610418219cSJerry Jelinek get_max_page_get()
4620418219cSJerry Jelinek {
4630418219cSJerry Jelinek 	return (max_page_get);
4647c478bd9Sstevel@tonic-gate }
4657c478bd9Sstevel@tonic-gate 
4667c478bd9Sstevel@tonic-gate static pgcnt_t pending_delete;
4677c478bd9Sstevel@tonic-gate 
4687c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4697c478bd9Sstevel@tonic-gate static void
4707c478bd9Sstevel@tonic-gate page_mem_config_post_add(
4717c478bd9Sstevel@tonic-gate 	void *arg,
4727c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages)
4737c478bd9Sstevel@tonic-gate {
4747c478bd9Sstevel@tonic-gate 	set_max_page_get(total_pages - pending_delete);
4757c478bd9Sstevel@tonic-gate 	init_pages_pp_maximum();
4767c478bd9Sstevel@tonic-gate }
4777c478bd9Sstevel@tonic-gate 
4787c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4797c478bd9Sstevel@tonic-gate static int
4807c478bd9Sstevel@tonic-gate page_mem_config_pre_del(
4817c478bd9Sstevel@tonic-gate 	void *arg,
4827c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages)
4837c478bd9Sstevel@tonic-gate {
4847c478bd9Sstevel@tonic-gate 	pgcnt_t nv;
4857c478bd9Sstevel@tonic-gate 
4867c478bd9Sstevel@tonic-gate 	nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
4877c478bd9Sstevel@tonic-gate 	set_max_page_get(total_pages - nv);
4887c478bd9Sstevel@tonic-gate 	return (0);
4897c478bd9Sstevel@tonic-gate }
4907c478bd9Sstevel@tonic-gate 
4917c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4927c478bd9Sstevel@tonic-gate static void
4937c478bd9Sstevel@tonic-gate page_mem_config_post_del(
4947c478bd9Sstevel@tonic-gate 	void *arg,
4957c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages,
4967c478bd9Sstevel@tonic-gate 	int cancelled)
4977c478bd9Sstevel@tonic-gate {
4987c478bd9Sstevel@tonic-gate 	pgcnt_t nv;
4997c478bd9Sstevel@tonic-gate 
5007c478bd9Sstevel@tonic-gate 	nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
5017c478bd9Sstevel@tonic-gate 	set_max_page_get(total_pages - nv);
5027c478bd9Sstevel@tonic-gate 	if (!cancelled)
5037c478bd9Sstevel@tonic-gate 		init_pages_pp_maximum();
5047c478bd9Sstevel@tonic-gate }
5057c478bd9Sstevel@tonic-gate 
5067c478bd9Sstevel@tonic-gate static kphysm_setup_vector_t page_mem_config_vec = {
5077c478bd9Sstevel@tonic-gate 	KPHYSM_SETUP_VECTOR_VERSION,
5087c478bd9Sstevel@tonic-gate 	page_mem_config_post_add,
5097c478bd9Sstevel@tonic-gate 	page_mem_config_pre_del,
5107c478bd9Sstevel@tonic-gate 	page_mem_config_post_del,
5117c478bd9Sstevel@tonic-gate };
5127c478bd9Sstevel@tonic-gate 
5137c478bd9Sstevel@tonic-gate static void
5147c478bd9Sstevel@tonic-gate page_init_mem_config(void)
5157c478bd9Sstevel@tonic-gate {
516d94ffb28Sjmcp 	int ret;
5177c478bd9Sstevel@tonic-gate 
518d94ffb28Sjmcp 	ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
519d94ffb28Sjmcp 	ASSERT(ret == 0);
5207c478bd9Sstevel@tonic-gate }
5217c478bd9Sstevel@tonic-gate 
5227c478bd9Sstevel@tonic-gate /*
5237c478bd9Sstevel@tonic-gate  * Evenly spread out the PCF counters for large free pages
5247c478bd9Sstevel@tonic-gate  */
5257c478bd9Sstevel@tonic-gate static void
5267c478bd9Sstevel@tonic-gate page_free_large_ctr(pgcnt_t npages)
5277c478bd9Sstevel@tonic-gate {
5287c478bd9Sstevel@tonic-gate 	static struct pcf	*p = pcf;
5297c478bd9Sstevel@tonic-gate 	pgcnt_t			lump;
5307c478bd9Sstevel@tonic-gate 
5317c478bd9Sstevel@tonic-gate 	freemem += npages;
5327c478bd9Sstevel@tonic-gate 
53306fb6a36Sdv 	lump = roundup(npages, pcf_fanout) / pcf_fanout;
5347c478bd9Sstevel@tonic-gate 
5357c478bd9Sstevel@tonic-gate 	while (npages > 0) {
5367c478bd9Sstevel@tonic-gate 
5377c478bd9Sstevel@tonic-gate 		ASSERT(!p->pcf_block);
5387c478bd9Sstevel@tonic-gate 
5397c478bd9Sstevel@tonic-gate 		if (lump < npages) {
5407c478bd9Sstevel@tonic-gate 			p->pcf_count += (uint_t)lump;
5417c478bd9Sstevel@tonic-gate 			npages -= lump;
5427c478bd9Sstevel@tonic-gate 		} else {
5437c478bd9Sstevel@tonic-gate 			p->pcf_count += (uint_t)npages;
5447c478bd9Sstevel@tonic-gate 			npages = 0;
5457c478bd9Sstevel@tonic-gate 		}
5467c478bd9Sstevel@tonic-gate 
5477c478bd9Sstevel@tonic-gate 		ASSERT(!p->pcf_wait);
5487c478bd9Sstevel@tonic-gate 
54906fb6a36Sdv 		if (++p > &pcf[pcf_fanout - 1])
5507c478bd9Sstevel@tonic-gate 			p = pcf;
5517c478bd9Sstevel@tonic-gate 	}
5527c478bd9Sstevel@tonic-gate 
5537c478bd9Sstevel@tonic-gate 	ASSERT(npages == 0);
5547c478bd9Sstevel@tonic-gate }
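/*
 * Worked example of the spreading above (added for illustration): with
 * npages == 1001 and pcf_fanout == 8, lump = roundup(1001, 8) / 8 == 126,
 * so seven consecutive buckets receive 126 pages each and the next one
 * receives the remaining 119, for a total of 1001.
 */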
5557c478bd9Sstevel@tonic-gate 
5567c478bd9Sstevel@tonic-gate /*
557da6c28aaSamw  * Add a physical chunk of memory to the system free lists during startup.
5587c478bd9Sstevel@tonic-gate  * Platform specific startup() allocates the memory for the page structs.
5597c478bd9Sstevel@tonic-gate  *
5607c478bd9Sstevel@tonic-gate  * num	- number of page structures
5617c478bd9Sstevel@tonic-gate  * base - page number (pfn) to be associated with the first page.
5627c478bd9Sstevel@tonic-gate  *
5637c478bd9Sstevel@tonic-gate  * Since we are doing this during startup (i.e. single threaded), we will
5647c478bd9Sstevel@tonic-gate  * use shortcut routines to avoid any locking overhead while putting all
5657c478bd9Sstevel@tonic-gate  * these pages on the freelists.
5667c478bd9Sstevel@tonic-gate  *
5677c478bd9Sstevel@tonic-gate  * NOTE: Any changes performed to page_free(), must also be performed to
5687c478bd9Sstevel@tonic-gate  *	 add_physmem() since this is how we initialize all page_t's at
5697c478bd9Sstevel@tonic-gate  *	 boot time.
5707c478bd9Sstevel@tonic-gate  */
5717c478bd9Sstevel@tonic-gate void
5727c478bd9Sstevel@tonic-gate add_physmem(
5737c478bd9Sstevel@tonic-gate 	page_t	*pp,
5747c478bd9Sstevel@tonic-gate 	pgcnt_t	num,
5757c478bd9Sstevel@tonic-gate 	pfn_t	pnum)
5767c478bd9Sstevel@tonic-gate {
5777c478bd9Sstevel@tonic-gate 	page_t	*root = NULL;
5787c478bd9Sstevel@tonic-gate 	uint_t	szc = page_num_pagesizes() - 1;
5797c478bd9Sstevel@tonic-gate 	pgcnt_t	large = page_get_pagecnt(szc);
5807c478bd9Sstevel@tonic-gate 	pgcnt_t	cnt = 0;
5817c478bd9Sstevel@tonic-gate 
5827c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
5836e4dd838Smec 	    "add_physmem:pp %p num %lu", pp, num);
5847c478bd9Sstevel@tonic-gate 
5857c478bd9Sstevel@tonic-gate 	/*
5867c478bd9Sstevel@tonic-gate 	 * Limit the max page_get request to a fraction of the
5877c478bd9Sstevel@tonic-gate 	 * page structs we have (see set_max_page_get()).
5887c478bd9Sstevel@tonic-gate 	 */
5897c478bd9Sstevel@tonic-gate 	total_pages += num;
5907c478bd9Sstevel@tonic-gate 	set_max_page_get(total_pages);
5917c478bd9Sstevel@tonic-gate 
592e21bae1bSkchow 	PLCNT_MODIFY_MAX(pnum, (long)num);
593e21bae1bSkchow 
5947c478bd9Sstevel@tonic-gate 	/*
5957c478bd9Sstevel@tonic-gate 	 * The physical space for the pages array
5967c478bd9Sstevel@tonic-gate 	 * representing ram pages has already been
5977c478bd9Sstevel@tonic-gate 	 * allocated.  Here we initialize each lock
5987c478bd9Sstevel@tonic-gate 	 * in the page structure, and put each on
5997c478bd9Sstevel@tonic-gate 	 * the free list
6007c478bd9Sstevel@tonic-gate 	 */
601affbd3ccSkchow 	for (; num; pp++, pnum++, num--) {
6027c478bd9Sstevel@tonic-gate 
6037c478bd9Sstevel@tonic-gate 		/*
6047c478bd9Sstevel@tonic-gate 		 * this needs to fill in the page number
6057c478bd9Sstevel@tonic-gate 		 * and do any other arch specific initialization
6067c478bd9Sstevel@tonic-gate 		 */
6077c478bd9Sstevel@tonic-gate 		add_physmem_cb(pp, pnum);
6087c478bd9Sstevel@tonic-gate 
60907b65a64Saguzovsk 		pp->p_lckcnt = 0;
61007b65a64Saguzovsk 		pp->p_cowcnt = 0;
61107b65a64Saguzovsk 		pp->p_slckcnt = 0;
61207b65a64Saguzovsk 
6137c478bd9Sstevel@tonic-gate 		/*
6147c478bd9Sstevel@tonic-gate 		 * Initialize the page lock as unlocked, since nobody
6157c478bd9Sstevel@tonic-gate 		 * can see or access this page yet.
6167c478bd9Sstevel@tonic-gate 		 */
6177c478bd9Sstevel@tonic-gate 		pp->p_selock = 0;
6187c478bd9Sstevel@tonic-gate 
6197c478bd9Sstevel@tonic-gate 		/*
6207c478bd9Sstevel@tonic-gate 		 * Initialize IO lock
6217c478bd9Sstevel@tonic-gate 		 */
6227c478bd9Sstevel@tonic-gate 		page_iolock_init(pp);
6237c478bd9Sstevel@tonic-gate 
6247c478bd9Sstevel@tonic-gate 		/*
6257c478bd9Sstevel@tonic-gate 		 * initialize other fields in the page_t
6267c478bd9Sstevel@tonic-gate 		 */
6277c478bd9Sstevel@tonic-gate 		PP_SETFREE(pp);
6289d0d62adSJason Beloro 		page_clr_all_props(pp);
6297c478bd9Sstevel@tonic-gate 		PP_SETAGED(pp);
6307c478bd9Sstevel@tonic-gate 		pp->p_offset = (u_offset_t)-1;
6317c478bd9Sstevel@tonic-gate 		pp->p_next = pp;
6327c478bd9Sstevel@tonic-gate 		pp->p_prev = pp;
6337c478bd9Sstevel@tonic-gate 
6347c478bd9Sstevel@tonic-gate 		/*
6357c478bd9Sstevel@tonic-gate 		 * Simple case: System doesn't support large pages.
6367c478bd9Sstevel@tonic-gate 		 */
6377c478bd9Sstevel@tonic-gate 		if (szc == 0) {
6387c478bd9Sstevel@tonic-gate 			pp->p_szc = 0;
6397c478bd9Sstevel@tonic-gate 			page_free_at_startup(pp);
6407c478bd9Sstevel@tonic-gate 			continue;
6417c478bd9Sstevel@tonic-gate 		}
6427c478bd9Sstevel@tonic-gate 
6437c478bd9Sstevel@tonic-gate 		/*
6447c478bd9Sstevel@tonic-gate 		 * Handle unaligned pages; we collect them up onto
6457c478bd9Sstevel@tonic-gate 		 * the root page until we have a full large page.
6467c478bd9Sstevel@tonic-gate 		 */
6477c478bd9Sstevel@tonic-gate 		if (!IS_P2ALIGNED(pnum, large)) {
6487c478bd9Sstevel@tonic-gate 
6497c478bd9Sstevel@tonic-gate 			/*
6507c478bd9Sstevel@tonic-gate 			 * If not in a large page,
6517c478bd9Sstevel@tonic-gate 			 * just free as small page.
6527c478bd9Sstevel@tonic-gate 			 */
6537c478bd9Sstevel@tonic-gate 			if (root == NULL) {
6547c478bd9Sstevel@tonic-gate 				pp->p_szc = 0;
6557c478bd9Sstevel@tonic-gate 				page_free_at_startup(pp);
6567c478bd9Sstevel@tonic-gate 				continue;
6577c478bd9Sstevel@tonic-gate 			}
6587c478bd9Sstevel@tonic-gate 
6597c478bd9Sstevel@tonic-gate 			/*
6607c478bd9Sstevel@tonic-gate 			 * Link a constituent page into the large page.
6617c478bd9Sstevel@tonic-gate 			 */
6627c478bd9Sstevel@tonic-gate 			pp->p_szc = szc;
6637c478bd9Sstevel@tonic-gate 			page_list_concat(&root, &pp);
6647c478bd9Sstevel@tonic-gate 
6657c478bd9Sstevel@tonic-gate 			/*
6667c478bd9Sstevel@tonic-gate 			 * When large page is fully formed, free it.
6677c478bd9Sstevel@tonic-gate 			 */
6687c478bd9Sstevel@tonic-gate 			if (++cnt == large) {
6697c478bd9Sstevel@tonic-gate 				page_free_large_ctr(cnt);
6707c478bd9Sstevel@tonic-gate 				page_list_add_pages(root, PG_LIST_ISINIT);
6717c478bd9Sstevel@tonic-gate 				root = NULL;
6727c478bd9Sstevel@tonic-gate 				cnt = 0;
6737c478bd9Sstevel@tonic-gate 			}
6747c478bd9Sstevel@tonic-gate 			continue;
6757c478bd9Sstevel@tonic-gate 		}
6767c478bd9Sstevel@tonic-gate 
6777c478bd9Sstevel@tonic-gate 		/*
6787c478bd9Sstevel@tonic-gate 		 * At this point we have a page number which
6797c478bd9Sstevel@tonic-gate 		 * is aligned. We assert that we aren't already
6807c478bd9Sstevel@tonic-gate 		 * in a different large page.
6817c478bd9Sstevel@tonic-gate 		 */
6827c478bd9Sstevel@tonic-gate 		ASSERT(IS_P2ALIGNED(pnum, large));
6837c478bd9Sstevel@tonic-gate 		ASSERT(root == NULL && cnt == 0);
6847c478bd9Sstevel@tonic-gate 
6857c478bd9Sstevel@tonic-gate 		/*
6867c478bd9Sstevel@tonic-gate 		 * If insufficient number of pages left to form
6877c478bd9Sstevel@tonic-gate 		 * a large page, just free the small page.
6887c478bd9Sstevel@tonic-gate 		 */
6897c478bd9Sstevel@tonic-gate 		if (num < large) {
6907c478bd9Sstevel@tonic-gate 			pp->p_szc = 0;
6917c478bd9Sstevel@tonic-gate 			page_free_at_startup(pp);
6927c478bd9Sstevel@tonic-gate 			continue;
6937c478bd9Sstevel@tonic-gate 		}
6947c478bd9Sstevel@tonic-gate 
6957c478bd9Sstevel@tonic-gate 		/*
6967c478bd9Sstevel@tonic-gate 		 * Otherwise start a new large page.
6977c478bd9Sstevel@tonic-gate 		 */
6987c478bd9Sstevel@tonic-gate 		pp->p_szc = szc;
6997c478bd9Sstevel@tonic-gate 		cnt++;
7007c478bd9Sstevel@tonic-gate 		root = pp;
7017c478bd9Sstevel@tonic-gate 	}
7027c478bd9Sstevel@tonic-gate 	ASSERT(root == NULL && cnt == 0);
7037c478bd9Sstevel@tonic-gate }
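/*
 * Hedged usage sketch (not taken from the platform sources): a
 * platform startup() that has carved out a page_t array for a span of
 * RAM hands it to the freelists with something like
 *
 *	add_physmem(pp_base, npages, btop(ram_base));
 *
 * where pp_base, npages and ram_base are hypothetical names for the
 * platform's page_t array, its length in pages, and the physical base
 * address of the span.
 */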
7047c478bd9Sstevel@tonic-gate 
7057c478bd9Sstevel@tonic-gate /*
7067c478bd9Sstevel@tonic-gate  * Find a page representing the specified [vp, offset].
7077c478bd9Sstevel@tonic-gate  * If we find the page but it is intransit coming in,
7087c478bd9Sstevel@tonic-gate  * it will have an "exclusive" lock and we wait for
7097c478bd9Sstevel@tonic-gate  * the i/o to complete.  A page found on the free list
7107c478bd9Sstevel@tonic-gate  * is always reclaimed and then locked.  On success, the page
7117c478bd9Sstevel@tonic-gate  * is locked, its data is valid and it isn't on the free
7127c478bd9Sstevel@tonic-gate  * list, while a NULL is returned if the page doesn't exist.
7137c478bd9Sstevel@tonic-gate  */
7147c478bd9Sstevel@tonic-gate page_t *
7157c478bd9Sstevel@tonic-gate page_lookup(vnode_t *vp, u_offset_t off, se_t se)
7167c478bd9Sstevel@tonic-gate {
7177c478bd9Sstevel@tonic-gate 	return (page_lookup_create(vp, off, se, NULL, NULL, 0));
7187c478bd9Sstevel@tonic-gate }
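/*
 * Hedged usage example (added commentary): a typical caller looks up,
 * uses, and then releases a page as follows; vp and off name whatever
 * object and offset the caller is working against.
 *
 *	page_t *pp;
 *
 *	if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
 *		use the page: it is locked SE_SHARED, off the free
 *		list, and its identity cannot change; then:
 *		page_unlock(pp);
 *	}
 */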
7197c478bd9Sstevel@tonic-gate 
7207c478bd9Sstevel@tonic-gate /*
7217c478bd9Sstevel@tonic-gate  * Find a page representing the specified [vp, offset].
7227c478bd9Sstevel@tonic-gate  * We either return the one we found or, if passed in,
7237c478bd9Sstevel@tonic-gate  * create one with identity of [vp, offset] of the
724da6c28aaSamw  * pre-allocated page. If we find an existing page but it is
7257c478bd9Sstevel@tonic-gate  * intransit coming in, it will have an "exclusive" lock
7267c478bd9Sstevel@tonic-gate  * and we wait for the i/o to complete.  A page found on
7277c478bd9Sstevel@tonic-gate  * the free list is always reclaimed and then locked.
7287c478bd9Sstevel@tonic-gate  * On success, the page is locked, its data is valid and
7297c478bd9Sstevel@tonic-gate  * it isn't on the free list, while a NULL is returned
7307c478bd9Sstevel@tonic-gate  * if the page doesn't exist and newpp is NULL.
7317c478bd9Sstevel@tonic-gate  */
7327c478bd9Sstevel@tonic-gate page_t *
7337c478bd9Sstevel@tonic-gate page_lookup_create(
7347c478bd9Sstevel@tonic-gate 	vnode_t *vp,
7357c478bd9Sstevel@tonic-gate 	u_offset_t off,
7367c478bd9Sstevel@tonic-gate 	se_t se,
7377c478bd9Sstevel@tonic-gate 	page_t *newpp,
7387c478bd9Sstevel@tonic-gate 	spgcnt_t *nrelocp,
7397c478bd9Sstevel@tonic-gate 	int flags)
7407c478bd9Sstevel@tonic-gate {
7417c478bd9Sstevel@tonic-gate 	page_t		*pp;
7427c478bd9Sstevel@tonic-gate 	kmutex_t	*phm;
7437c478bd9Sstevel@tonic-gate 	ulong_t		index;
7447c478bd9Sstevel@tonic-gate 	uint_t		hash_locked;
7457c478bd9Sstevel@tonic-gate 	uint_t		es;
7467c478bd9Sstevel@tonic-gate 
7477c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
7487c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_lookup_cnt[0]);
7497c478bd9Sstevel@tonic-gate 	ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
7507c478bd9Sstevel@tonic-gate 
7517c478bd9Sstevel@tonic-gate 	/*
7527c478bd9Sstevel@tonic-gate 	 * Acquire the appropriate page hash lock since
7537c478bd9Sstevel@tonic-gate 	 * we have to search the hash list.  Pages that
7547c478bd9Sstevel@tonic-gate 	 * hash to this list can't change identity while
7557c478bd9Sstevel@tonic-gate 	 * this lock is held.
7567c478bd9Sstevel@tonic-gate 	 */
7577c478bd9Sstevel@tonic-gate 	hash_locked = 0;
7587c478bd9Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
7597c478bd9Sstevel@tonic-gate 	phm = NULL;
7607c478bd9Sstevel@tonic-gate top:
761e7c874afSJosef 'Jeff' Sipek 	pp = page_hash_search(index, vp, off);
7627c478bd9Sstevel@tonic-gate 	if (pp != NULL) {
7637c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_cnt[1]);
7647c478bd9Sstevel@tonic-gate 		es = (newpp != NULL) ? 1 : 0;
7657c478bd9Sstevel@tonic-gate 		es |= flags;
7667c478bd9Sstevel@tonic-gate 		if (!hash_locked) {
7677c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_cnt[2]);
7687c478bd9Sstevel@tonic-gate 			if (!page_try_reclaim_lock(pp, se, es)) {
7697c478bd9Sstevel@tonic-gate 				/*
7707c478bd9Sstevel@tonic-gate 				 * On a miss, acquire the phm.  Then
7717c478bd9Sstevel@tonic-gate 				 * next time, page_lock() will be called,
7727c478bd9Sstevel@tonic-gate 				 * causing a wait if the page is busy.
7737c478bd9Sstevel@tonic-gate 				 * Just looping with page_trylock() would
7747c478bd9Sstevel@tonic-gate 				 * get pretty boring.
7757c478bd9Sstevel@tonic-gate 				 */
7767c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_cnt[3]);
7777c478bd9Sstevel@tonic-gate 				phm = PAGE_HASH_MUTEX(index);
7787c478bd9Sstevel@tonic-gate 				mutex_enter(phm);
7797c478bd9Sstevel@tonic-gate 				hash_locked = 1;
7807c478bd9Sstevel@tonic-gate 				goto top;
7817c478bd9Sstevel@tonic-gate 			}
7827c478bd9Sstevel@tonic-gate 		} else {
7837c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_cnt[4]);
7847c478bd9Sstevel@tonic-gate 			if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
7857c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_cnt[5]);
7867c478bd9Sstevel@tonic-gate 				goto top;
7877c478bd9Sstevel@tonic-gate 			}
7887c478bd9Sstevel@tonic-gate 		}
7897c478bd9Sstevel@tonic-gate 
7907c478bd9Sstevel@tonic-gate 		/*
7917c478bd9Sstevel@tonic-gate 		 * Since `pp' is locked it cannot change identity now.
7927c478bd9Sstevel@tonic-gate 		 * Reconfirm we locked the correct page.
7937c478bd9Sstevel@tonic-gate 		 *
7947c478bd9Sstevel@tonic-gate 		 * Both the p_vnode and p_offset *must* be cast volatile
795e7c874afSJosef 'Jeff' Sipek 		 * to force a reload of their values: The page_hash_search
796e7c874afSJosef 'Jeff' Sipek 		 * function will have stuffed p_vnode and p_offset into
7977c478bd9Sstevel@tonic-gate 		 * registers before calling page_trylock(); another thread,
7987c478bd9Sstevel@tonic-gate 		 * actually holding the hash lock, could have changed the
7997c478bd9Sstevel@tonic-gate 		 * page's identity in memory, but our registers would not
8007c478bd9Sstevel@tonic-gate 		 * be changed, fooling the reconfirmation.  If the hash
8017c478bd9Sstevel@tonic-gate 		 * lock was held during the search, the casting would
8027c478bd9Sstevel@tonic-gate 		 * not be needed.
8037c478bd9Sstevel@tonic-gate 		 */
8047c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_cnt[6]);
8057c478bd9Sstevel@tonic-gate 		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
8067c478bd9Sstevel@tonic-gate 		    ((volatile u_offset_t)(pp->p_offset) != off)) {
8077c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_cnt[7]);
8087c478bd9Sstevel@tonic-gate 			if (hash_locked) {
8097c478bd9Sstevel@tonic-gate 				panic("page_lookup_create: lost page %p",
8107c478bd9Sstevel@tonic-gate 				    (void *)pp);
8117c478bd9Sstevel@tonic-gate 				/*NOTREACHED*/
8127c478bd9Sstevel@tonic-gate 			}
8137c478bd9Sstevel@tonic-gate 			page_unlock(pp);
8147c478bd9Sstevel@tonic-gate 			phm = PAGE_HASH_MUTEX(index);
8157c478bd9Sstevel@tonic-gate 			mutex_enter(phm);
8167c478bd9Sstevel@tonic-gate 			hash_locked = 1;
8177c478bd9Sstevel@tonic-gate 			goto top;
8187c478bd9Sstevel@tonic-gate 		}
8197c478bd9Sstevel@tonic-gate 
8207c478bd9Sstevel@tonic-gate 		/*
8217c478bd9Sstevel@tonic-gate 		 * If page_trylock() was called, then pp may still be on
8227c478bd9Sstevel@tonic-gate 		 * the cachelist (can't be on the free list, it would not
8237c478bd9Sstevel@tonic-gate 		 * have been found in the search).  If it is on the
8247c478bd9Sstevel@tonic-gate 		 * cachelist it must be pulled now. To pull the page from
8257c478bd9Sstevel@tonic-gate 		 * the cachelist, it must be exclusively locked.
8267c478bd9Sstevel@tonic-gate 		 *
8277c478bd9Sstevel@tonic-gate 		 * The other big difference between page_trylock() and
8287c478bd9Sstevel@tonic-gate 		 * page_lock(), is that page_lock() will pull the
8297c478bd9Sstevel@tonic-gate 		 * page from whatever free list (the cache list in this
8307c478bd9Sstevel@tonic-gate 		 * case) the page is on.  If page_trylock() was used
8317c478bd9Sstevel@tonic-gate 		 * above, then we have to do the reclaim ourselves.
8327c478bd9Sstevel@tonic-gate 		 */
8337c478bd9Sstevel@tonic-gate 		if ((!hash_locked) && (PP_ISFREE(pp))) {
8347c478bd9Sstevel@tonic-gate 			ASSERT(PP_ISAGED(pp) == 0);
8357c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_cnt[8]);
8367c478bd9Sstevel@tonic-gate 
8377c478bd9Sstevel@tonic-gate 			/*
8387c478bd9Sstevel@tonic-gate 			 * page_reclaim will ensure that we
8397c478bd9Sstevel@tonic-gate 			 * have this page exclusively
8407c478bd9Sstevel@tonic-gate 			 */
8417c478bd9Sstevel@tonic-gate 
8427c478bd9Sstevel@tonic-gate 			if (!page_reclaim(pp, NULL)) {
8437c478bd9Sstevel@tonic-gate 				/*
8447c478bd9Sstevel@tonic-gate 				 * Page_reclaim dropped whatever lock
8457c478bd9Sstevel@tonic-gate 				 * we held.
8467c478bd9Sstevel@tonic-gate 				 */
8477c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_cnt[9]);
8487c478bd9Sstevel@tonic-gate 				phm = PAGE_HASH_MUTEX(index);
8497c478bd9Sstevel@tonic-gate 				mutex_enter(phm);
8507c478bd9Sstevel@tonic-gate 				hash_locked = 1;
8517c478bd9Sstevel@tonic-gate 				goto top;
8527c478bd9Sstevel@tonic-gate 			} else if (se == SE_SHARED && newpp == NULL) {
8537c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_cnt[10]);
8547c478bd9Sstevel@tonic-gate 				page_downgrade(pp);
8557c478bd9Sstevel@tonic-gate 			}
8567c478bd9Sstevel@tonic-gate 		}
8577c478bd9Sstevel@tonic-gate 
8587c478bd9Sstevel@tonic-gate 		if (hash_locked) {
8597c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
8607c478bd9Sstevel@tonic-gate 		}
8617c478bd9Sstevel@tonic-gate 
8627c478bd9Sstevel@tonic-gate 		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
8637c478bd9Sstevel@tonic-gate 		    PAGE_EXCL(pp) && nrelocp != NULL) {
8647c478bd9Sstevel@tonic-gate 			ASSERT(nrelocp != NULL);
8657c478bd9Sstevel@tonic-gate 			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
8667c478bd9Sstevel@tonic-gate 			    NULL);
8677c478bd9Sstevel@tonic-gate 			if (*nrelocp > 0) {
8687c478bd9Sstevel@tonic-gate 				VM_STAT_COND_ADD(*nrelocp == 1,
8697c478bd9Sstevel@tonic-gate 				    page_lookup_cnt[11]);
8707c478bd9Sstevel@tonic-gate 				VM_STAT_COND_ADD(*nrelocp > 1,
8717c478bd9Sstevel@tonic-gate 				    page_lookup_cnt[12]);
8727c478bd9Sstevel@tonic-gate 				pp = newpp;
8737c478bd9Sstevel@tonic-gate 				se = SE_EXCL;
8747c478bd9Sstevel@tonic-gate 			} else {
8757c478bd9Sstevel@tonic-gate 				if (se == SE_SHARED) {
8767c478bd9Sstevel@tonic-gate 					page_downgrade(pp);
8777c478bd9Sstevel@tonic-gate 				}
8787c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_cnt[13]);
8797c478bd9Sstevel@tonic-gate 			}
8807c478bd9Sstevel@tonic-gate 		} else if (newpp != NULL && nrelocp != NULL) {
8817c478bd9Sstevel@tonic-gate 			if (PAGE_EXCL(pp) && se == SE_SHARED) {
8827c478bd9Sstevel@tonic-gate 				page_downgrade(pp);
8837c478bd9Sstevel@tonic-gate 			}
8847c478bd9Sstevel@tonic-gate 			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
8857c478bd9Sstevel@tonic-gate 			    page_lookup_cnt[14]);
8867c478bd9Sstevel@tonic-gate 			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
8877c478bd9Sstevel@tonic-gate 			    page_lookup_cnt[15]);
8887c478bd9Sstevel@tonic-gate 			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
8897c478bd9Sstevel@tonic-gate 			    page_lookup_cnt[16]);
8907c478bd9Sstevel@tonic-gate 		} else if (newpp != NULL && PAGE_EXCL(pp)) {
8917c478bd9Sstevel@tonic-gate 			se = SE_EXCL;
8927c478bd9Sstevel@tonic-gate 		}
8937c478bd9Sstevel@tonic-gate 	} else if (!hash_locked) {
8947c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_cnt[17]);
8957c478bd9Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
8967c478bd9Sstevel@tonic-gate 		mutex_enter(phm);
8977c478bd9Sstevel@tonic-gate 		hash_locked = 1;
8987c478bd9Sstevel@tonic-gate 		goto top;
8997c478bd9Sstevel@tonic-gate 	} else if (newpp != NULL) {
9007c478bd9Sstevel@tonic-gate 		/*
9017c478bd9Sstevel@tonic-gate 		 * If we have a preallocated page then
9027c478bd9Sstevel@tonic-gate 		 * insert it now and basically behave like
9037c478bd9Sstevel@tonic-gate 		 * page_create.
9047c478bd9Sstevel@tonic-gate 		 */
9057c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_cnt[18]);
9067c478bd9Sstevel@tonic-gate 		/*
9077c478bd9Sstevel@tonic-gate 		 * Since we hold the page hash mutex and
9087c478bd9Sstevel@tonic-gate 		 * just searched for this page, page_hashin
9097c478bd9Sstevel@tonic-gate 		 * had better not fail.  If it does, that
9107c478bd9Sstevel@tonic-gate 		 * means some thread did not follow the
9117c478bd9Sstevel@tonic-gate 		 * page hash mutex rules.  Panic now and
9127c478bd9Sstevel@tonic-gate 		 * get it over with.  As usual, go down
9137c478bd9Sstevel@tonic-gate 		 * holding all the locks.
9147c478bd9Sstevel@tonic-gate 		 */
9157c478bd9Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(phm));
9167c478bd9Sstevel@tonic-gate 		if (!page_hashin(newpp, vp, off, phm)) {
9177c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
9187c478bd9Sstevel@tonic-gate 			panic("page_lookup_create: hashin failed %p %p %llx %p",
9197c478bd9Sstevel@tonic-gate 			    (void *)newpp, (void *)vp, off, (void *)phm);
9207c478bd9Sstevel@tonic-gate 			/*NOTREACHED*/
9217c478bd9Sstevel@tonic-gate 		}
9227c478bd9Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(phm));
9237c478bd9Sstevel@tonic-gate 		mutex_exit(phm);
9247c478bd9Sstevel@tonic-gate 		phm = NULL;
9257c478bd9Sstevel@tonic-gate 		page_set_props(newpp, P_REF);
9267c478bd9Sstevel@tonic-gate 		page_io_lock(newpp);
9277c478bd9Sstevel@tonic-gate 		pp = newpp;
9287c478bd9Sstevel@tonic-gate 		se = SE_EXCL;
9297c478bd9Sstevel@tonic-gate 	} else {
9307c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_cnt[19]);
9317c478bd9Sstevel@tonic-gate 		mutex_exit(phm);
9327c478bd9Sstevel@tonic-gate 	}
9337c478bd9Sstevel@tonic-gate 
9347c478bd9Sstevel@tonic-gate 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
9357c478bd9Sstevel@tonic-gate 
9367c478bd9Sstevel@tonic-gate 	ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
9377c478bd9Sstevel@tonic-gate 
9387c478bd9Sstevel@tonic-gate 	return (pp);
9397c478bd9Sstevel@tonic-gate }
9407c478bd9Sstevel@tonic-gate 
9417c478bd9Sstevel@tonic-gate /*
9427c478bd9Sstevel@tonic-gate  * Search the hash list for the page representing the
9437c478bd9Sstevel@tonic-gate  * specified [vp, offset] and return it locked.  Skip
9447c478bd9Sstevel@tonic-gate  * free pages and pages that cannot be locked as requested.
9457c478bd9Sstevel@tonic-gate  * Used while attempting to kluster pages.
9467c478bd9Sstevel@tonic-gate  */
9477c478bd9Sstevel@tonic-gate page_t *
9487c478bd9Sstevel@tonic-gate page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
9497c478bd9Sstevel@tonic-gate {
9507c478bd9Sstevel@tonic-gate 	page_t		*pp;
9517c478bd9Sstevel@tonic-gate 	kmutex_t	*phm;
9527c478bd9Sstevel@tonic-gate 	ulong_t		index;
9537c478bd9Sstevel@tonic-gate 	uint_t		locked;
9547c478bd9Sstevel@tonic-gate 
9557c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
9567c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_lookup_nowait_cnt[0]);
9577c478bd9Sstevel@tonic-gate 
9587c478bd9Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
959e7c874afSJosef 'Jeff' Sipek 	pp = page_hash_search(index, vp, off);
9607c478bd9Sstevel@tonic-gate 	locked = 0;
9617c478bd9Sstevel@tonic-gate 	if (pp == NULL) {
9627c478bd9Sstevel@tonic-gate top:
9637c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_nowait_cnt[1]);
9647c478bd9Sstevel@tonic-gate 		locked = 1;
9657c478bd9Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
9667c478bd9Sstevel@tonic-gate 		mutex_enter(phm);
967e7c874afSJosef 'Jeff' Sipek 		pp = page_hash_search(index, vp, off);
9687c478bd9Sstevel@tonic-gate 	}
9697c478bd9Sstevel@tonic-gate 
9707c478bd9Sstevel@tonic-gate 	if (pp == NULL || PP_ISFREE(pp)) {
9717c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_nowait_cnt[2]);
9727c478bd9Sstevel@tonic-gate 		pp = NULL;
9737c478bd9Sstevel@tonic-gate 	} else {
9747c478bd9Sstevel@tonic-gate 		if (!page_trylock(pp, se)) {
9757c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_nowait_cnt[3]);
9767c478bd9Sstevel@tonic-gate 			pp = NULL;
9777c478bd9Sstevel@tonic-gate 		} else {
9787c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_lookup_nowait_cnt[4]);
9797c478bd9Sstevel@tonic-gate 			/*
9807c478bd9Sstevel@tonic-gate 			 * See the comment in page_lookup()
9817c478bd9Sstevel@tonic-gate 			 */
9827c478bd9Sstevel@tonic-gate 			if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
9837c478bd9Sstevel@tonic-gate 			    ((u_offset_t)(pp->p_offset) != off)) {
9847c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_nowait_cnt[5]);
9857c478bd9Sstevel@tonic-gate 				if (locked) {
9867c478bd9Sstevel@tonic-gate 					panic("page_lookup_nowait %p",
9877c478bd9Sstevel@tonic-gate 					    (void *)pp);
9887c478bd9Sstevel@tonic-gate 					/*NOTREACHED*/
9897c478bd9Sstevel@tonic-gate 				}
9907c478bd9Sstevel@tonic-gate 				page_unlock(pp);
9917c478bd9Sstevel@tonic-gate 				goto top;
9927c478bd9Sstevel@tonic-gate 			}
9937c478bd9Sstevel@tonic-gate 			if (PP_ISFREE(pp)) {
9947c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(page_lookup_nowait_cnt[6]);
9957c478bd9Sstevel@tonic-gate 				page_unlock(pp);
9967c478bd9Sstevel@tonic-gate 				pp = NULL;
9977c478bd9Sstevel@tonic-gate 			}
9987c478bd9Sstevel@tonic-gate 		}
9997c478bd9Sstevel@tonic-gate 	}
10007c478bd9Sstevel@tonic-gate 	if (locked) {
10017c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_lookup_nowait_cnt[7]);
10027c478bd9Sstevel@tonic-gate 		mutex_exit(phm);
10037c478bd9Sstevel@tonic-gate 	}
10047c478bd9Sstevel@tonic-gate 
10057c478bd9Sstevel@tonic-gate 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
10067c478bd9Sstevel@tonic-gate 
10077c478bd9Sstevel@tonic-gate 	return (pp);
10087c478bd9Sstevel@tonic-gate }
10097c478bd9Sstevel@tonic-gate 
10107c478bd9Sstevel@tonic-gate /*
10117c478bd9Sstevel@tonic-gate  * Search the hash list for a page with the specified [vp, off]
10127c478bd9Sstevel@tonic-gate  * that is known to exist and is already locked.  This routine
10137c478bd9Sstevel@tonic-gate  * is typically used by segment SOFTUNLOCK routines.
10147c478bd9Sstevel@tonic-gate  */
10157c478bd9Sstevel@tonic-gate page_t *
10167c478bd9Sstevel@tonic-gate page_find(vnode_t *vp, u_offset_t off)
10177c478bd9Sstevel@tonic-gate {
10187c478bd9Sstevel@tonic-gate 	page_t		*pp;
10197c478bd9Sstevel@tonic-gate 	kmutex_t	*phm;
10207c478bd9Sstevel@tonic-gate 	ulong_t		index;
10217c478bd9Sstevel@tonic-gate 
10227c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
10237c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_find_cnt);
10247c478bd9Sstevel@tonic-gate 
10257c478bd9Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
10267c478bd9Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(index);
10277c478bd9Sstevel@tonic-gate 
10287c478bd9Sstevel@tonic-gate 	mutex_enter(phm);
1029e7c874afSJosef 'Jeff' Sipek 	pp = page_hash_search(index, vp, off);
10307c478bd9Sstevel@tonic-gate 	mutex_exit(phm);
10317c478bd9Sstevel@tonic-gate 
10324fc2445aSelowe 	ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
10337c478bd9Sstevel@tonic-gate 	return (pp);
10347c478bd9Sstevel@tonic-gate }
10357c478bd9Sstevel@tonic-gate 
10367c478bd9Sstevel@tonic-gate /*
10377c478bd9Sstevel@tonic-gate  * Determine whether a page with the specified [vp, off]
10387c478bd9Sstevel@tonic-gate  * currently exists in the system.  Obviously this should
10397c478bd9Sstevel@tonic-gate  * only be considered as a hint since nothing prevents the
10407c478bd9Sstevel@tonic-gate  * page from disappearing or appearing immediately after
10417c478bd9Sstevel@tonic-gate  * the return from this routine. Consequently, we don't
10427c478bd9Sstevel@tonic-gate  * even bother to lock the list.
10437c478bd9Sstevel@tonic-gate  */
10447c478bd9Sstevel@tonic-gate page_t *
10457c478bd9Sstevel@tonic-gate page_exists(vnode_t *vp, u_offset_t off)
10467c478bd9Sstevel@tonic-gate {
10477c478bd9Sstevel@tonic-gate 	ulong_t		index;
10487c478bd9Sstevel@tonic-gate 
10497c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
10507c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_exists_cnt);
10517c478bd9Sstevel@tonic-gate 
10527c478bd9Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
10537c478bd9Sstevel@tonic-gate 
1054e7c874afSJosef 'Jeff' Sipek 	return (page_hash_search(index, vp, off));
10557c478bd9Sstevel@tonic-gate }
10567c478bd9Sstevel@tonic-gate 
10577c478bd9Sstevel@tonic-gate /*
10587c478bd9Sstevel@tonic-gate  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
10597c478bd9Sstevel@tonic-gate  * page_size(szc)) range.  If they exist and ppa is not NULL, fill the ppa array
10607c478bd9Sstevel@tonic-gate  * with these pages locked SHARED. If necessary reclaim pages from
10617c478bd9Sstevel@tonic-gate  * freelist. Return 1 if contiguous pages exist and 0 otherwise.
10627c478bd9Sstevel@tonic-gate  *
10637c478bd9Sstevel@tonic-gate  * If we fail to lock the pages, still return 1 if they exist and are contiguous.
10647c478bd9Sstevel@tonic-gate  * But in this case the return value is just a hint. The ppa array won't be filled.
10657c478bd9Sstevel@tonic-gate  * Caller should initialize ppa[0] as NULL to distinguish return value.
10667c478bd9Sstevel@tonic-gate  *
10677c478bd9Sstevel@tonic-gate  * Returns 0 if pages don't exist or not physically contiguous.
10687c478bd9Sstevel@tonic-gate  *
10697c478bd9Sstevel@tonic-gate  * This routine doesn't work for anonymous(swapfs) pages.
10707c478bd9Sstevel@tonic-gate  */
10717c478bd9Sstevel@tonic-gate int
10727c478bd9Sstevel@tonic-gate page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
10737c478bd9Sstevel@tonic-gate {
10747c478bd9Sstevel@tonic-gate 	pgcnt_t pages;
10757c478bd9Sstevel@tonic-gate 	pfn_t pfn;
10767c478bd9Sstevel@tonic-gate 	page_t *rootpp;
10777c478bd9Sstevel@tonic-gate 	pgcnt_t i;
10787c478bd9Sstevel@tonic-gate 	pgcnt_t j;
10797c478bd9Sstevel@tonic-gate 	u_offset_t save_off = off;
10807c478bd9Sstevel@tonic-gate 	ulong_t index;
10817c478bd9Sstevel@tonic-gate 	kmutex_t *phm;
10827c478bd9Sstevel@tonic-gate 	page_t *pp;
10837c478bd9Sstevel@tonic-gate 	uint_t pszc;
10847c478bd9Sstevel@tonic-gate 	int loopcnt = 0;
10857c478bd9Sstevel@tonic-gate 
10867c478bd9Sstevel@tonic-gate 	ASSERT(szc != 0);
10877c478bd9Sstevel@tonic-gate 	ASSERT(vp != NULL);
10887c478bd9Sstevel@tonic-gate 	ASSERT(!IS_SWAPFSVP(vp));
1089ad23a2dbSjohansen 	ASSERT(!VN_ISKAS(vp));
10907c478bd9Sstevel@tonic-gate 
10917c478bd9Sstevel@tonic-gate again:
10927c478bd9Sstevel@tonic-gate 	if (++loopcnt > 3) {
10937c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[0]);
10947c478bd9Sstevel@tonic-gate 		return (0);
10957c478bd9Sstevel@tonic-gate 	}
10967c478bd9Sstevel@tonic-gate 
10977c478bd9Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
10987c478bd9Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(index);
10997c478bd9Sstevel@tonic-gate 
11007c478bd9Sstevel@tonic-gate 	mutex_enter(phm);
1101e7c874afSJosef 'Jeff' Sipek 	pp = page_hash_search(index, vp, off);
11027c478bd9Sstevel@tonic-gate 	mutex_exit(phm);
11037c478bd9Sstevel@tonic-gate 
11047c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_exphcontg[1]);
11057c478bd9Sstevel@tonic-gate 
11067c478bd9Sstevel@tonic-gate 	if (pp == NULL) {
11077c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[2]);
11087c478bd9Sstevel@tonic-gate 		return (0);
11097c478bd9Sstevel@tonic-gate 	}
11107c478bd9Sstevel@tonic-gate 
11117c478bd9Sstevel@tonic-gate 	pages = page_get_pagecnt(szc);
11127c478bd9Sstevel@tonic-gate 	rootpp = pp;
11137c478bd9Sstevel@tonic-gate 	pfn = rootpp->p_pagenum;
11147c478bd9Sstevel@tonic-gate 
11157c478bd9Sstevel@tonic-gate 	if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
11167c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[3]);
11177c478bd9Sstevel@tonic-gate 		if (!page_trylock(pp, SE_SHARED)) {
11187c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[4]);
11197c478bd9Sstevel@tonic-gate 			return (1);
11207c478bd9Sstevel@tonic-gate 		}
11219853d9e8SJason Beloro 		/*
11229853d9e8SJason Beloro 		 * Also check whether p_pagenum was modified by DR.
11239853d9e8SJason Beloro 		 */
11247c478bd9Sstevel@tonic-gate 		if (pp->p_szc != pszc || pp->p_vnode != vp ||
11259853d9e8SJason Beloro 		    pp->p_offset != off || pp->p_pagenum != pfn) {
11267c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[5]);
11277c478bd9Sstevel@tonic-gate 			page_unlock(pp);
11287c478bd9Sstevel@tonic-gate 			off = save_off;
11297c478bd9Sstevel@tonic-gate 			goto again;
11307c478bd9Sstevel@tonic-gate 		}
11317c478bd9Sstevel@tonic-gate 		/*
11327c478bd9Sstevel@tonic-gate 		 * szc was non-zero and the vnode and offset matched after we
11337c478bd9Sstevel@tonic-gate 		 * locked the page, which means it can't become free on us.
11347c478bd9Sstevel@tonic-gate 		 */
11357c478bd9Sstevel@tonic-gate 		ASSERT(!PP_ISFREE(pp));
11367c478bd9Sstevel@tonic-gate 		if (!IS_P2ALIGNED(pfn, pages)) {
11377c478bd9Sstevel@tonic-gate 			page_unlock(pp);
11387c478bd9Sstevel@tonic-gate 			return (0);
11397c478bd9Sstevel@tonic-gate 		}
11407c478bd9Sstevel@tonic-gate 		ppa[0] = pp;
11417c478bd9Sstevel@tonic-gate 		pp++;
11427c478bd9Sstevel@tonic-gate 		off += PAGESIZE;
1143