/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2018, Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};

struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};

struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};

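/*
 * Illustrative sketch (added for clarity, not in the original source):
 * each cached entry is on two lists at once.  p_hnext/p_hprev chain it
 * into its hash bucket (a seg_phash, or a seg_phash_wired for
 * SEGP_FORCE_WIRED entries), while p_plink chains it into the owning
 * segment's or anon map's private list:
 *
 *	hash bucket:	hp <-> pcp1 <-> pcp2 <-> hp	(p_hnext/p_hprev)
 *	per seg/amp:	... <-> pcp1 <-> pcp3 <-> ...	(p_plink)
 *
 * seg_phash_wired has no p_halink[] because only non wired buckets are
 * tracked on the active bucket lists walked by the async purge thread.
 */
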
/*
 * A parameter to control the maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

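/*
 * Worked example (for illustration only): with P_SHRINK_SHFT == 5 a
 * reap pass targets roughly 1/(2^5), i.e. about 3%, of the cached
 * pages, while P_MAX_APURGE_BYTES caps a single asynchronous purge at
 * 1GB, which is 262144 pages when 4K pages are used.
 */
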
/*
 * The following variables can be tuned via /etc/system.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */

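/*
 * A hypothetical /etc/system fragment (illustration only): disable
 * shadow list caching entirely, or cap the cache window at 131072
 * pages (512MB with 4K pages):
 *
 *	set segpcache_enabled = 0
 *	set segpcache_maxwindow = 131072
 */
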
static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t  p_locked_win;	/* # pages from window */
	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
	uchar_t	 p_ahcur;	/* current active links for insert/delete */
	uchar_t  p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

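/*
 * A sketch of the padding arithmetic above, assuming the usual LP64
 * sizes (8-byte pointers, clock_t, ulong_t, pgcnt_t and size_t): the
 * seven members of p_ctrl1 plus alignment take 56 bytes, so pad[1]
 * rounds it to 64; p_ctrl3 holds four 8-byte fields plus an int (40
 * bytes with alignment), so pad[3] rounds it to 64.  Together with the
 * "#pragma align 64" directives each control block then occupies
 * exactly one 64-byte cache line.
 */
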
#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)

#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

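/*
 * Editorial note: the cast to ulong_t makes the subtraction wrap-safe,
 * so PCP_AGE() still yields the correct tick distance (modulo 2^64 on
 * LP64) if ddi_get_lbolt() ever wraps around.
 */
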
/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))

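/*
 * Worked example (illustrative, derived from the macro above): for a
 * wired request the bucket index depends on the seg/amp pointer alone:
 *
 *	idx = P_HASHWIRED_MASK & ((uintptr_t)htag0 >> P_BASESHIFT);
 *
 * For a non wired request the shadow list address is folded in as
 * well, so consecutive pagelocked ranges of one segment spread over
 * different buckets:
 *
 *	idx = P_HASHWIN_MASK &
 *	    (((uintptr_t)htag0 >> 3) ^ ((uintptr_t)addr >> pgshift));
 *
 * where pgshift comes from bits 16 and up of flags when SEGP_PSHIFT is
 * set, and from the segment's page size otherwise.
 */
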
/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

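/*
 * In other words a cached entry may satisfy a narrower request: an 8K
 * lookup at address A matches an entry cached as 16K at A because only
 * p_len >= len is required, while P_MATCH_PP() additionally pins the
 * match to one particular shadow list (p_pp).
 */
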
#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
    offsetof(struct seg_phash, p_halink[l])))

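/*
 * These are container-of conversions: given the address of an embedded
 * pcache_link_t they subtract the member offset to recover the
 * enclosing structure, e.g. (assuming plinkp == &pcp->p_plink):
 *
 *	struct seg_pcache *pcp = plink2pcache(plinkp);
 */
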
/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There are two active bucket lists. The current active list (as per
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
 * with its list it switches to the current active list and makes the list it
 * just finished processing the current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 * not yet on any list.  seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on the current list it will always be removed.
 * Otherwise the bucket is only removed if the asynchronous purge thread is
 * not currently running or seg_premove_abuck() is called by the asynchronous
 * purge thread itself. A given bucket can only be on one of the active lists
 * at a time. These routines should be called with the per bucket lock held;
 * they use seg_pmem_mtx to protect list updates. seg_padd_abuck() must be
 * called after the first entry is added to the bucket chain and
 * seg_premove_abuck() must be called after the last pcp entry is deleted
 * from its chain. Holding the per bucket lock avoids a potential race
 * condition where seg_premove_abuck() removes a bucket after pcp entries
 * were added to its list behind the back of a caller that checked that the
 * bucket has no entries (this race would cause the loss of an active bucket
 * from the active lists).
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
 */
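/*
 * A sketch of the handoff described above (assuming seg_pahcur == 0):
 * inserts and deletes work against seg_pahhead[0] while the async
 * thread drains seg_pahhead[1]; when the thread finishes it flips
 * seg_pahcur to 1, so new activity lands on the list it just emptied
 * and it can drain seg_pahhead[0] on its next pass.
 */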
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket may already be on one of the active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * earlier.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix the async thread can't yet
	 * remove it since we hold the per bucket lock here. In this case just
	 * return since the async thread will eventually find and process this
	 * bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of the current active list.
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}

static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by the asynchronous thread that found this
		 * bucket on the not currently active (i.e. !seg_pahcur)
		 * list. Remove it from there.  The per bucket lock we are
		 * holding makes sure seg_pinsert() can't sneak in and add
		 * pcp entries to this bucket right before we remove the
		 * bucket from its list.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on the currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If the asynchronous thread is not running we can remove the bucket
	 * from the not currently active list. The bucket must be on this
	 * list since we already checked that it's not on the other list and
	 * the bucket from which we just deleted the last pcp entry must
	 * still be on one of the active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}

/*
 * Check if the bucket pointed to by hp already has a pcp entry that matches
 * the request htag0, addr and len. Set *found to 1 if a match is found and
 * to 0 otherwise. Also delete matching entries that cover a smaller address
 * range but start at the same address as the addr argument. Return the list
 * of deleted entries if any. This is an internal helper function called from
 * seg_pinsert() only for non wired shadow lists. The caller already holds a
 * per seg/amp list lock.
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
	struct seg_pcache *pcp;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));

	*found = 0;
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
			ASSERT(!IS_PCP_WIRED(pcp));
			if (pcp->p_len < len) {
				pcache_link_t *plinkp;
				if (pcp->p_active) {
					continue;
				}
				plinkp = &pcp->p_plink;
				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			} else {
				*found = 1;
				break;
			}
		}
	}
	return (delcallb_list);
}

/*
 * Lookup an address range in the pagelock cache. Return the shadow list and
 * bump up the active count. If amp is not NULL use amp as the lookup tag,
 * otherwise use seg as the lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	void *htag0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	/*
	 * Skip the pagelock cache while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisabled) {
		return (NULL);
	}
	ASSERT(seg_phashsize_win != 0);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH(pcp, htag0, addr, len)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(