xref: /illumos-gate/usr/src/uts/common/vm/vm_seg.c (revision a98e9dbf)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50209230bSgjelinek  * Common Development and Distribution License (the "License").
60209230bSgjelinek  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22c6f08383Sjj  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
277c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
287c478bd9Sstevel@tonic-gate 
297c478bd9Sstevel@tonic-gate /*
307c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
317c478bd9Sstevel@tonic-gate  * The Regents of the University of California
327c478bd9Sstevel@tonic-gate  * All Rights Reserved
337c478bd9Sstevel@tonic-gate  *
347c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
357c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
367c478bd9Sstevel@tonic-gate  * contributors.
377c478bd9Sstevel@tonic-gate  */
387c478bd9Sstevel@tonic-gate 
397c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
407c478bd9Sstevel@tonic-gate 
417c478bd9Sstevel@tonic-gate /*
427c478bd9Sstevel@tonic-gate  * VM - segment management.
437c478bd9Sstevel@tonic-gate  */
447c478bd9Sstevel@tonic-gate 
457c478bd9Sstevel@tonic-gate #include <sys/types.h>
467c478bd9Sstevel@tonic-gate #include <sys/inttypes.h>
477c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
487c478bd9Sstevel@tonic-gate #include <sys/param.h>
497c478bd9Sstevel@tonic-gate #include <sys/systm.h>
507c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
51*a98e9dbfSaguzovsk #include <sys/sysmacros.h>
527c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>
53*a98e9dbfSaguzovsk #include <sys/tuneable.h>
547c478bd9Sstevel@tonic-gate #include <sys/debug.h>
55*a98e9dbfSaguzovsk #include <sys/fs/swapnode.h>
567c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
577c478bd9Sstevel@tonic-gate #include <sys/callb.h>
587c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
590209230bSgjelinek #include <sys/mman.h>
607c478bd9Sstevel@tonic-gate 
617c478bd9Sstevel@tonic-gate #include <vm/hat.h>
627c478bd9Sstevel@tonic-gate #include <vm/as.h>
637c478bd9Sstevel@tonic-gate #include <vm/seg.h>
647c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
650209230bSgjelinek #include <vm/seg_spt.h>
660209230bSgjelinek #include <vm/seg_vn.h>
67*a98e9dbfSaguzovsk #include <vm/anon.h>
68*a98e9dbfSaguzovsk 
697c478bd9Sstevel@tonic-gate /*
707c478bd9Sstevel@tonic-gate  * kstats for segment advise
717c478bd9Sstevel@tonic-gate  */
727c478bd9Sstevel@tonic-gate segadvstat_t segadvstat = {
737c478bd9Sstevel@tonic-gate 	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
747c478bd9Sstevel@tonic-gate 	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
757c478bd9Sstevel@tonic-gate };
767c478bd9Sstevel@tonic-gate 
777c478bd9Sstevel@tonic-gate kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
787c478bd9Sstevel@tonic-gate uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
797c478bd9Sstevel@tonic-gate 
807c478bd9Sstevel@tonic-gate /*
817c478bd9Sstevel@tonic-gate  * entry in the segment page cache
827c478bd9Sstevel@tonic-gate  */
837c478bd9Sstevel@tonic-gate struct seg_pcache {
84*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
85*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hprev;
86*a98e9dbfSaguzovsk 	pcache_link_t		p_plink;	/* per segment/amp list */
87*a98e9dbfSaguzovsk 	void 			*p_htag0;	/* segment/amp pointer */
88*a98e9dbfSaguzovsk 	caddr_t			p_addr;		/* base address/anon_idx */
89*a98e9dbfSaguzovsk 	size_t			p_len;		/* total bytes */
90*a98e9dbfSaguzovsk 	size_t			p_wlen;		/* writable bytes at p_addr */
91*a98e9dbfSaguzovsk 	struct page		**p_pp;		/* pp shadow list */
92*a98e9dbfSaguzovsk 	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
93*a98e9dbfSaguzovsk 	clock_t			p_lbolt;	/* lbolt from last use */
94*a98e9dbfSaguzovsk 	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
95*a98e9dbfSaguzovsk 	uint_t			p_active;	/* active count */
96*a98e9dbfSaguzovsk 	uchar_t			p_write;	/* true if S_WRITE */
97*a98e9dbfSaguzovsk 	uchar_t			p_ref;		/* reference byte */
98*a98e9dbfSaguzovsk 	ushort_t		p_flags;	/* bit flags */
997c478bd9Sstevel@tonic-gate };
1007c478bd9Sstevel@tonic-gate 
1017c478bd9Sstevel@tonic-gate struct seg_phash {
102*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
103*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hprev;
104*a98e9dbfSaguzovsk 	kmutex_t		p_hmutex;	/* protects hash bucket */
105*a98e9dbfSaguzovsk 	pcache_link_t		p_halink[2];	/* active bucket linkages */
106*a98e9dbfSaguzovsk };
107*a98e9dbfSaguzovsk 
108*a98e9dbfSaguzovsk struct seg_phash_wired {
109*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
110*a98e9dbfSaguzovsk 	struct seg_pcache	*p_hprev;
111*a98e9dbfSaguzovsk 	kmutex_t		p_hmutex;	/* protects hash bucket */
1127c478bd9Sstevel@tonic-gate };
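
/*
 * struct seg_phash_wired mirrors the first three members of struct seg_phash
 * so that a wired bucket can be treated as a struct seg_phash wherever only
 * the hash chain and the bucket mutex are needed. Wired buckets carry no
 * p_halink[] since wired entries are never placed on the active bucket lists.
 */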
1137c478bd9Sstevel@tonic-gate 
114*a98e9dbfSaguzovsk /*
115*a98e9dbfSaguzovsk  * A parameter to control a maximum number of bytes that can be
116*a98e9dbfSaguzovsk  * purged from pcache at a time.
117*a98e9dbfSaguzovsk  */
118*a98e9dbfSaguzovsk #define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)
119*a98e9dbfSaguzovsk 
120*a98e9dbfSaguzovsk /*
121*a98e9dbfSaguzovsk  * log2(fraction of pcache to reclaim at a time).
122*a98e9dbfSaguzovsk  */
123*a98e9dbfSaguzovsk #define	P_SHRINK_SHFT		(5)
124*a98e9dbfSaguzovsk 
125*a98e9dbfSaguzovsk /*
126*a98e9dbfSaguzovsk  * The following variables can be tuned via /etc/system (examples below).
127*a98e9dbfSaguzovsk  */
128*a98e9dbfSaguzovsk 
129*a98e9dbfSaguzovsk int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
130*a98e9dbfSaguzovsk pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
131*a98e9dbfSaguzovsk ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
132*a98e9dbfSaguzovsk ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
133*a98e9dbfSaguzovsk int	segpcache_reap_sec = 1;		/* reap check rate in secs */
134*a98e9dbfSaguzovsk clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
135*a98e9dbfSaguzovsk int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
136*a98e9dbfSaguzovsk clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
137*a98e9dbfSaguzovsk int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
138*a98e9dbfSaguzovsk pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
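
/*
 * Illustrative /etc/system settings for the tunables above. The values shown
 * are arbitrary examples, not recommended defaults:
 *
 *	set segpcache_enabled = 1
 *	set segpcache_maxwindow = 0x20000	(cap the window at 128K pages)
 *	set segpcache_reap_sec = 5
 *	set segpcache_pcp_maxage_sec = 2
 */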
139*a98e9dbfSaguzovsk 
140*a98e9dbfSaguzovsk static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
141*a98e9dbfSaguzovsk static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
142*a98e9dbfSaguzovsk static kcondvar_t seg_pasync_cv;
143*a98e9dbfSaguzovsk 
144*a98e9dbfSaguzovsk #pragma align 64(pctrl1)
145*a98e9dbfSaguzovsk #pragma align 64(pctrl2)
146*a98e9dbfSaguzovsk #pragma align 64(pctrl3)
147*a98e9dbfSaguzovsk 
148*a98e9dbfSaguzovsk /*
149*a98e9dbfSaguzovsk  * Keep frequently used variables together in one cache line.
150*a98e9dbfSaguzovsk  */
151*a98e9dbfSaguzovsk static struct p_ctrl1 {
152*a98e9dbfSaguzovsk 	uint_t p_disabled;		/* if not 0, caching temporarily off */
153*a98e9dbfSaguzovsk 	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
154*a98e9dbfSaguzovsk 	size_t p_hashwin_sz;		/* # of non wired buckets */
155*a98e9dbfSaguzovsk 	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
156*a98e9dbfSaguzovsk 	size_t p_hashwired_sz;		/* # of wired buckets */
157*a98e9dbfSaguzovsk 	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
158*a98e9dbfSaguzovsk 	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
159*a98e9dbfSaguzovsk #ifdef _LP64
160*a98e9dbfSaguzovsk 	ulong_t pad[1];
161*a98e9dbfSaguzovsk #endif /* _LP64 */
162*a98e9dbfSaguzovsk } pctrl1;
163*a98e9dbfSaguzovsk 
164*a98e9dbfSaguzovsk static struct p_ctrl2 {
165*a98e9dbfSaguzovsk 	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
166*a98e9dbfSaguzovsk 	pgcnt_t  p_locked_win;	/* # pages from window */
167*a98e9dbfSaguzovsk 	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
168*a98e9dbfSaguzovsk 	uchar_t	 p_ahcur;	/* current active links for insert/delete */
169*a98e9dbfSaguzovsk 	uchar_t  p_athr_on;	/* async reclaim thread is running. */
170*a98e9dbfSaguzovsk 	pcache_link_t p_ahhead[2]; /* active buckets linkages */
171*a98e9dbfSaguzovsk } pctrl2;
172*a98e9dbfSaguzovsk 
173*a98e9dbfSaguzovsk static struct p_ctrl3 {
174*a98e9dbfSaguzovsk 	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
175*a98e9dbfSaguzovsk 	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
176*a98e9dbfSaguzovsk 	ulong_t p_athr_full_ahb;	/* athread walk stats */
177*a98e9dbfSaguzovsk 	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
178*a98e9dbfSaguzovsk 	int	p_shrink_shft;		/* reap shift factor */
179*a98e9dbfSaguzovsk #ifdef _LP64
180*a98e9dbfSaguzovsk 	ulong_t pad[3];
181*a98e9dbfSaguzovsk #endif /* _LP64 */
182*a98e9dbfSaguzovsk } pctrl3;
183*a98e9dbfSaguzovsk 
184*a98e9dbfSaguzovsk #define	seg_pdisabled			pctrl1.p_disabled
185*a98e9dbfSaguzovsk #define	seg_pmaxwindow			pctrl1.p_maxwin
186*a98e9dbfSaguzovsk #define	seg_phashsize_win		pctrl1.p_hashwin_sz
187*a98e9dbfSaguzovsk #define	seg_phashtab_win		pctrl1.p_htabwin
188*a98e9dbfSaguzovsk #define	seg_phashsize_wired		pctrl1.p_hashwired_sz
189*a98e9dbfSaguzovsk #define	seg_phashtab_wired		pctrl1.p_htabwired
190*a98e9dbfSaguzovsk #define	seg_pkmcache			pctrl1.p_kmcache
191*a98e9dbfSaguzovsk #define	seg_pmem_mtx			pctrl2.p_mem_mtx
192*a98e9dbfSaguzovsk #define	seg_plocked_window		pctrl2.p_locked_win
193*a98e9dbfSaguzovsk #define	seg_plocked			pctrl2.p_locked
194*a98e9dbfSaguzovsk #define	seg_pahcur			pctrl2.p_ahcur
195*a98e9dbfSaguzovsk #define	seg_pathr_on			pctrl2.p_athr_on
196*a98e9dbfSaguzovsk #define	seg_pahhead			pctrl2.p_ahhead
197*a98e9dbfSaguzovsk #define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
198*a98e9dbfSaguzovsk #define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
199*a98e9dbfSaguzovsk #define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
200*a98e9dbfSaguzovsk #define	seg_pshrink_shift		pctrl3.p_shrink_shft
201*a98e9dbfSaguzovsk #define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages
202*a98e9dbfSaguzovsk 
203*a98e9dbfSaguzovsk #define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
204*a98e9dbfSaguzovsk #define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
205*a98e9dbfSaguzovsk #define	P_BASESHIFT			(6)
206*a98e9dbfSaguzovsk 
207*a98e9dbfSaguzovsk kthread_t *seg_pasync_thr;
208*a98e9dbfSaguzovsk 
209*a98e9dbfSaguzovsk extern struct seg_ops segvn_ops;
210*a98e9dbfSaguzovsk extern struct seg_ops segspt_shmops;
211*a98e9dbfSaguzovsk 
212*a98e9dbfSaguzovsk #define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
213*a98e9dbfSaguzovsk #define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
2147c478bd9Sstevel@tonic-gate 
215*a98e9dbfSaguzovsk #define	LBOLT_DELTA(t)	((ulong_t)(lbolt - (t)))
2167c478bd9Sstevel@tonic-gate 
217*a98e9dbfSaguzovsk #define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)
2187c478bd9Sstevel@tonic-gate 
219*a98e9dbfSaguzovsk /*
220*a98e9dbfSaguzovsk  * htag0 argument can be a seg or amp pointer.
221*a98e9dbfSaguzovsk  */
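/*
 * Wired requests (SEGP_FORCE_WIRED) hash on the htag0 pointer alone into the
 * wired table. Non wired requests also fold in the address, shifted either by
 * the segment's page shift or, when SEGP_PSHIFT is set, by the shift encoded
 * in the upper 16 bits of flags.
 */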
222*a98e9dbfSaguzovsk #define	P_HASHBP(seg, htag0, addr, flags)				\
223*a98e9dbfSaguzovsk 	(IS_PFLAGS_WIRED((flags)) ?					\
224*a98e9dbfSaguzovsk 	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
225*a98e9dbfSaguzovsk 	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
226*a98e9dbfSaguzovsk 	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
227*a98e9dbfSaguzovsk 	    (((uintptr_t)(htag0) >> 3) ^				\
228*a98e9dbfSaguzovsk 	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
229*a98e9dbfSaguzovsk 	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))
2307c478bd9Sstevel@tonic-gate 
231*a98e9dbfSaguzovsk /*
232*a98e9dbfSaguzovsk  * htag0 argument can be a seg or amp pointer.
233*a98e9dbfSaguzovsk  */
234*a98e9dbfSaguzovsk #define	P_MATCH(pcp, htag0, addr, len)					\
235*a98e9dbfSaguzovsk 	((pcp)->p_htag0 == (htag0) &&					\
236*a98e9dbfSaguzovsk 	(pcp)->p_addr == (addr) &&					\
237*a98e9dbfSaguzovsk 	(pcp)->p_len >= (len))
2387c478bd9Sstevel@tonic-gate 
239*a98e9dbfSaguzovsk #define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
240*a98e9dbfSaguzovsk 	((pcp)->p_pp == (pp) &&						\
241*a98e9dbfSaguzovsk 	(pcp)->p_htag0 == (htag0) &&					\
242*a98e9dbfSaguzovsk 	(pcp)->p_addr == (addr) &&					\
243*a98e9dbfSaguzovsk 	(pcp)->p_len >= (len))
2447c478bd9Sstevel@tonic-gate 
245*a98e9dbfSaguzovsk #define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
246*a98e9dbfSaguzovsk     offsetof(struct seg_pcache, p_plink)))
2477c478bd9Sstevel@tonic-gate 
248*a98e9dbfSaguzovsk #define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
249*a98e9dbfSaguzovsk     offsetof(struct seg_phash, p_halink[l])))
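
/*
 * plink2pcache() and hlink2phash() recover the containing pcp entry or hash
 * bucket from an embedded pcache_link_t (the usual container-of idiom).
 */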
2507c478bd9Sstevel@tonic-gate 
2517c478bd9Sstevel@tonic-gate /*
252*a98e9dbfSaguzovsk  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
253*a98e9dbfSaguzovsk  * active hash bucket lists. We maintain active bucket lists to reduce the
254*a98e9dbfSaguzovsk  * overhead of finding active buckets during asynchronous purging since there
255*a98e9dbfSaguzovsk  * can be 10s of millions of buckets on a large system but only a small subset
256*a98e9dbfSaguzovsk  * of them in actual use.
257*a98e9dbfSaguzovsk  *
258*a98e9dbfSaguzovsk  * There are two active bucket lists. The current active list (seg_pahcur) is
259*a98e9dbfSaguzovsk  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
260*a98e9dbfSaguzovsk  * buckets. The other list is used by the asynchronous purge thread. This allows
261*a98e9dbfSaguzovsk  * the purge thread to walk its active list without holding seg_pmem_mtx for a
262*a98e9dbfSaguzovsk  * long time. When the asynchronous thread is done with its list, it switches
263*a98e9dbfSaguzovsk  * to the current active list and makes the list it just finished processing
264*a98e9dbfSaguzovsk  * the new current active list.
265*a98e9dbfSaguzovsk  *
266*a98e9dbfSaguzovsk  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
267*a98e9dbfSaguzovsk  * yet on any list.  seg_premove_abuck() may remove the bucket from either
268*a98e9dbfSaguzovsk  * list. If the bucket is on the current list it is always removed. Otherwise
269*a98e9dbfSaguzovsk  * the bucket is only removed if the asynchronous purge thread is not currently
270*a98e9dbfSaguzovsk  * running or seg_premove_abuck() is called by the asynchronous purge thread
271*a98e9dbfSaguzovsk  * itself. A given bucket can only be on one of the active lists at a time. These
272*a98e9dbfSaguzovsk  * routines should be called with per bucket lock held.  The routines use
273*a98e9dbfSaguzovsk  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
274*a98e9dbfSaguzovsk  * the first entry is added to the bucket chain and seg_premove_abuck() must
275*a98e9dbfSaguzovsk  * be called after the last pcp entry is deleted from its chain. Holding the
276*a98e9dbfSaguzovsk  * per bucket lock across these calls avoids a potential race condition where
277*a98e9dbfSaguzovsk  * seg_premove_abuck() removes a bucket after pcp entries are added to its
278*a98e9dbfSaguzovsk  * list after the caller checked that the bucket has no entries (this race
279*a98e9dbfSaguzovsk  * would cause the loss of an active bucket from the active lists).
280*a98e9dbfSaguzovsk  *
281*a98e9dbfSaguzovsk  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
282*a98e9dbfSaguzovsk  * New entries are added to the end of the list since LRU is used as the
283*a98e9dbfSaguzovsk  * purging policy.
284*a98e9dbfSaguzovsk  */
285*a98e9dbfSaguzovsk static void
286*a98e9dbfSaguzovsk seg_padd_abuck(struct seg_phash *hp)
287*a98e9dbfSaguzovsk {
288*a98e9dbfSaguzovsk 	int lix;
289*a98e9dbfSaguzovsk 
290*a98e9dbfSaguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
291*a98e9dbfSaguzovsk 	ASSERT((struct seg_phash *)hp->p_hnext != hp);
292*a98e9dbfSaguzovsk 	ASSERT((struct seg_phash *)hp->p_hprev != hp);
293*a98e9dbfSaguzovsk 	ASSERT(hp->p_hnext == hp->p_hprev);
294*a98e9dbfSaguzovsk 	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
295*a98e9dbfSaguzovsk 	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
296*a98e9dbfSaguzovsk 	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
297*a98e9dbfSaguzovsk 	ASSERT(hp >= seg_phashtab_win &&
298*a98e9dbfSaguzovsk 	    hp < &seg_phashtab_win[seg_phashsize_win]);
299*a98e9dbfSaguzovsk 
300*a98e9dbfSaguzovsk 	/*
301*a98e9dbfSaguzovsk 	 * This bucket can already be on one of the active lists
302*a98e9dbfSaguzovsk 	 * since seg_premove_abuck() may have failed to remove it
303*a98e9dbfSaguzovsk 	 * before.
304*a98e9dbfSaguzovsk 	 */
305*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
306*a98e9dbfSaguzovsk 	lix = seg_pahcur;
307*a98e9dbfSaguzovsk 	ASSERT(lix >= 0 && lix <= 1);
308*a98e9dbfSaguzovsk 	if (hp->p_halink[lix].p_lnext != NULL) {
309*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
310*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
311*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
312*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
313*a98e9dbfSaguzovsk 		return;
314*a98e9dbfSaguzovsk 	}
315*a98e9dbfSaguzovsk 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
316*a98e9dbfSaguzovsk 
317*a98e9dbfSaguzovsk 	/*
318*a98e9dbfSaguzovsk 	 * If this bucket is still on list !lix async thread can't yet remove
319*a98e9dbfSaguzovsk 	 * If this bucket is still on list !lix the async thread can't yet remove
320*a98e9dbfSaguzovsk 	 * it since we hold the per bucket lock here. In this case just return
321*a98e9dbfSaguzovsk 	 */
322*a98e9dbfSaguzovsk 	if (hp->p_halink[!lix].p_lnext != NULL) {
323*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
324*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
325*a98e9dbfSaguzovsk 		return;
326*a98e9dbfSaguzovsk 	}
327*a98e9dbfSaguzovsk 	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
328*a98e9dbfSaguzovsk 	/*
329*a98e9dbfSaguzovsk 	 * This bucket is not on any active bucket list yet.
330*a98e9dbfSaguzovsk 	 * Add the bucket to the tail of current active list.
331*a98e9dbfSaguzovsk 	 */
332*a98e9dbfSaguzovsk 	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
333*a98e9dbfSaguzovsk 	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
334*a98e9dbfSaguzovsk 	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
335*a98e9dbfSaguzovsk 	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
336*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
337*a98e9dbfSaguzovsk }
338*a98e9dbfSaguzovsk 
339*a98e9dbfSaguzovsk static void
340*a98e9dbfSaguzovsk seg_premove_abuck(struct seg_phash *hp, int athr)
341*a98e9dbfSaguzovsk {
342*a98e9dbfSaguzovsk 	int lix;
343*a98e9dbfSaguzovsk 
344*a98e9dbfSaguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
345*a98e9dbfSaguzovsk 	ASSERT((struct seg_phash *)hp->p_hnext == hp);
346*a98e9dbfSaguzovsk 	ASSERT((struct seg_phash *)hp->p_hprev == hp);
347*a98e9dbfSaguzovsk 	ASSERT(hp >= seg_phashtab_win &&
348*a98e9dbfSaguzovsk 	    hp < &seg_phashtab_win[seg_phashsize_win]);
349*a98e9dbfSaguzovsk 
350*a98e9dbfSaguzovsk 	if (athr) {
351*a98e9dbfSaguzovsk 		ASSERT(seg_pathr_on);
352*a98e9dbfSaguzovsk 		ASSERT(seg_pahcur <= 1);
353*a98e9dbfSaguzovsk 		/*
354*a98e9dbfSaguzovsk 		 * We are called by the asynchronous thread that found this bucket
355*a98e9dbfSaguzovsk 		 * on the not currently active (i.e. !seg_pahcur) list. Remove it
356*a98e9dbfSaguzovsk 		 * from there.  The per bucket lock we are holding makes sure
357*a98e9dbfSaguzovsk 		 * seg_pinsert() can't sneak in and add pcp entries to this
358*a98e9dbfSaguzovsk 		 * bucket right before we remove the bucket from its list.
359*a98e9dbfSaguzovsk 		 */
360*a98e9dbfSaguzovsk 		lix = !seg_pahcur;
361*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[lix].p_lnext != NULL);
362*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
363*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
364*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
365*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
366*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
367*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext = NULL;
368*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev = NULL;
369*a98e9dbfSaguzovsk 		return;
370*a98e9dbfSaguzovsk 	}
371*a98e9dbfSaguzovsk 
372*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
373*a98e9dbfSaguzovsk 	lix = seg_pahcur;
374*a98e9dbfSaguzovsk 	ASSERT(lix >= 0 && lix <= 1);
375*a98e9dbfSaguzovsk 
376*a98e9dbfSaguzovsk 	/*
377*a98e9dbfSaguzovsk 	 * If the bucket is on currently active list just remove it from
378*a98e9dbfSaguzovsk 	 * there.
379*a98e9dbfSaguzovsk 	 */
380*a98e9dbfSaguzovsk 	if (hp->p_halink[lix].p_lnext != NULL) {
381*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
382*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
383*a98e9dbfSaguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
384*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
385*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
386*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext = NULL;
387*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev = NULL;
388*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
389*a98e9dbfSaguzovsk 		return;
390*a98e9dbfSaguzovsk 	}
391*a98e9dbfSaguzovsk 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
392*a98e9dbfSaguzovsk 
393*a98e9dbfSaguzovsk 	/*
394*a98e9dbfSaguzovsk 	 * If the asynchronous thread is not running we can remove the bucket from
395*a98e9dbfSaguzovsk 	 * the not currently active list. The bucket must be on this list since we
396*a98e9dbfSaguzovsk 	 * already checked that it's not on the other list and the bucket from
397*a98e9dbfSaguzovsk 	 * which we just deleted the last pcp entry must still be on one of the
398*a98e9dbfSaguzovsk 	 * active bucket lists.
399*a98e9dbfSaguzovsk 	 */
400*a98e9dbfSaguzovsk 	lix = !lix;
401*a98e9dbfSaguzovsk 	ASSERT(hp->p_halink[lix].p_lnext != NULL);
402*a98e9dbfSaguzovsk 	ASSERT(hp->p_halink[lix].p_lprev != NULL);
403*a98e9dbfSaguzovsk 
404*a98e9dbfSaguzovsk 	if (!seg_pathr_on) {
405*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
406*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
407*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lnext = NULL;
408*a98e9dbfSaguzovsk 		hp->p_halink[lix].p_lprev = NULL;
409*a98e9dbfSaguzovsk 	}
410*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
411*a98e9dbfSaguzovsk }
412*a98e9dbfSaguzovsk 
413*a98e9dbfSaguzovsk /*
414*a98e9dbfSaguzovsk  * Check if the bucket pointed to by hp already has a pcp entry that matches
415*a98e9dbfSaguzovsk  * request htag0, addr and len. Set *found to 1 if a match is found and to 0
416*a98e9dbfSaguzovsk  * otherwise. Also delete matching entries that cover a smaller address range
417*a98e9dbfSaguzovsk  * but start at the same address as addr. Return the list of deleted entries
418*a98e9dbfSaguzovsk  * if any. This is an internal helper called from seg_pinsert() only
419*a98e9dbfSaguzovsk  * for non wired shadow lists. The caller already holds a per seg/amp list
420*a98e9dbfSaguzovsk  * lock.
421*a98e9dbfSaguzovsk  */
422*a98e9dbfSaguzovsk static struct seg_pcache *
423*a98e9dbfSaguzovsk seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
424*a98e9dbfSaguzovsk     caddr_t addr, size_t len, int *found)
425*a98e9dbfSaguzovsk {
426*a98e9dbfSaguzovsk 	struct seg_pcache *pcp;
427*a98e9dbfSaguzovsk 	struct seg_pcache *delcallb_list = NULL;
428*a98e9dbfSaguzovsk 
429*a98e9dbfSaguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
430*a98e9dbfSaguzovsk 
431*a98e9dbfSaguzovsk 	*found = 0;
432*a98e9dbfSaguzovsk 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
433*a98e9dbfSaguzovsk 	    pcp = pcp->p_hnext) {
434*a98e9dbfSaguzovsk 		ASSERT(pcp->p_hashp == hp);
435*a98e9dbfSaguzovsk 		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
436*a98e9dbfSaguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
437*a98e9dbfSaguzovsk 			if (pcp->p_len < len) {
438*a98e9dbfSaguzovsk 				pcache_link_t *plinkp;
439*a98e9dbfSaguzovsk 				if (pcp->p_active) {
440*a98e9dbfSaguzovsk 					continue;
441*a98e9dbfSaguzovsk 				}
442*a98e9dbfSaguzovsk 				plinkp = &pcp->p_plink;
443*a98e9dbfSaguzovsk 				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
444*a98e9dbfSaguzovsk 				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
445*a98e9dbfSaguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
446*a98e9dbfSaguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
447*a98e9dbfSaguzovsk 				pcp->p_hprev = delcallb_list;
448*a98e9dbfSaguzovsk 				delcallb_list = pcp;
449*a98e9dbfSaguzovsk 			} else {
450*a98e9dbfSaguzovsk 				*found = 1;
451*a98e9dbfSaguzovsk 				break;
452*a98e9dbfSaguzovsk 			}
453*a98e9dbfSaguzovsk 		}
454*a98e9dbfSaguzovsk 	}
455*a98e9dbfSaguzovsk 	return (delcallb_list);
456*a98e9dbfSaguzovsk }
457*a98e9dbfSaguzovsk 
458*a98e9dbfSaguzovsk /*
459*a98e9dbfSaguzovsk  * Look up an address range in the pagelock cache. Return the shadow list and
460*a98e9dbfSaguzovsk  * bump up the active count. If amp is not NULL use amp as the lookup tag,
461*a98e9dbfSaguzovsk  * otherwise use seg as the lookup tag.
4627c478bd9Sstevel@tonic-gate  */
4637c478bd9Sstevel@tonic-gate struct page **
464*a98e9dbfSaguzovsk seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
465*a98e9dbfSaguzovsk     enum seg_rw rw, uint_t flags)
4667c478bd9Sstevel@tonic-gate {
4677c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
4687c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
469*a98e9dbfSaguzovsk 	void *htag0;
470*a98e9dbfSaguzovsk 
471*a98e9dbfSaguzovsk 	ASSERT(seg != NULL);
472*a98e9dbfSaguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
4737c478bd9Sstevel@tonic-gate 
4747c478bd9Sstevel@tonic-gate 	/*
4757c478bd9Sstevel@tonic-gate 	 * Skip the pagelock cache while DR is in progress or
4767c478bd9Sstevel@tonic-gate 	 * seg_pcache is off.
4777c478bd9Sstevel@tonic-gate 	 */
478*a98e9dbfSaguzovsk 	if (seg_pdisabled) {
4797c478bd9Sstevel@tonic-gate 		return (NULL);
4807c478bd9Sstevel@tonic-gate 	}
481*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
4827c478bd9Sstevel@tonic-gate 
483*a98e9dbfSaguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
484*a98e9dbfSaguzovsk 	hp = P_HASHBP(seg, htag0, addr, flags);
4857c478bd9Sstevel@tonic-gate 	mutex_enter(&hp->p_hmutex);
4867c478bd9Sstevel@tonic-gate 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
4877c478bd9Sstevel@tonic-gate 	    pcp = pcp->p_hnext) {
488*a98e9dbfSaguzovsk 		ASSERT(pcp->p_hashp == hp);
489*a98e9dbfSaguzovsk 		if (P_MATCH(pcp, htag0, addr, len)) {
490*a98e9dbfSaguzovsk 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
491*a98e9dbfSaguzovsk 			/*
492*a98e9dbfSaguzovsk 			 * If this request wants to write pages
493*a98e9dbfSaguzovsk 			 * but write permissions starting from
494*a98e9dbfSaguzovsk 			 * addr don't cover the entire length len
495*a98e9dbfSaguzovsk 			 * return lookup failure to the caller.
496*a98e9dbfSaguzovsk 			 * The caller will check protections and fail this
497*a98e9dbfSaguzovsk 			 * pagelock operation with an EACCES error.
498*a98e9dbfSaguzovsk 			 */
499*a98e9dbfSaguzovsk 			if (rw == S_WRITE && pcp->p_wlen < len) {
500*a98e9dbfSaguzovsk 				break;
501*a98e9dbfSaguzovsk 			}
502*a98e9dbfSaguzovsk 			if (pcp->p_active == UINT_MAX) {
503*a98e9dbfSaguzovsk 				break;
504*a98e9dbfSaguzovsk 			}
5057c478bd9Sstevel@tonic-gate 			pcp->p_active++;
506*a98e9dbfSaguzovsk 			if (rw == S_WRITE && !pcp->p_write) {
507*a98e9dbfSaguzovsk 				pcp->p_write = 1;
508*a98e9dbfSaguzovsk 			}
5097c478bd9Sstevel@tonic-gate 			mutex_exit(&hp->p_hmutex);
5107c478bd9Sstevel@tonic-gate 			return (pcp->p_pp);
5117c478bd9Sstevel@tonic-gate 		}
5127c478bd9Sstevel@tonic-gate 	}
5137c478bd9Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
5147c478bd9Sstevel@tonic-gate 	return (NULL);
5157c478bd9Sstevel@tonic-gate }
5167c478bd9Sstevel@tonic-gate 
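/*
 * Sketch of the expected caller pattern (simplified and hypothetical; the
 * real consumers are the segvn and segspt pagelock entry points, which differ
 * in detail). "reclaim" stands for the driver's seg_preclaim_cbfunc_t and
 * flags of 0 denote the non wired case:
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *	if (pplist != NULL)
 *		return (pplist);	(cache hit, p_active was bumped)
 *	... lock the pages and build the shadow list pplist ...
 *	(void) seg_pinsert(seg, amp, addr, len, wlen, pplist, rw, 0, reclaim);
 *	...
 *	... later, when the caller is done with the locked range ...
 *	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, reclaim);
 */
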
5177c478bd9Sstevel@tonic-gate /*
518*a98e9dbfSaguzovsk  * Mark an address range inactive. If the cache is off, the address range is
519*a98e9dbfSaguzovsk  * not in the cache, or another shadow list that covers a bigger range is
520*a98e9dbfSaguzovsk  * found, we call the segment driver to reclaim the pages. Otherwise just
521*a98e9dbfSaguzovsk  * decrement the active count and set the ref bit.  If amp is not NULL use amp
522*a98e9dbfSaguzovsk  * as the lookup tag, otherwise use seg as the lookup tag.
5237c478bd9Sstevel@tonic-gate  */
5247c478bd9Sstevel@tonic-gate void
525*a98e9dbfSaguzovsk seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
526*a98e9dbfSaguzovsk     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
527*a98e9dbfSaguzovsk     seg_preclaim_cbfunc_t callback)
5287c478bd9Sstevel@tonic-gate {
5297c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
5307c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
531*a98e9dbfSaguzovsk 	kmutex_t *pmtx = NULL;
532*a98e9dbfSaguzovsk 	pcache_link_t *pheadp;
533*a98e9dbfSaguzovsk 	void *htag0;
534*a98e9dbfSaguzovsk 	pgcnt_t npages = 0;
535*a98e9dbfSaguzovsk 	int keep = 0;
5367c478bd9Sstevel@tonic-gate 
537*a98e9dbfSaguzovsk 	ASSERT(seg != NULL);
538*a98e9dbfSaguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
539*a98e9dbfSaguzovsk 
540*a98e9dbfSaguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
541*a98e9dbfSaguzovsk 
542*a98e9dbfSaguzovsk 	/*
543*a98e9dbfSaguzovsk 	 * Skip lookup if pcache is not configured.
544*a98e9dbfSaguzovsk 	 */
545*a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
546*a98e9dbfSaguzovsk 		goto out;
547*a98e9dbfSaguzovsk 	}
548*a98e9dbfSaguzovsk 
549*a98e9dbfSaguzovsk 	/*
550*a98e9dbfSaguzovsk 	 * Grab per seg/amp lock before hash lock if we are going to remove
551*a98e9dbfSaguzovsk 	 * inactive entry from pcache.
552*a98e9dbfSaguzovsk 	 */
553*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
554*a98e9dbfSaguzovsk 		if (amp == NULL) {
555*a98e9dbfSaguzovsk 			pheadp = &seg->s_phead;
556*a98e9dbfSaguzovsk 			pmtx = &seg->s_pmtx;
557*a98e9dbfSaguzovsk 		} else {
558*a98e9dbfSaguzovsk 			pheadp = &amp->a_phead;
559*a98e9dbfSaguzovsk 			pmtx = &amp->a_pmtx;
560*a98e9dbfSaguzovsk 		}
561*a98e9dbfSaguzovsk 		mutex_enter(pmtx);
5627c478bd9Sstevel@tonic-gate 	}
563*a98e9dbfSaguzovsk 
564*a98e9dbfSaguzovsk 	hp = P_HASHBP(seg, htag0, addr, flags);
5657c478bd9Sstevel@tonic-gate 	mutex_enter(&hp->p_hmutex);
566*a98e9dbfSaguzovsk again:
5677c478bd9Sstevel@tonic-gate 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
5687c478bd9Sstevel@tonic-gate 	    pcp = pcp->p_hnext) {
569*a98e9dbfSaguzovsk 		ASSERT(pcp->p_hashp == hp);
570*a98e9dbfSaguzovsk 		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
571*a98e9dbfSaguzovsk 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
572*a98e9dbfSaguzovsk 			ASSERT(pcp->p_active);
573*a98e9dbfSaguzovsk 			if (keep) {
574*a98e9dbfSaguzovsk 				/*
575*a98e9dbfSaguzovsk 				 * Don't remove this pcp entry
576*a98e9dbfSaguzovsk 				 * if we didn't find duplicate
577*a98e9dbfSaguzovsk 				 * shadow lists on the second search.
578*a98e9dbfSaguzovsk 				 * Somebody removed those duplicates
579*a98e9dbfSaguzovsk 				 * since we dropped the hash lock after
580*a98e9dbfSaguzovsk 				 * the first search.
581*a98e9dbfSaguzovsk 				 */
582*a98e9dbfSaguzovsk 				ASSERT(pmtx != NULL);
583*a98e9dbfSaguzovsk 				ASSERT(!IS_PFLAGS_WIRED(flags));
584*a98e9dbfSaguzovsk 				mutex_exit(pmtx);
585*a98e9dbfSaguzovsk 				pmtx = NULL;
586*a98e9dbfSaguzovsk 			}
5877c478bd9Sstevel@tonic-gate 			pcp->p_active--;
588*a98e9dbfSaguzovsk 			if (pcp->p_active == 0 && (pmtx != NULL ||
589*a98e9dbfSaguzovsk 			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
590*a98e9dbfSaguzovsk 
591*a98e9dbfSaguzovsk 				/*
592*a98e9dbfSaguzovsk 				 * This entry is no longer active.  Remove it
593*a98e9dbfSaguzovsk 				 * now either because pcaching is temporarily
594*a98e9dbfSaguzovsk 				 * disabled or there are other pcp entries that
595*a98e9dbfSaguzovsk 				 * can match this pagelock request (i.e. this
596*a98e9dbfSaguzovsk 				 * entry is a duplicate).
597*a98e9dbfSaguzovsk 				 */
5987c478bd9Sstevel@tonic-gate 
5997c478bd9Sstevel@tonic-gate 				ASSERT(callback == pcp->p_callback);
600*a98e9dbfSaguzovsk 				if (pmtx != NULL) {
601*a98e9dbfSaguzovsk 					pcache_link_t *plinkp = &pcp->p_plink;
602*a98e9dbfSaguzovsk 					ASSERT(!IS_PCP_WIRED(pcp));
603*a98e9dbfSaguzovsk 					ASSERT(pheadp->p_lnext != pheadp);
604*a98e9dbfSaguzovsk 					ASSERT(pheadp->p_lprev != pheadp);
605*a98e9dbfSaguzovsk 					plinkp->p_lprev->p_lnext =
606*a98e9dbfSaguzovsk 					    plinkp->p_lnext;
607*a98e9dbfSaguzovsk 					plinkp->p_lnext->p_lprev =
608*a98e9dbfSaguzovsk 					    plinkp->p_lprev;
609*a98e9dbfSaguzovsk 				}
6107c478bd9Sstevel@tonic-gate 				pcp->p_hprev->p_hnext = pcp->p_hnext;
6117c478bd9Sstevel@tonic-gate 				pcp->p_hnext->p_hprev = pcp->p_hprev;
612*a98e9dbfSaguzovsk 				if (!IS_PCP_WIRED(pcp) &&
613*a98e9dbfSaguzovsk 				    hp->p_hnext == (struct seg_pcache *)hp) {
614*a98e9dbfSaguzovsk 					/*
615*a98e9dbfSaguzovsk 					 * We removed the last entry from this
616*a98e9dbfSaguzovsk 					 * bucket.  Now remove the bucket from
617*a98e9dbfSaguzovsk 					 * its active list.
618*a98e9dbfSaguzovsk 					 */
619*a98e9dbfSaguzovsk 					seg_premove_abuck(hp, 0);
620*a98e9dbfSaguzovsk 				}
6217c478bd9Sstevel@tonic-gate 				mutex_exit(&hp->p_hmutex);
622*a98e9dbfSaguzovsk 				if (pmtx != NULL) {
623*a98e9dbfSaguzovsk 					mutex_exit(pmtx);
6247c478bd9Sstevel@tonic-gate 				}
625*a98e9dbfSaguzovsk 				len = pcp->p_len;
626*a98e9dbfSaguzovsk 				npages = btop(len);
627*a98e9dbfSaguzovsk 				if (rw != S_WRITE && pcp->p_write) {
628*a98e9dbfSaguzovsk 					rw = S_WRITE;
629*a98e9dbfSaguzovsk 				}
630*a98e9dbfSaguzovsk 				kmem_cache_free(seg_pkmcache, pcp);
6317c478bd9Sstevel@tonic-gate 				goto out;
632*a98e9dbfSaguzovsk 			} else {
633*a98e9dbfSaguzovsk 				/*
634*a98e9dbfSaguzovsk 				 * We found a matching pcp entry but will not
635*a98e9dbfSaguzovsk 				 * free it right away even if it's no longer
636*a98e9dbfSaguzovsk 				 * active.
637*a98e9dbfSaguzovsk 				 */
638*a98e9dbfSaguzovsk 				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
639*a98e9dbfSaguzovsk 					/*
640*a98e9dbfSaguzovsk 					 * Set the reference bit and mark the
641*a98e9dbfSaguzovsk 					 * time of last access to this pcp
642*a98e9dbfSaguzovsk 					 * so that the asynchronous thread doesn't
643*a98e9dbfSaguzovsk 					 * free it immediately since
644*a98e9dbfSaguzovsk 					 * it may be reactivated very soon.
645*a98e9dbfSaguzovsk 					 */
646*a98e9dbfSaguzovsk 					pcp->p_lbolt = lbolt;
647*a98e9dbfSaguzovsk 					pcp->p_ref = 1;
648*a98e9dbfSaguzovsk 				}
649*a98e9dbfSaguzovsk 				mutex_exit(&hp->p_hmutex);
650*a98e9dbfSaguzovsk 				if (pmtx != NULL) {
651*a98e9dbfSaguzovsk 					mutex_exit(pmtx);
652*a98e9dbfSaguzovsk 				}
653*a98e9dbfSaguzovsk 				return;
654*a98e9dbfSaguzovsk 			}
655*a98e9dbfSaguzovsk 		} else if (!IS_PFLAGS_WIRED(flags) &&
656*a98e9dbfSaguzovsk 		    P_MATCH(pcp, htag0, addr, len)) {
657*a98e9dbfSaguzovsk 			/*
658*a98e9dbfSaguzovsk 			 * This is a duplicate pcp entry.  This situation may
659*a98e9dbfSaguzovsk 			 * happen if a bigger shadow list that covers our
660*a98e9dbfSaguzovsk 			 * range was added while our entry was still active.
661*a98e9dbfSaguzovsk 			 * Now we can free our pcp entry if it becomes
662*a98e9dbfSaguzovsk 			 * inactive.
663*a98e9dbfSaguzovsk 			 */
664*a98e9dbfSaguzovsk 			if (!pcp->p_active) {
665*a98e9dbfSaguzovsk 				/*
666*a98e9dbfSaguzovsk 				 * Mark this entry as referenced just in case
667*a98e9dbfSaguzovsk 				 * we'll free our own pcp entry soon.
668*a98e9dbfSaguzovsk 				 */
669*a98e9dbfSaguzovsk 				pcp->p_lbolt = lbolt;
670*a98e9dbfSaguzovsk 				pcp->p_ref = 1;
671*a98e9dbfSaguzovsk 			}
672*a98e9dbfSaguzovsk 			if (pmtx != NULL) {
673*a98e9dbfSaguzovsk 				/*
674*a98e9dbfSaguzovsk 				 * we are already holding pmtx and found a
675*a98e9dbfSaguzovsk 				 * duplicate.  Don't keep our own pcp entry.
676*a98e9dbfSaguzovsk 				 */
677*a98e9dbfSaguzovsk 				keep = 0;
678*a98e9dbfSaguzovsk 				continue;
679*a98e9dbfSaguzovsk 			}
680*a98e9dbfSaguzovsk 			/*
681*a98e9dbfSaguzovsk 			 * We have to use mutex_tryenter to attempt to take the
682*a98e9dbfSaguzovsk 			 * seg/amp list lock since we already hold the hash lock
683*a98e9dbfSaguzovsk 			 * and the seg/amp list lock is above the hash lock in
684*a98e9dbfSaguzovsk 			 * lock order.  If mutex_tryenter fails, drop the hash
685*a98e9dbfSaguzovsk 			 * lock, retake both locks in the correct order and
686*a98e9dbfSaguzovsk 			 * re-search this hash chain.
687*a98e9dbfSaguzovsk 			 */
688*a98e9dbfSaguzovsk 			ASSERT(keep == 0);
689*a98e9dbfSaguzovsk 			if (amp == NULL) {
690*a98e9dbfSaguzovsk 				pheadp = &seg->s_phead;
691*a98e9dbfSaguzovsk 				pmtx = &seg->s_pmtx;
692*a98e9dbfSaguzovsk 			} else {
693*a98e9dbfSaguzovsk 				pheadp = &amp->a_phead;
694*a98e9dbfSaguzovsk 				pmtx = &amp->a_pmtx;
695*a98e9dbfSaguzovsk 			}
696*a98e9dbfSaguzovsk 			if (!mutex_tryenter(pmtx)) {
697*a98e9dbfSaguzovsk 				mutex_exit(&hp->p_hmutex);
698*a98e9dbfSaguzovsk 				mutex_enter(pmtx);
699*a98e9dbfSaguzovsk 				mutex_enter(&hp->p_hmutex);
700*a98e9dbfSaguzovsk 				/*
701*a98e9dbfSaguzovsk 				 * If we don't find a bigger shadow list on
702*a98e9dbfSaguzovsk 				 * the second search (which may happen since
703*a98e9dbfSaguzovsk 				 * we dropped the bucket lock) keep the entry
704*a98e9dbfSaguzovsk 				 * that matches our own shadow list.
705*a98e9dbfSaguzovsk 				 */
706*a98e9dbfSaguzovsk 				keep = 1;
707*a98e9dbfSaguzovsk 				goto again;
7087c478bd9Sstevel@tonic-gate 			}
7097c478bd9Sstevel@tonic-gate 		}
7107c478bd9Sstevel@tonic-gate 	}
7117c478bd9Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
712*a98e9dbfSaguzovsk 	if (pmtx != NULL) {
713*a98e9dbfSaguzovsk 		mutex_exit(pmtx);
714*a98e9dbfSaguzovsk 	}
7157c478bd9Sstevel@tonic-gate out:
716*a98e9dbfSaguzovsk 	(*callback)(htag0, addr, len, pp, rw, 0);
717*a98e9dbfSaguzovsk 	if (npages) {
718*a98e9dbfSaguzovsk 		mutex_enter(&seg_pmem_mtx);
719*a98e9dbfSaguzovsk 		ASSERT(seg_plocked >= npages);
720*a98e9dbfSaguzovsk 		seg_plocked -= npages;
721*a98e9dbfSaguzovsk 		if (!IS_PFLAGS_WIRED(flags)) {
722*a98e9dbfSaguzovsk 			ASSERT(seg_plocked_window >= npages);
723*a98e9dbfSaguzovsk 			seg_plocked_window -= npages;
724*a98e9dbfSaguzovsk 		}
725*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
726*a98e9dbfSaguzovsk 	}
727*a98e9dbfSaguzovsk 
7287c478bd9Sstevel@tonic-gate }
7297c478bd9Sstevel@tonic-gate 
730*a98e9dbfSaguzovsk #ifdef DEBUG
731*a98e9dbfSaguzovsk static uint32_t p_insert_chk_mtbf = 0;
732*a98e9dbfSaguzovsk #endif
733*a98e9dbfSaguzovsk 
7347c478bd9Sstevel@tonic-gate /*
7357c478bd9Sstevel@tonic-gate  * The seg_pinsert_check() is used by segment drivers to predict whether
7367c478bd9Sstevel@tonic-gate  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
7377c478bd9Sstevel@tonic-gate  */
738*a98e9dbfSaguzovsk /*ARGSUSED*/
7397c478bd9Sstevel@tonic-gate int
740*a98e9dbfSaguzovsk seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
741*a98e9dbfSaguzovsk     size_t len, uint_t flags)
7427c478bd9Sstevel@tonic-gate {
743*a98e9dbfSaguzovsk 	ASSERT(seg != NULL);
7447c478bd9Sstevel@tonic-gate 
745*a98e9dbfSaguzovsk #ifdef DEBUG
746*a98e9dbfSaguzovsk 	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
7477c478bd9Sstevel@tonic-gate 		return (SEGP_FAIL);
7487c478bd9Sstevel@tonic-gate 	}
749*a98e9dbfSaguzovsk #endif
750*a98e9dbfSaguzovsk 
751*a98e9dbfSaguzovsk 	if (seg_pdisabled) {
7527c478bd9Sstevel@tonic-gate 		return (SEGP_FAIL);
7537c478bd9Sstevel@tonic-gate 	}
754*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
755*a98e9dbfSaguzovsk 
756*a98e9dbfSaguzovsk 	if (IS_PFLAGS_WIRED(flags)) {
757*a98e9dbfSaguzovsk 		return (SEGP_SUCCESS);
758*a98e9dbfSaguzovsk 	}
759*a98e9dbfSaguzovsk 
760*a98e9dbfSaguzovsk 	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
7617c478bd9Sstevel@tonic-gate 		return (SEGP_FAIL);
7627c478bd9Sstevel@tonic-gate 	}
7637c478bd9Sstevel@tonic-gate 
764*a98e9dbfSaguzovsk 	if (freemem < desfree) {
765*a98e9dbfSaguzovsk 		return (SEGP_FAIL);
7667c478bd9Sstevel@tonic-gate 	}
767*a98e9dbfSaguzovsk 
7687c478bd9Sstevel@tonic-gate 	return (SEGP_SUCCESS);
7697c478bd9Sstevel@tonic-gate }
7707c478bd9Sstevel@tonic-gate 
771*a98e9dbfSaguzovsk #ifdef DEBUG
772*a98e9dbfSaguzovsk static uint32_t p_insert_mtbf = 0;
773*a98e9dbfSaguzovsk #endif
7747c478bd9Sstevel@tonic-gate 
7757c478bd9Sstevel@tonic-gate /*
776*a98e9dbfSaguzovsk  * Insert an address range with its shadow list into the pagelock cache if no
777*a98e9dbfSaguzovsk  * shadow list is already cached for this address range. If the cache is off or
778*a98e9dbfSaguzovsk  * caching is temporarily disabled or the allowed 'window' is exceeded return
779*a98e9dbfSaguzovsk  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
780*a98e9dbfSaguzovsk  *
781*a98e9dbfSaguzovsk  * For non wired shadow lists (segvn case) include the address in the hashing
782*a98e9dbfSaguzovsk  * function to avoid linking all the entries from the same segment or amp onto
783*a98e9dbfSaguzovsk  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
784*a98e9dbfSaguzovsk  * pcache entries are also linked on a per segment/amp list so that all
785*a98e9dbfSaguzovsk  * entries can be found quickly during seg/amp purge without walking the
786*a98e9dbfSaguzovsk  * entire pcache hash table.  For wired shadow lists (segspt case) we
787*a98e9dbfSaguzovsk  * don't use address hashing and per segment linking because the caller
788*a98e9dbfSaguzovsk  * currently inserts only one entry per segment that covers the entire
789*a98e9dbfSaguzovsk  * segment. If we used per segment linking even for segspt it would complicate
790*a98e9dbfSaguzovsk  * seg_ppurge_wiredpp() locking.
791*a98e9dbfSaguzovsk  *
792*a98e9dbfSaguzovsk  * Both hash bucket and per seg/amp locks need to be held before adding a non
793*a98e9dbfSaguzovsk  * wired entry to the hash and per seg/amp lists. The per seg/amp lock should
794*a98e9dbfSaguzovsk  * be taken first.
795*a98e9dbfSaguzovsk  *
796*a98e9dbfSaguzovsk  * This function will also remove from pcache old inactive shadow lists that
797*a98e9dbfSaguzovsk  * overlap with this request but cover smaller range for the same start
798*a98e9dbfSaguzovsk  * address.
7997c478bd9Sstevel@tonic-gate  */
8007c478bd9Sstevel@tonic-gate int
801*a98e9dbfSaguzovsk seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
802*a98e9dbfSaguzovsk     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
803*a98e9dbfSaguzovsk     seg_preclaim_cbfunc_t callback)
8047c478bd9Sstevel@tonic-gate {
8057c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
8067c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
8077c478bd9Sstevel@tonic-gate 	pgcnt_t npages;
808*a98e9dbfSaguzovsk 	pcache_link_t *pheadp;
809*a98e9dbfSaguzovsk 	kmutex_t *pmtx;
810*a98e9dbfSaguzovsk 	struct seg_pcache *delcallb_list = NULL;
8117c478bd9Sstevel@tonic-gate 
812*a98e9dbfSaguzovsk 	ASSERT(seg != NULL);
813*a98e9dbfSaguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
814*a98e9dbfSaguzovsk 	ASSERT(rw == S_READ || wlen == len);
815*a98e9dbfSaguzovsk 	ASSERT(rw == S_WRITE || wlen <= len);
816*a98e9dbfSaguzovsk 	ASSERT(amp == NULL || wlen == len);
817*a98e9dbfSaguzovsk 
818*a98e9dbfSaguzovsk #ifdef DEBUG
819*a98e9dbfSaguzovsk 	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
8207c478bd9Sstevel@tonic-gate 		return (SEGP_FAIL);
8217c478bd9Sstevel@tonic-gate 	}
822*a98e9dbfSaguzovsk #endif
823*a98e9dbfSaguzovsk 
824*a98e9dbfSaguzovsk 	if (seg_pdisabled) {
8257c478bd9Sstevel@tonic-gate 		return (SEGP_FAIL);
8267c478bd9Sstevel@tonic-gate 	}
827*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
828*a98e9dbfSaguzovsk 
8297c478bd9Sstevel@tonic-gate 	ASSERT((len & PAGEOFFSET) == 0);
830*a98e9dbfSaguzovsk 	npages = btop(len);
831*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
832*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
833*a98e9dbfSaguzovsk 		if (seg_plocked_window + npages > seg_pmaxwindow) {
834*a98e9dbfSaguzovsk 			mutex_exit(&seg_pmem_mtx);
8357c478bd9Sstevel@tonic-gate 			return (SEGP_FAIL);
8367c478bd9Sstevel@tonic-gate 		}
837*a98e9dbfSaguzovsk 		seg_plocked_window += npages;
8387c478bd9Sstevel@tonic-gate 	}
8397c478bd9Sstevel@tonic-gate 	seg_plocked += npages;
840*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
8417c478bd9Sstevel@tonic-gate 
842*a98e9dbfSaguzovsk 	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
843*a98e9dbfSaguzovsk 	/*
844*a98e9dbfSaguzovsk 	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
845*a98e9dbfSaguzovsk 	 */
846*a98e9dbfSaguzovsk 	if (amp == NULL) {
847*a98e9dbfSaguzovsk 		pcp->p_htag0 = (void *)seg;
848*a98e9dbfSaguzovsk 		pcp->p_flags = flags & 0xffff;
849*a98e9dbfSaguzovsk 	} else {
850*a98e9dbfSaguzovsk 		pcp->p_htag0 = (void *)amp;
851*a98e9dbfSaguzovsk 		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
852*a98e9dbfSaguzovsk 	}
8537c478bd9Sstevel@tonic-gate 	pcp->p_addr = addr;
8547c478bd9Sstevel@tonic-gate 	pcp->p_len = len;
855*a98e9dbfSaguzovsk 	pcp->p_wlen = wlen;
8567c478bd9Sstevel@tonic-gate 	pcp->p_pp = pp;
857*a98e9dbfSaguzovsk 	pcp->p_write = (rw == S_WRITE);
8587c478bd9Sstevel@tonic-gate 	pcp->p_callback = callback;
8597c478bd9Sstevel@tonic-gate 	pcp->p_active = 1;
8607c478bd9Sstevel@tonic-gate 
861*a98e9dbfSaguzovsk 	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
862*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
863*a98e9dbfSaguzovsk 		int found;
864*a98e9dbfSaguzovsk 		void *htag0;
865*a98e9dbfSaguzovsk 		if (amp == NULL) {
866*a98e9dbfSaguzovsk 			pheadp = &seg->s_phead;
867*a98e9dbfSaguzovsk 			pmtx = &seg->s_pmtx;
868*a98e9dbfSaguzovsk 			htag0 = (void *)seg;
869*a98e9dbfSaguzovsk 		} else {
870*a98e9dbfSaguzovsk 			pheadp = &amp->a_phead;
871*a98e9dbfSaguzovsk 			pmtx = &amp->a_pmtx;
872*a98e9dbfSaguzovsk 			htag0 = (void *)amp;
873*a98e9dbfSaguzovsk 		}
874*a98e9dbfSaguzovsk 		mutex_enter(pmtx);
875*a98e9dbfSaguzovsk 		mutex_enter(&hp->p_hmutex);
876*a98e9dbfSaguzovsk 		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
877*a98e9dbfSaguzovsk 		    len, &found);
878*a98e9dbfSaguzovsk 		if (found) {
879*a98e9dbfSaguzovsk 			mutex_exit(&hp->p_hmutex);
880*a98e9dbfSaguzovsk 			mutex_exit(pmtx);
881*a98e9dbfSaguzovsk 			mutex_enter(&seg_pmem_mtx);
882*a98e9dbfSaguzovsk 			seg_plocked -= npages;
883*a98e9dbfSaguzovsk 			seg_plocked_window -= npages;
884*a98e9dbfSaguzovsk 			mutex_exit(&seg_pmem_mtx);
885*a98e9dbfSaguzovsk 			kmem_cache_free(seg_pkmcache, pcp);
886*a98e9dbfSaguzovsk 			goto out;
887*a98e9dbfSaguzovsk 		}
888*a98e9dbfSaguzovsk 		pcp->p_plink.p_lnext = pheadp->p_lnext;
889*a98e9dbfSaguzovsk 		pcp->p_plink.p_lprev = pheadp;
890*a98e9dbfSaguzovsk 		pheadp->p_lnext->p_lprev = &pcp->p_plink;
891*a98e9dbfSaguzovsk 		pheadp->p_lnext = &pcp->p_plink;
892*a98e9dbfSaguzovsk 	} else {
893*a98e9dbfSaguzovsk 		mutex_enter(&hp->p_hmutex);
894*a98e9dbfSaguzovsk 	}
895*a98e9dbfSaguzovsk 	pcp->p_hashp = hp;
8967c478bd9Sstevel@tonic-gate 	pcp->p_hnext = hp->p_hnext;
8977c478bd9Sstevel@tonic-gate 	pcp->p_hprev = (struct seg_pcache *)hp;
8987c478bd9Sstevel@tonic-gate 	hp->p_hnext->p_hprev = pcp;
8997c478bd9Sstevel@tonic-gate 	hp->p_hnext = pcp;
900*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags) &&
901*a98e9dbfSaguzovsk 	    hp->p_hprev == pcp) {
902*a98e9dbfSaguzovsk 		seg_padd_abuck(hp);
903*a98e9dbfSaguzovsk 	}
9047c478bd9Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
905*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
906*a98e9dbfSaguzovsk 		mutex_exit(pmtx);
907*a98e9dbfSaguzovsk 	}
908*a98e9dbfSaguzovsk 
909*a98e9dbfSaguzovsk out:
910*a98e9dbfSaguzovsk 	npages = 0;
911*a98e9dbfSaguzovsk 	while (delcallb_list != NULL) {
912*a98e9dbfSaguzovsk 		pcp = delcallb_list;
913*a98e9dbfSaguzovsk 		delcallb_list = pcp->p_hprev;
914*a98e9dbfSaguzovsk 		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
915*a98e9dbfSaguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
916*a98e9dbfSaguzovsk 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
917*a98e9dbfSaguzovsk 		npages += btop(pcp->p_len);
918*a98e9dbfSaguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
919*a98e9dbfSaguzovsk 	}
920*a98e9dbfSaguzovsk 	if (npages) {
921*a98e9dbfSaguzovsk 		ASSERT(!IS_PFLAGS_WIRED(flags));
922*a98e9dbfSaguzovsk 		mutex_enter(&seg_pmem_mtx);
923*a98e9dbfSaguzovsk 		ASSERT(seg_plocked >= npages);
924*a98e9dbfSaguzovsk 		ASSERT(seg_plocked_window >= npages);
925*a98e9dbfSaguzovsk 		seg_plocked -= npages;
926*a98e9dbfSaguzovsk 		seg_plocked_window -= npages;
927*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
928*a98e9dbfSaguzovsk 	}
929*a98e9dbfSaguzovsk 
9307c478bd9Sstevel@tonic-gate 	return (SEGP_SUCCESS);
9317c478bd9Sstevel@tonic-gate }
9327c478bd9Sstevel@tonic-gate 
9337c478bd9Sstevel@tonic-gate /*
934*a98e9dbfSaguzovsk  * Purge entries from the pagelock cache that are not active
935*a98e9dbfSaguzovsk  * and not recently used.
9367c478bd9Sstevel@tonic-gate  */
9377c478bd9Sstevel@tonic-gate static void
938*a98e9dbfSaguzovsk seg_ppurge_async(int force)
9397c478bd9Sstevel@tonic-gate {
9407c478bd9Sstevel@tonic-gate 	struct seg_pcache *delcallb_list = NULL;
9417c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
9427c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
9437c478bd9Sstevel@tonic-gate 	pgcnt_t npages = 0;
9447c478bd9Sstevel@tonic-gate 	pgcnt_t npages_window = 0;
945*a98e9dbfSaguzovsk 	pgcnt_t	npgs_to_purge;
946*a98e9dbfSaguzovsk 	pgcnt_t npgs_purged = 0;
947*a98e9dbfSaguzovsk 	int hlinks = 0;
948*a98e9dbfSaguzovsk 	int hlix;
949*a98e9dbfSaguzovsk 	pcache_link_t *hlinkp;
950*a98e9dbfSaguzovsk 	pcache_link_t *hlnextp = NULL;
951*a98e9dbfSaguzovsk 	int lowmem;
952*a98e9dbfSaguzovsk 	int trim;
953*a98e9dbfSaguzovsk 
954*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
9557c478bd9Sstevel@tonic-gate 
9567c478bd9Sstevel@tonic-gate 	/*
957*a98e9dbfSaguzovsk 	 * if the cache is off or empty, return
9587c478bd9Sstevel@tonic-gate 	 */
959*a98e9dbfSaguzovsk 	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
9607c478bd9Sstevel@tonic-gate 		return;
9617c478bd9Sstevel@tonic-gate 	}
9627c478bd9Sstevel@tonic-gate 
963*a98e9dbfSaguzovsk 	if (!force) {
964*a98e9dbfSaguzovsk 		lowmem = 0;
965*a98e9dbfSaguzovsk 		trim = 0;
966*a98e9dbfSaguzovsk 		if (freemem < lotsfree + needfree) {
967*a98e9dbfSaguzovsk 			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
968*a98e9dbfSaguzovsk 			if (fmem <= 5 * (desfree >> 2)) {
969*a98e9dbfSaguzovsk 				lowmem = 1;
970*a98e9dbfSaguzovsk 			} else if (fmem <= 7 * (lotsfree >> 3)) {
971*a98e9dbfSaguzovsk 				if (seg_plocked_window >=
972*a98e9dbfSaguzovsk 				    (availrmem_initial >> 1)) {
973*a98e9dbfSaguzovsk 					lowmem = 1;
974*a98e9dbfSaguzovsk 				}
975*a98e9dbfSaguzovsk 			} else if (fmem < lotsfree) {
976*a98e9dbfSaguzovsk 				if (seg_plocked_window >=
977*a98e9dbfSaguzovsk 				    3 * (availrmem_initial >> 2)) {
978*a98e9dbfSaguzovsk 					lowmem = 1;
979*a98e9dbfSaguzovsk 				}
980*a98e9dbfSaguzovsk 			}
981*a98e9dbfSaguzovsk 		}
982*a98e9dbfSaguzovsk 		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
983*a98e9dbfSaguzovsk 			trim = 1;
984*a98e9dbfSaguzovsk 		}
985*a98e9dbfSaguzovsk 		if (!lowmem && !trim) {
986*a98e9dbfSaguzovsk 			return;
987*a98e9dbfSaguzovsk 		}
988*a98e9dbfSaguzovsk 		npgs_to_purge = seg_plocked_window >>
989*a98e9dbfSaguzovsk 		    seg_pshrink_shift;
990*a98e9dbfSaguzovsk 		if (lowmem) {
991*a98e9dbfSaguzovsk 			npgs_to_purge = MIN(npgs_to_purge,
992*a98e9dbfSaguzovsk 			    MAX(seg_pmaxapurge_npages, desfree));
993*a98e9dbfSaguzovsk 		} else {
994*a98e9dbfSaguzovsk 			npgs_to_purge = MIN(npgs_to_purge,
995*a98e9dbfSaguzovsk 			    seg_pmaxapurge_npages);
996*a98e9dbfSaguzovsk 		}
997*a98e9dbfSaguzovsk 		if (npgs_to_purge == 0) {
998*a98e9dbfSaguzovsk 			return;
999*a98e9dbfSaguzovsk 		}
1000*a98e9dbfSaguzovsk 	} else {
1001*a98e9dbfSaguzovsk 		struct seg_phash_wired *hpw;
10027c478bd9Sstevel@tonic-gate 
1003*a98e9dbfSaguzovsk 		ASSERT(seg_phashsize_wired != 0);
10047c478bd9Sstevel@tonic-gate 
1005*a98e9dbfSaguzovsk 		for (hpw = seg_phashtab_wired;
1006*a98e9dbfSaguzovsk 		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1007*a98e9dbfSaguzovsk 
1008*a98e9dbfSaguzovsk 			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1009*a98e9dbfSaguzovsk 				continue;
1010*a98e9dbfSaguzovsk 			}
1011*a98e9dbfSaguzovsk 
1012*a98e9dbfSaguzovsk 			mutex_enter(&hpw->p_hmutex);
1013*a98e9dbfSaguzovsk 
1014*a98e9dbfSaguzovsk 			for (pcp = hpw->p_hnext;
1015*a98e9dbfSaguzovsk 			    pcp != (struct seg_pcache *)hpw;
1016*a98e9dbfSaguzovsk 			    pcp = pcp->p_hnext) {
1017*a98e9dbfSaguzovsk 
1018*a98e9dbfSaguzovsk 				ASSERT(IS_PCP_WIRED(pcp));
1019*a98e9dbfSaguzovsk 				ASSERT(pcp->p_hashp ==
1020*a98e9dbfSaguzovsk 				    (struct seg_phash *)hpw);
1021*a98e9dbfSaguzovsk 
1022*a98e9dbfSaguzovsk 				if (pcp->p_active) {
1023*a98e9dbfSaguzovsk 					continue;
10247c478bd9Sstevel@tonic-gate 				}
1025*a98e9dbfSaguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1026*a98e9dbfSaguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1027*a98e9dbfSaguzovsk 				pcp->p_hprev = delcallb_list;
1028*a98e9dbfSaguzovsk 				delcallb_list = pcp;
1029*a98e9dbfSaguzovsk 			}
1030*a98e9dbfSaguzovsk 			mutex_exit(&hpw->p_hmutex);
1031*a98e9dbfSaguzovsk 		}
1032*a98e9dbfSaguzovsk 	}
1033*a98e9dbfSaguzovsk 
1034*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1035*a98e9dbfSaguzovsk 	if (seg_pathr_on) {
1036*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
1037*a98e9dbfSaguzovsk 		goto runcb;
1038*a98e9dbfSaguzovsk 	}
1039*a98e9dbfSaguzovsk 	seg_pathr_on = 1;
1040*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
1041*a98e9dbfSaguzovsk 	ASSERT(seg_pahcur <= 1);
1042*a98e9dbfSaguzovsk 	hlix = !seg_pahcur;
1043*a98e9dbfSaguzovsk 
1044*a98e9dbfSaguzovsk again:
1045*a98e9dbfSaguzovsk 	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1046*a98e9dbfSaguzovsk 	    hlinkp = hlnextp) {
1047*a98e9dbfSaguzovsk 
1048*a98e9dbfSaguzovsk 		hlnextp = hlinkp->p_lnext;
1049*a98e9dbfSaguzovsk 		ASSERT(hlnextp != NULL);
1050*a98e9dbfSaguzovsk 
1051*a98e9dbfSaguzovsk 		hp = hlink2phash(hlinkp, hlix);
1052*a98e9dbfSaguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1053*a98e9dbfSaguzovsk 			seg_pathr_empty_ahb++;
1054*a98e9dbfSaguzovsk 			continue;
1055*a98e9dbfSaguzovsk 		}
1056*a98e9dbfSaguzovsk 		seg_pathr_full_ahb++;
1057*a98e9dbfSaguzovsk 		mutex_enter(&hp->p_hmutex);
1058*a98e9dbfSaguzovsk 
1059*a98e9dbfSaguzovsk 		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1060*a98e9dbfSaguzovsk 		    pcp = pcp->p_hnext) {
1061*a98e9dbfSaguzovsk 			pcache_link_t *pheadp;
1062*a98e9dbfSaguzovsk 			pcache_link_t *plinkp;
1063*a98e9dbfSaguzovsk 			void *htag0;
1064*a98e9dbfSaguzovsk 			kmutex_t *pmtx;
1065*a98e9dbfSaguzovsk 
1066*a98e9dbfSaguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
1067*a98e9dbfSaguzovsk 			ASSERT(pcp->p_hashp == hp);
1068*a98e9dbfSaguzovsk 
1069*a98e9dbfSaguzovsk 			if (pcp->p_active) {
1070*a98e9dbfSaguzovsk 				continue;
1071*a98e9dbfSaguzovsk 			}
1072*a98e9dbfSaguzovsk 			if (!force && pcp->p_ref &&
1073*a98e9dbfSaguzovsk 			    PCP_AGE(pcp) < seg_pmax_pcpage) {
10747c478bd9Sstevel@tonic-gate 				pcp->p_ref = 0;
1075*a98e9dbfSaguzovsk 				continue;
10767c478bd9Sstevel@tonic-gate 			}
1077*a98e9dbfSaguzovsk 			plinkp = &pcp->p_plink;
1078*a98e9dbfSaguzovsk 			htag0 = pcp->p_htag0;
1079*a98e9dbfSaguzovsk 			if (pcp->p_flags & SEGP_AMP) {
1080*a98e9dbfSaguzovsk 				pheadp = &((amp_t *)htag0)->a_phead;
1081*a98e9dbfSaguzovsk 				pmtx = &((amp_t *)htag0)->a_pmtx;
1082*a98e9dbfSaguzovsk 			} else {
1083*a98e9dbfSaguzovsk 				pheadp = &((seg_t *)htag0)->s_phead;
1084*a98e9dbfSaguzovsk 				pmtx = &((seg_t *)htag0)->s_pmtx;
1085*a98e9dbfSaguzovsk 			}
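			/*
			 * The owner's list mutex (pmtx) is normally acquired
			 * before the hash bucket mutex, so only try-enter it
			 * here; skip the entry if the lock is busy rather
			 * than risk a lock-order deadlock.
			 */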
1086*a98e9dbfSaguzovsk 			if (!mutex_tryenter(pmtx)) {
1087*a98e9dbfSaguzovsk 				continue;
1088*a98e9dbfSaguzovsk 			}
1089*a98e9dbfSaguzovsk 			ASSERT(pheadp->p_lnext != pheadp);
1090*a98e9dbfSaguzovsk 			ASSERT(pheadp->p_lprev != pheadp);
1091*a98e9dbfSaguzovsk 			plinkp->p_lprev->p_lnext =
1092*a98e9dbfSaguzovsk 			    plinkp->p_lnext;
1093*a98e9dbfSaguzovsk 			plinkp->p_lnext->p_lprev =
1094*a98e9dbfSaguzovsk 			    plinkp->p_lprev;
1095*a98e9dbfSaguzovsk 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1096*a98e9dbfSaguzovsk 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1097*a98e9dbfSaguzovsk 			mutex_exit(pmtx);
1098*a98e9dbfSaguzovsk 			pcp->p_hprev = delcallb_list;
1099*a98e9dbfSaguzovsk 			delcallb_list = pcp;
1100*a98e9dbfSaguzovsk 			npgs_purged += btop(pcp->p_len);
1101*a98e9dbfSaguzovsk 		}
1102*a98e9dbfSaguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1103*a98e9dbfSaguzovsk 			seg_premove_abuck(hp, 1);
11047c478bd9Sstevel@tonic-gate 		}
11057c478bd9Sstevel@tonic-gate 		mutex_exit(&hp->p_hmutex);
1106*a98e9dbfSaguzovsk 		if (npgs_purged >= seg_plocked_window) {
11077c478bd9Sstevel@tonic-gate 			break;
1108*a98e9dbfSaguzovsk 		}
1109*a98e9dbfSaguzovsk 		if (!force) {
1110*a98e9dbfSaguzovsk 			if (npgs_purged >= npgs_to_purge) {
1111*a98e9dbfSaguzovsk 				break;
1112*a98e9dbfSaguzovsk 			}
1113*a98e9dbfSaguzovsk 			if (!trim && !(seg_pathr_full_ahb & 15)) {
1114*a98e9dbfSaguzovsk 				ASSERT(lowmem);
1115*a98e9dbfSaguzovsk 				if (freemem >= lotsfree + needfree) {
1116*a98e9dbfSaguzovsk 					break;
1117*a98e9dbfSaguzovsk 				}
1118*a98e9dbfSaguzovsk 			}
1119*a98e9dbfSaguzovsk 		}
11207c478bd9Sstevel@tonic-gate 	}
11217c478bd9Sstevel@tonic-gate 
1122*a98e9dbfSaguzovsk 	if (hlinkp == &seg_pahhead[hlix]) {
1123*a98e9dbfSaguzovsk 		/*
1124*a98e9dbfSaguzovsk 		 * We processed the entire hlix active bucket list
1125*a98e9dbfSaguzovsk 		 * but didn't find enough pages to reclaim.
1126*a98e9dbfSaguzovsk 		 * Switch the lists and walk the other list
1127*a98e9dbfSaguzovsk 		 * if we haven't done it yet.
1128*a98e9dbfSaguzovsk 		 */
1129*a98e9dbfSaguzovsk 		mutex_enter(&seg_pmem_mtx);
1130*a98e9dbfSaguzovsk 		ASSERT(seg_pathr_on);
1131*a98e9dbfSaguzovsk 		ASSERT(seg_pahcur == !hlix);
1132*a98e9dbfSaguzovsk 		seg_pahcur = hlix;
1133*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
1134*a98e9dbfSaguzovsk 		if (++hlinks < 2) {
1135*a98e9dbfSaguzovsk 			hlix = !hlix;
1136*a98e9dbfSaguzovsk 			goto again;
1137*a98e9dbfSaguzovsk 		}
1138*a98e9dbfSaguzovsk 	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1139*a98e9dbfSaguzovsk 	    seg_pahhead[hlix].p_lnext != hlinkp) {
1140*a98e9dbfSaguzovsk 		ASSERT(hlinkp != NULL);
1141*a98e9dbfSaguzovsk 		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1142*a98e9dbfSaguzovsk 		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1143*a98e9dbfSaguzovsk 		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1144*a98e9dbfSaguzovsk 
1145*a98e9dbfSaguzovsk 		/*
1146*a98e9dbfSaguzovsk 		 * Reinsert the list header just before hlinkp
1147*a98e9dbfSaguzovsk 		 * so that we start from hlinkp's bucket next time around.
1148*a98e9dbfSaguzovsk 		 */
1149*a98e9dbfSaguzovsk 		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1150*a98e9dbfSaguzovsk 		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1151*a98e9dbfSaguzovsk 		seg_pahhead[hlix].p_lnext = hlinkp;
1152*a98e9dbfSaguzovsk 		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1153*a98e9dbfSaguzovsk 		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1154*a98e9dbfSaguzovsk 		hlinkp->p_lprev = &seg_pahhead[hlix];
1155*a98e9dbfSaguzovsk 	}
1156*a98e9dbfSaguzovsk 
1157*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1158*a98e9dbfSaguzovsk 	ASSERT(seg_pathr_on);
1159*a98e9dbfSaguzovsk 	seg_pathr_on = 0;
1160*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
1161*a98e9dbfSaguzovsk 
1162*a98e9dbfSaguzovsk runcb:
11637c478bd9Sstevel@tonic-gate 	/*
1164*a98e9dbfSaguzovsk 	 * Run the delayed callback list.  Segments/amps can't go away until
1165*a98e9dbfSaguzovsk 	 * the callback is executed since they must have a non-zero
1166*a98e9dbfSaguzovsk 	 * softlockcnt.  That's why we don't need to hold as/seg/amp locks to
	 * execute the callback.
11677c478bd9Sstevel@tonic-gate 	 */
11687c478bd9Sstevel@tonic-gate 	while (delcallb_list != NULL) {
11697c478bd9Sstevel@tonic-gate 		pcp = delcallb_list;
11707c478bd9Sstevel@tonic-gate 		delcallb_list = pcp->p_hprev;
1171*a98e9dbfSaguzovsk 		ASSERT(!pcp->p_active);
1172*a98e9dbfSaguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1173*a98e9dbfSaguzovsk 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1174*a98e9dbfSaguzovsk 		npages += btop(pcp->p_len);
1175*a98e9dbfSaguzovsk 		if (!IS_PCP_WIRED(pcp)) {
1176*a98e9dbfSaguzovsk 			npages_window += btop(pcp->p_len);
11777c478bd9Sstevel@tonic-gate 		}
1178*a98e9dbfSaguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
1179*a98e9dbfSaguzovsk 	}
1180*a98e9dbfSaguzovsk 	if (npages) {
1181*a98e9dbfSaguzovsk 		mutex_enter(&seg_pmem_mtx);
1182*a98e9dbfSaguzovsk 		ASSERT(seg_plocked >= npages);
1183*a98e9dbfSaguzovsk 		ASSERT(seg_plocked_window >= npages_window);
1184*a98e9dbfSaguzovsk 		seg_plocked -= npages;
1185*a98e9dbfSaguzovsk 		seg_plocked_window -= npages_window;
1186*a98e9dbfSaguzovsk 		mutex_exit(&seg_pmem_mtx);
11877c478bd9Sstevel@tonic-gate 	}
11887c478bd9Sstevel@tonic-gate }
11897c478bd9Sstevel@tonic-gate 
11907c478bd9Sstevel@tonic-gate /*
1191*a98e9dbfSaguzovsk  * Remove cached pages for a segment's entries from the hashtable.  The
1192*a98e9dbfSaguzovsk  * entries are identified by the pp array.  This is useful when multiple segs
1193*a98e9dbfSaguzovsk  * are cached on behalf of a dummy segment (ISM/DISM) with a common pp array.
11947c478bd9Sstevel@tonic-gate  */
11957c478bd9Sstevel@tonic-gate void
1196*a98e9dbfSaguzovsk seg_ppurge_wiredpp(struct page **pp)
11977c478bd9Sstevel@tonic-gate {
1198*a98e9dbfSaguzovsk 	struct seg_pcache *pcp;
1199*a98e9dbfSaguzovsk 	struct seg_phash_wired *hp;
12007c478bd9Sstevel@tonic-gate 	pgcnt_t npages = 0;
1201*a98e9dbfSaguzovsk 	struct	seg_pcache *delcallb_list = NULL;
12027c478bd9Sstevel@tonic-gate 
12037c478bd9Sstevel@tonic-gate 	/*
1204*a98e9dbfSaguzovsk 	 * if the cache is empty, return
12057c478bd9Sstevel@tonic-gate 	 */
1206*a98e9dbfSaguzovsk 	if (seg_plocked == 0) {
12077c478bd9Sstevel@tonic-gate 		return;
12087c478bd9Sstevel@tonic-gate 	}
1209*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_wired != 0);
12107c478bd9Sstevel@tonic-gate 
1211*a98e9dbfSaguzovsk 	for (hp = seg_phashtab_wired;
1212*a98e9dbfSaguzovsk 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1213*a98e9dbfSaguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1214*a98e9dbfSaguzovsk 			continue;
1215*a98e9dbfSaguzovsk 		}
12167c478bd9Sstevel@tonic-gate 		mutex_enter(&hp->p_hmutex);
12177c478bd9Sstevel@tonic-gate 		pcp = hp->p_hnext;
12187c478bd9Sstevel@tonic-gate 		while (pcp != (struct seg_pcache *)hp) {
1219*a98e9dbfSaguzovsk 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1220*a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
12217c478bd9Sstevel@tonic-gate 			/*
12227c478bd9Sstevel@tonic-gate 			 * purge entries which are not active
12237c478bd9Sstevel@tonic-gate 			 */
1224*a98e9dbfSaguzovsk 			if (!pcp->p_active && pcp->p_pp == pp) {
1225*a98e9dbfSaguzovsk 				ASSERT(pcp->p_htag0 != NULL);
12267c478bd9Sstevel@tonic-gate 				pcp->p_hprev->p_hnext = pcp->p_hnext;
12277c478bd9Sstevel@tonic-gate 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1228*a98e9dbfSaguzovsk 				pcp->p_hprev = delcallb_list;
1229*a98e9dbfSaguzovsk 				delcallb_list = pcp;
12307c478bd9Sstevel@tonic-gate 			}
1231*a98e9dbfSaguzovsk 			pcp = pcp->p_hnext;
12327c478bd9Sstevel@tonic-gate 		}
12337c478bd9Sstevel@tonic-gate 		mutex_exit(&hp->p_hmutex);
1234*a98e9dbfSaguzovsk 		/*
1235*a98e9dbfSaguzovsk 		 * Segments can't go away until the callback is executed since
1236*a98e9dbfSaguzovsk 		 * they must have a non-zero softlockcnt.  That's why we don't
1237*a98e9dbfSaguzovsk 		 * need to hold as/seg locks to execute the callback.
1238*a98e9dbfSaguzovsk 		 */
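		/*
		 * A nonzero return from the callback is taken to mean that
		 * nothing else is cached for this pp array, so the scan can
		 * stop early (note the ASSERT below that the local list is
		 * already empty in that case).
		 */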
1239*a98e9dbfSaguzovsk 		while (delcallb_list != NULL) {
1240*a98e9dbfSaguzovsk 			int done;
1241*a98e9dbfSaguzovsk 			pcp = delcallb_list;
1242*a98e9dbfSaguzovsk 			delcallb_list = pcp->p_hprev;
1243*a98e9dbfSaguzovsk 			ASSERT(!pcp->p_active);
1244*a98e9dbfSaguzovsk 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1245*a98e9dbfSaguzovsk 			    pcp->p_len, pcp->p_pp,
1246*a98e9dbfSaguzovsk 			    pcp->p_write ? S_WRITE : S_READ, 1);
1247*a98e9dbfSaguzovsk 			npages += btop(pcp->p_len);
1248*a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
1249*a98e9dbfSaguzovsk 			kmem_cache_free(seg_pkmcache, pcp);
1250*a98e9dbfSaguzovsk 			if (done) {
1251*a98e9dbfSaguzovsk 				ASSERT(delcallb_list == NULL);
1252*a98e9dbfSaguzovsk 				goto out;
1253*a98e9dbfSaguzovsk 			}
1254*a98e9dbfSaguzovsk 		}
12557c478bd9Sstevel@tonic-gate 	}
12567c478bd9Sstevel@tonic-gate 
1257*a98e9dbfSaguzovsk out:
1258*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1259*a98e9dbfSaguzovsk 	ASSERT(seg_plocked >= npages);
12607c478bd9Sstevel@tonic-gate 	seg_plocked -= npages;
1261*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
12627c478bd9Sstevel@tonic-gate }
12637c478bd9Sstevel@tonic-gate 
12647c478bd9Sstevel@tonic-gate /*
12657c478bd9Sstevel@tonic-gate  * Purge all entries for a given segment.  Since we
12667c478bd9Sstevel@tonic-gate  * call back into the segment driver directly for page
12677c478bd9Sstevel@tonic-gate  * reclaim, the caller needs to hold the right locks.
12687c478bd9Sstevel@tonic-gate  */
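/*
 * A typical (purely illustrative) call from a segment driver that is tearing
 * down its mappings would be seg_ppurge(seg, NULL, 0) to drop all inactive,
 * non-wired pcache entries hashed by the segment itself, or
 * seg_ppurge(seg, amp, 0) to purge the entries hashed by the segment's
 * anon_map instead.
 */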
12697c478bd9Sstevel@tonic-gate void
1270*a98e9dbfSaguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
12717c478bd9Sstevel@tonic-gate {
12727c478bd9Sstevel@tonic-gate 	struct seg_pcache *delcallb_list = NULL;
12737c478bd9Sstevel@tonic-gate 	struct seg_pcache *pcp;
12747c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
12757c478bd9Sstevel@tonic-gate 	pgcnt_t npages = 0;
1276*a98e9dbfSaguzovsk 	void *htag0;
12777c478bd9Sstevel@tonic-gate 
1278*a98e9dbfSaguzovsk 	if (seg_plocked == 0) {
12797c478bd9Sstevel@tonic-gate 		return;
12807c478bd9Sstevel@tonic-gate 	}
1281*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
1282*a98e9dbfSaguzovsk 
1283*a98e9dbfSaguzovsk 	/*
1284*a98e9dbfSaguzovsk 	 * If amp is not NULL, use amp as the lookup tag; otherwise use seg
1285*a98e9dbfSaguzovsk 	 * as the lookup tag.
1286*a98e9dbfSaguzovsk 	 */
1287*a98e9dbfSaguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288*a98e9dbfSaguzovsk 	ASSERT(htag0 != NULL);
1289*a98e9dbfSaguzovsk 	if (IS_PFLAGS_WIRED(flags)) {
1290*a98e9dbfSaguzovsk 		hp = P_HASHBP(seg, htag0, 0, flags);
1291*a98e9dbfSaguzovsk 		mutex_enter(&hp->p_hmutex);
1292*a98e9dbfSaguzovsk 		pcp = hp->p_hnext;
1293*a98e9dbfSaguzovsk 		while (pcp != (struct seg_pcache *)hp) {
1294*a98e9dbfSaguzovsk 			ASSERT(pcp->p_hashp == hp);
1295*a98e9dbfSaguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
1296*a98e9dbfSaguzovsk 			if (pcp->p_htag0 == htag0) {
1297*a98e9dbfSaguzovsk 				if (pcp->p_active) {
1298*a98e9dbfSaguzovsk 					break;
1299*a98e9dbfSaguzovsk 				}
1300*a98e9dbfSaguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1301*a98e9dbfSaguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1302*a98e9dbfSaguzovsk 				pcp->p_hprev = delcallb_list;
1303*a98e9dbfSaguzovsk 				delcallb_list = pcp;
1304*a98e9dbfSaguzovsk 			}
1305*a98e9dbfSaguzovsk 			pcp = pcp->p_hnext;
1306*a98e9dbfSaguzovsk 		}
1307*a98e9dbfSaguzovsk 		mutex_exit(&hp->p_hmutex);
1308*a98e9dbfSaguzovsk 	} else {
1309*a98e9dbfSaguzovsk 		pcache_link_t *plinkp;
1310*a98e9dbfSaguzovsk 		pcache_link_t *pheadp;
1311*a98e9dbfSaguzovsk 		kmutex_t *pmtx;
1312*a98e9dbfSaguzovsk 
1313*a98e9dbfSaguzovsk 		if (amp == NULL) {
1314*a98e9dbfSaguzovsk 			ASSERT(seg != NULL);
1315*a98e9dbfSaguzovsk 			pheadp = &seg->s_phead;
1316*a98e9dbfSaguzovsk 			pmtx = &seg->s_pmtx;
1317*a98e9dbfSaguzovsk 		} else {
1318*a98e9dbfSaguzovsk 			pheadp = &amp->a_phead;
1319*a98e9dbfSaguzovsk 			pmtx = &amp->a_pmtx;
1320*a98e9dbfSaguzovsk 		}
1321*a98e9dbfSaguzovsk 		mutex_enter(pmtx);
1322*a98e9dbfSaguzovsk 		while ((plinkp = pheadp->p_lnext) != pheadp) {
1323*a98e9dbfSaguzovsk 			pcp = plink2pcache(plinkp);
1324*a98e9dbfSaguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
1325*a98e9dbfSaguzovsk 			ASSERT(pcp->p_htag0 == htag0);
1326*a98e9dbfSaguzovsk 			hp = pcp->p_hashp;
1327*a98e9dbfSaguzovsk 			mutex_enter(&hp->p_hmutex);
13287c478bd9Sstevel@tonic-gate 			if (pcp->p_active) {
1329*a98e9dbfSaguzovsk 				mutex_exit(&hp->p_hmutex);
13307c478bd9Sstevel@tonic-gate 				break;
13317c478bd9Sstevel@tonic-gate 			}
1332*a98e9dbfSaguzovsk 			ASSERT(plinkp->p_lprev == pheadp);
1333*a98e9dbfSaguzovsk 			pheadp->p_lnext = plinkp->p_lnext;
1334*a98e9dbfSaguzovsk 			plinkp->p_lnext->p_lprev = pheadp;
13357c478bd9Sstevel@tonic-gate 			pcp->p_hprev->p_hnext = pcp->p_hnext;
13367c478bd9Sstevel@tonic-gate 			pcp->p_hnext->p_hprev = pcp->p_hprev;
13377c478bd9Sstevel@tonic-gate 			pcp->p_hprev = delcallb_list;
13387c478bd9Sstevel@tonic-gate 			delcallb_list = pcp;
1339*a98e9dbfSaguzovsk 			if (hp->p_hnext == (struct seg_pcache *)hp) {
1340*a98e9dbfSaguzovsk 				seg_premove_abuck(hp, 0);
1341*a98e9dbfSaguzovsk 			}
1342*a98e9dbfSaguzovsk 			mutex_exit(&hp->p_hmutex);
13437c478bd9Sstevel@tonic-gate 		}
1344*a98e9dbfSaguzovsk 		mutex_exit(pmtx);
13457c478bd9Sstevel@tonic-gate 	}
13467c478bd9Sstevel@tonic-gate 	while (delcallb_list != NULL) {
13477c478bd9Sstevel@tonic-gate 		pcp = delcallb_list;
13487c478bd9Sstevel@tonic-gate 		delcallb_list = pcp->p_hprev;
1349*a98e9dbfSaguzovsk 		ASSERT(!pcp->p_active);
1350*a98e9dbfSaguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351*a98e9dbfSaguzovsk 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352*a98e9dbfSaguzovsk 		npages += btop(pcp->p_len);
1353*a98e9dbfSaguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
13547c478bd9Sstevel@tonic-gate 	}
1355*a98e9dbfSaguzovsk 	mutex_enter(&seg_pmem_mtx);
1356*a98e9dbfSaguzovsk 	ASSERT(seg_plocked >= npages);
13577c478bd9Sstevel@tonic-gate 	seg_plocked -= npages;
1358*a98e9dbfSaguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
1359*a98e9dbfSaguzovsk 		ASSERT(seg_plocked_window >= npages);
1360*a98e9dbfSaguzovsk 		seg_plocked_window -= npages;
1361*a98e9dbfSaguzovsk 	}
1362*a98e9dbfSaguzovsk 	mutex_exit(&seg_pmem_mtx);
13637c478bd9Sstevel@tonic-gate }
13647c478bd9Sstevel@tonic-gate 
13657c478bd9Sstevel@tonic-gate static void seg_pinit_mem_config(void);
13667c478bd9Sstevel@tonic-gate 
13677c478bd9Sstevel@tonic-gate /*
13687c478bd9Sstevel@tonic-gate  * set up the pagelock cache
13697c478bd9Sstevel@tonic-gate  */
13707c478bd9Sstevel@tonic-gate static void
13717c478bd9Sstevel@tonic-gate seg_pinit(void)
13727c478bd9Sstevel@tonic-gate {
13737c478bd9Sstevel@tonic-gate 	struct seg_phash *hp;
1374*a98e9dbfSaguzovsk 	ulong_t i;
1375*a98e9dbfSaguzovsk 	pgcnt_t physmegs;
13767c478bd9Sstevel@tonic-gate 
1377*a98e9dbfSaguzovsk 	seg_plocked = 0;
1378*a98e9dbfSaguzovsk 	seg_plocked_window = 0;
13797c478bd9Sstevel@tonic-gate 
1380*a98e9dbfSaguzovsk 	if (segpcache_enabled == 0) {
1381*a98e9dbfSaguzovsk 		seg_phashsize_win = 0;
1382*a98e9dbfSaguzovsk 		seg_phashsize_wired = 0;
1383*a98e9dbfSaguzovsk 		seg_pdisabled = 1;
1384*a98e9dbfSaguzovsk 		return;
1385*a98e9dbfSaguzovsk 	}
13867c478bd9Sstevel@tonic-gate 
1387*a98e9dbfSaguzovsk 	seg_pdisabled = 0;
1388*a98e9dbfSaguzovsk 	seg_pkmcache = kmem_cache_create("seg_pcache",
1389*a98e9dbfSaguzovsk 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390*a98e9dbfSaguzovsk 	if (segpcache_pcp_maxage_ticks <= 0) {
1391*a98e9dbfSaguzovsk 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392*a98e9dbfSaguzovsk 	}
1393*a98e9dbfSaguzovsk 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394*a98e9dbfSaguzovsk 	seg_pathr_empty_ahb = 0;
1395*a98e9dbfSaguzovsk 	seg_pathr_full_ahb = 0;
1396*a98e9dbfSaguzovsk 	seg_pshrink_shift = segpcache_shrink_shift;
1397*a98e9dbfSaguzovsk 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
13987c478bd9Sstevel@tonic-gate 
1399*a98e9dbfSaguzovsk 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400*a98e9dbfSaguzovsk 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401*a98e9dbfSaguzovsk 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402*a98e9dbfSaguzovsk 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403*a98e9dbfSaguzovsk 
1404*a98e9dbfSaguzovsk 	physmegs = physmem >> (20 - PAGESHIFT);
1405*a98e9dbfSaguzovsk 
1406*a98e9dbfSaguzovsk 	/*
1407*a98e9dbfSaguzovsk 	 * If segpcache_hashsize_win was not set in /etc/system or it has an
1408*a98e9dbfSaguzovsk 	 * absurd value, set it to a default.
1409*a98e9dbfSaguzovsk 	 */
1410*a98e9dbfSaguzovsk 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411*a98e9dbfSaguzovsk 		/*
1412*a98e9dbfSaguzovsk 		 * Create one bucket per 32K (or at least per 8 pages) of
1413*a98e9dbfSaguzovsk 		 * available memory.
1414*a98e9dbfSaguzovsk 		 */
1415*a98e9dbfSaguzovsk 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416*a98e9dbfSaguzovsk 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417*a98e9dbfSaguzovsk 	}
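	/*
	 * Round segpcache_hashsize_win to the nearest power of two.  Adding
	 * the bits below the leading bit back in doubles the remainder, so
	 * keeping only the high bit of the sum rounds up whenever the
	 * remainder is at least half of the leading power of two.
	 */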
1418*a98e9dbfSaguzovsk 	if (!ISP2(segpcache_hashsize_win)) {
1419*a98e9dbfSaguzovsk 		ulong_t rndfac = ~(1UL <<
1420*a98e9dbfSaguzovsk 		    (highbit(segpcache_hashsize_win) - 1));
1421*a98e9dbfSaguzovsk 		rndfac &= segpcache_hashsize_win;
1422*a98e9dbfSaguzovsk 		segpcache_hashsize_win += rndfac;
1423*a98e9dbfSaguzovsk 		segpcache_hashsize_win = 1 <<
1424*a98e9dbfSaguzovsk 		    (highbit(segpcache_hashsize_win) - 1);
1425*a98e9dbfSaguzovsk 	}
1426*a98e9dbfSaguzovsk 	seg_phashsize_win = segpcache_hashsize_win;
1427*a98e9dbfSaguzovsk 	seg_phashtab_win = kmem_zalloc(
1428*a98e9dbfSaguzovsk 	    seg_phashsize_win * sizeof (struct seg_phash),
1429*a98e9dbfSaguzovsk 	    KM_SLEEP);
1430*a98e9dbfSaguzovsk 	for (i = 0; i < seg_phashsize_win; i++) {
1431*a98e9dbfSaguzovsk 		hp = &seg_phashtab_win[i];
1432*a98e9dbfSaguzovsk 		hp->p_hnext = (struct seg_pcache *)hp;
1433*a98e9dbfSaguzovsk 		hp->p_hprev = (struct seg_pcache *)hp;
1434*a98e9dbfSaguzovsk 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435*a98e9dbfSaguzovsk 	}
1436*a98e9dbfSaguzovsk 
1437*a98e9dbfSaguzovsk 	seg_pahcur = 0;
1438*a98e9dbfSaguzovsk 	seg_pathr_on = 0;
1439*a98e9dbfSaguzovsk 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440*a98e9dbfSaguzovsk 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441*a98e9dbfSaguzovsk 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442*a98e9dbfSaguzovsk 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443*a98e9dbfSaguzovsk 
1444*a98e9dbfSaguzovsk 	/*
1445*a98e9dbfSaguzovsk 	 * If segpcache_hashsize_wired was not set in /etc/system or it has an
1446*a98e9dbfSaguzovsk 	 * absurd value, set it to a default.
1447*a98e9dbfSaguzovsk 	 */
1448*a98e9dbfSaguzovsk 	if (segpcache_hashsize_wired == 0 ||
1449*a98e9dbfSaguzovsk 	    segpcache_hashsize_wired > physmem / 4) {
1450*a98e9dbfSaguzovsk 		/*
1451*a98e9dbfSaguzovsk 		 * Choose segpcache_hashsize_wired based on physmem.
1452*a98e9dbfSaguzovsk 		 * Create a bucket per 128K bytes, up to 256K buckets.
1453*a98e9dbfSaguzovsk 		 */
1454*a98e9dbfSaguzovsk 		if (physmegs < 20 * 1024) {
1455*a98e9dbfSaguzovsk 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1456*a98e9dbfSaguzovsk 		} else {
1457*a98e9dbfSaguzovsk 			segpcache_hashsize_wired = 256 * 1024;
14587c478bd9Sstevel@tonic-gate 		}
14597c478bd9Sstevel@tonic-gate 	}
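	/*
	 * Round segpcache_hashsize_wired up to the next power of two.
	 */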
1460*a98e9dbfSaguzovsk 	if (!ISP2(segpcache_hashsize_wired)) {
1461*a98e9dbfSaguzovsk 		segpcache_hashsize_wired = 1 <<
1462*a98e9dbfSaguzovsk 		    highbit(segpcache_hashsize_wired);
1463*a98e9dbfSaguzovsk 	}
1464*a98e9dbfSaguzovsk 	seg_phashsize_wired = segpcache_hashsize_wired;
1465*a98e9dbfSaguzovsk 	seg_phashtab_wired = kmem_zalloc(
1466*a98e9dbfSaguzovsk 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1467*a98e9dbfSaguzovsk 	for (i = 0; i < seg_phashsize_wired; i++) {
1468*a98e9dbfSaguzovsk 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1469*a98e9dbfSaguzovsk 		hp->p_hnext = (struct seg_pcache *)hp;
1470*a98e9dbfSaguzovsk 		hp->p_hprev = (struct seg_pcache *)hp;
1471*a98e9dbfSaguzovsk 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1472*a98e9dbfSaguzovsk 	}
14737c478bd9Sstevel@tonic-gate 
1474*a98e9dbfSaguzovsk 	if (segpcache_maxwindow == 0) {
1475*a98e9dbfSaguzovsk 		if (physmegs < 64) {
1476*a98e9dbfSaguzovsk 			/* 3% of memory */
1477*a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 5;
1478*a98e9dbfSaguzovsk 		} else if (physmegs < 512) {
1479*a98e9dbfSaguzovsk 			/* 12% of memory */
1480*a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 3;
1481*a98e9dbfSaguzovsk 		} else if (physmegs < 1024) {
1482*a98e9dbfSaguzovsk 			/* 25% of memory */
1483*a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 2;
1484*a98e9dbfSaguzovsk 		} else if (physmegs < 2048) {
1485*a98e9dbfSaguzovsk 			/* 50% of memory */
1486*a98e9dbfSaguzovsk 			segpcache_maxwindow = availrmem >> 1;
1487*a98e9dbfSaguzovsk 		} else {
1488*a98e9dbfSaguzovsk 			/* no limit */
1489*a98e9dbfSaguzovsk 			segpcache_maxwindow = (pgcnt_t)-1;
1490*a98e9dbfSaguzovsk 		}
1491*a98e9dbfSaguzovsk 	}
1492*a98e9dbfSaguzovsk 	seg_pmaxwindow = segpcache_maxwindow;
14937c478bd9Sstevel@tonic-gate 	seg_pinit_mem_config();
14947c478bd9Sstevel@tonic-gate }
14957c478bd9Sstevel@tonic-gate 
14967c478bd9Sstevel@tonic-gate /*
14977c478bd9Sstevel@tonic-gate  * called by pageout if memory is low
14987c478bd9Sstevel@tonic-gate  */
14997c478bd9Sstevel@tonic-gate void
15007c478bd9Sstevel@tonic-gate seg_preap(void)
15017c478bd9Sstevel@tonic-gate {
15027c478bd9Sstevel@tonic-gate 	/*
1503*a98e9dbfSaguzovsk 	 * if the cache is off or empty, return
15047c478bd9Sstevel@tonic-gate 	 */
1505*a98e9dbfSaguzovsk 	if (seg_plocked_window == 0) {
15067c478bd9Sstevel@tonic-gate 		return;
15077c478bd9Sstevel@tonic-gate 	}
1508*a98e9dbfSaguzovsk 	ASSERT(seg_phashsize_win != 0);
15097c478bd9Sstevel@tonic-gate 
1510*a98e9dbfSaguzovsk 	/*
1511*a98e9dbfSaguzovsk 	 * If pcache is currently disabled (e.g., somebody is already
1512*a98e9dbfSaguzovsk 	 * purging it), just return.
1513*a98e9dbfSaguzovsk 	 */
1514*a98e9dbfSaguzovsk 	if (seg_pdisabled) {
1515*a98e9dbfSaguzovsk 		return;
1516*a98e9dbfSaguzovsk 	}
1517*a98e9dbfSaguzovsk 
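	/*
	 * Wake up seg_pasync_thread() so it starts a reclaim pass right away.
	 */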
1518*a98e9dbfSaguzovsk 	cv_signal(&seg_pasync_cv);
1519*a98e9dbfSaguzovsk }
15207c478bd9Sstevel@tonic-gate 
15217c478bd9Sstevel@tonic-gate /*
15227c478bd9Sstevel@tonic-gate  * Run as a background thread and reclaim pagelock
15237c478bd9Sstevel@tonic-gate  * pages that have not been used recently.
15247c478bd9Sstevel@tonic-gate  */
15257c478bd9Sstevel@tonic-gate void
15267c478bd9Sstevel@tonic-gate seg_pasync_thread(void)
15277c478bd9Sstevel@tonic-gate {
15287c478bd9Sstevel@tonic-gate 	callb_cpr_t cpr_info;
15297c478bd9Sstevel@tonic-gate 
1530*a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1531*a98e9dbfSaguzovsk 		thread_exit();
1532*a98e9dbfSaguzovsk 		/*NOTREACHED*/
1533*a98e9dbfSaguzovsk 	}
1534*a98e9dbfSaguzovsk 
1535*a98e9dbfSaguzovsk 	seg_pasync_thr = curthread;
15367c478bd9Sstevel@tonic-gate 
1537*a98e9dbfSaguzovsk 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538*a98e9dbfSaguzovsk 	    callb_generic_cpr, "seg_pasync");
15397c478bd9Sstevel@tonic-gate 
1540*a98e9dbfSaguzovsk 	if (segpcache_reap_ticks <= 0) {
1541*a98e9dbfSaguzovsk 		segpcache_reap_ticks = segpcache_reap_sec * hz;
15427c478bd9Sstevel@tonic-gate 	}
15437c478bd9Sstevel@tonic-gate 
1544*a98e9dbfSaguzovsk 	mutex_enter(&seg_pasync_mtx);
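	/*
	 * Sleep until either the reap interval expires or seg_preap()
	 * signals that memory is low, then run an asynchronous purge pass
	 * unless pcache has been disabled.
	 */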
15457c478bd9Sstevel@tonic-gate 	for (;;) {
15467c478bd9Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547*a98e9dbfSaguzovsk 		(void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548*a98e9dbfSaguzovsk 		    lbolt + segpcache_reap_ticks);
1549*a98e9dbfSaguzovsk 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550*a98e9dbfSaguzovsk 		if (seg_pdisabled == 0) {
1551*a98e9dbfSaguzovsk 			seg_ppurge_async(0);
1552*a98e9dbfSaguzovsk 		}
15537c478bd9Sstevel@tonic-gate 	}
15547c478bd9Sstevel@tonic-gate }
15557c478bd9Sstevel@tonic-gate 
15567c478bd9Sstevel@tonic-gate static struct kmem_cache *seg_cache;
15577c478bd9Sstevel@tonic-gate 
15587c478bd9Sstevel@tonic-gate /*
15597c478bd9Sstevel@tonic-gate  * Initialize segment management data structures.
15607c478bd9Sstevel@tonic-gate  */
15617c478bd9Sstevel@tonic-gate void
15627c478bd9Sstevel@tonic-gate seg_init(void)
15637c478bd9Sstevel@tonic-gate {
15647c478bd9Sstevel@tonic-gate 	kstat_t *ksp;
15657c478bd9Sstevel@tonic-gate 
1566*a98e9dbfSaguzovsk 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567*a98e9dbfSaguzovsk 	    0, NULL, NULL, NULL, NULL, NULL, 0);
15687c478bd9Sstevel@tonic-gate 
15697c478bd9Sstevel@tonic-gate 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1570c6f08383Sjj 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
15717c478bd9Sstevel@tonic-gate 	if (ksp) {
15727c478bd9Sstevel@tonic-gate 		ksp->ks_data = (void *)segadvstat_ptr;
15737c478bd9Sstevel@tonic-gate 		kstat_install(ksp);
15747c478bd9Sstevel@tonic-gate 	}
15757c478bd9Sstevel@tonic-gate 
15767c478bd9Sstevel@tonic-gate 	seg_pinit();
15777c478bd9Sstevel@tonic-gate }
15787c478bd9Sstevel@tonic-gate 
15797c478bd9Sstevel@tonic-gate /*
15807c478bd9Sstevel@tonic-gate  * Allocate a segment to cover [base, base+size]
15817c478bd9Sstevel@tonic-gate  * and attach it to the specified address space.
15827c478bd9Sstevel@tonic-gate  */
15837c478bd9Sstevel@tonic-gate struct seg *
15847c478bd9Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size)
15857c478bd9Sstevel@tonic-gate {
15867c478bd9Sstevel@tonic-gate 	struct seg *new;
15877c478bd9Sstevel@tonic-gate 	caddr_t segbase;
15887c478bd9Sstevel@tonic-gate 	size_t segsize;
15897c478bd9Sstevel@tonic-gate 
15907c478bd9Sstevel@tonic-gate 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
15917c478bd9Sstevel@tonic-gate 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
15927c478bd9Sstevel@tonic-gate 	    (uintptr_t)segbase;
15937c478bd9Sstevel@tonic-gate 
15947c478bd9Sstevel@tonic-gate 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
15957c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
15967c478bd9Sstevel@tonic-gate 
15977c478bd9Sstevel@tonic-gate 	if (as != &kas &&
15987c478bd9Sstevel@tonic-gate 	    valid_usr_range(segbase, segsize, 0, as,
15997c478bd9Sstevel@tonic-gate 	    as->a_userlimit) != RANGE_OKAY)
16007c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
16017c478bd9Sstevel@tonic-gate 
16027c478bd9Sstevel@tonic-gate 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
16037c478bd9Sstevel@tonic-gate 	new->s_ops = NULL;
16047c478bd9Sstevel@tonic-gate 	new->s_data = NULL;
16057c478bd9Sstevel@tonic-gate 	new->s_szc = 0;
16067c478bd9Sstevel@tonic-gate 	new->s_flags = 0;
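	/*
	 * Initialize the per-segment pagelock cache list to empty (the head
	 * points to itself); it is protected by the new segment's s_pmtx.
	 */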
1607*a98e9dbfSaguzovsk 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608*a98e9dbfSaguzovsk 	new->s_phead.p_lnext = &new->s_phead;
1609*a98e9dbfSaguzovsk 	new->s_phead.p_lprev = &new->s_phead;
16107c478bd9Sstevel@tonic-gate 	if (seg_attach(as, segbase, segsize, new) < 0) {
16117c478bd9Sstevel@tonic-gate 		kmem_cache_free(seg_cache, new);
16127c478bd9Sstevel@tonic-gate 		return ((struct seg *)NULL);
16137c478bd9Sstevel@tonic-gate 	}
16147c478bd9Sstevel@tonic-gate 	/* caller must fill in ops, data */
16157c478bd9Sstevel@tonic-gate 	return (new);
16167c478bd9Sstevel@tonic-gate }
16177c478bd9Sstevel@tonic-gate 
16187c478bd9Sstevel@tonic-gate /*
16197c478bd9Sstevel@tonic-gate  * Attach a segment to the address space.  Used by seg_alloc()
16207c478bd9Sstevel@tonic-gate  * and for kernel startup to attach to static segments.
16217c478bd9Sstevel@tonic-gate  */
16227c478bd9Sstevel@tonic-gate int
16237c478bd9Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
16247c478bd9Sstevel@tonic-gate {
16257c478bd9Sstevel@tonic-gate 	seg->s_as = as;
16267c478bd9Sstevel@tonic-gate 	seg->s_base = base;
16277c478bd9Sstevel@tonic-gate 	seg->s_size = size;
16287c478bd9Sstevel@tonic-gate 
16297c478bd9Sstevel@tonic-gate 	/*
16307c478bd9Sstevel@tonic-gate 	 * as_addseg() will add the segment at the appropriate point
16317c478bd9Sstevel@tonic-gate 	 * in the list. It will return -1 if there is overlap with
16327c478bd9Sstevel@tonic-gate 	 * an already existing segment.
16337c478bd9Sstevel@tonic-gate 	 */
16347c478bd9Sstevel@tonic-gate 	return (as_addseg(as, seg));
16357c478bd9Sstevel@tonic-gate }
16367c478bd9Sstevel@tonic-gate 
16377c478bd9Sstevel@tonic-gate /*
16387c478bd9Sstevel@tonic-gate  * Unmap a segment and free it from its associated address space.
16397c478bd9Sstevel@tonic-gate  * This should be called by anybody who's finished with a whole segment's
16407c478bd9Sstevel@tonic-gate  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
16417c478bd9Sstevel@tonic-gate  * responsibility of the segment driver to unlink the segment
16427c478bd9Sstevel@tonic-gate  * from the address space, and to free public and private data structures
16437c478bd9Sstevel@tonic-gate  * associated with the segment.  (This is typically done by a call to
16447c478bd9Sstevel@tonic-gate  * seg_free()).
16457c478bd9Sstevel@tonic-gate  */
16467c478bd9Sstevel@tonic-gate void
16477c478bd9Sstevel@tonic-gate seg_unmap(struct seg *seg)
16487c478bd9Sstevel@tonic-gate {
16497c478bd9Sstevel@tonic-gate #ifdef DEBUG
16507c478bd9Sstevel@tonic-gate 	int ret;
16517c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16527c478bd9Sstevel@tonic-gate 
16537c478bd9Sstevel@tonic-gate 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
16547c478bd9Sstevel@tonic-gate 
16557c478bd9Sstevel@tonic-gate 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
16567c478bd9Sstevel@tonic-gate 	ASSERT(seg->s_data != NULL);
16577c478bd9Sstevel@tonic-gate 
16587c478bd9Sstevel@tonic-gate 	/* Unmap the whole mapping */
16597c478bd9Sstevel@tonic-gate #ifdef DEBUG
16607c478bd9Sstevel@tonic-gate 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16617c478bd9Sstevel@tonic-gate 	ASSERT(ret == 0);
16627c478bd9Sstevel@tonic-gate #else
16637c478bd9Sstevel@tonic-gate 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16647c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16657c478bd9Sstevel@tonic-gate }
16667c478bd9Sstevel@tonic-gate 
16677c478bd9Sstevel@tonic-gate /*
16687c478bd9Sstevel@tonic-gate  * Free the segment from its associated as. This should only be called
16697c478bd9Sstevel@tonic-gate  * if a mapping to the segment has not yet been established (e.g., if
16707c478bd9Sstevel@tonic-gate  * an error occurs in the middle of doing an as_map when the segment
16717c478bd9Sstevel@tonic-gate  * has already been partially set up) or if it has already been deleted
16727c478bd9Sstevel@tonic-gate  * (e.g., from a segment driver unmap routine if the unmap applies to the
16737c478bd9Sstevel@tonic-gate  * entire segment). If the mapping is currently set up then seg_unmap() should
16747c478bd9Sstevel@tonic-gate  * be called instead.
16757c478bd9Sstevel@tonic-gate  */
16767c478bd9Sstevel@tonic-gate void
16777c478bd9Sstevel@tonic-gate seg_free(struct seg *seg)
16787c478bd9Sstevel@tonic-gate {
16797c478bd9Sstevel@tonic-gate 	register struct as *as = seg->s_as;
16807c478bd9Sstevel@tonic-gate 	struct seg *tseg = as_removeseg(as, seg);
16817c478bd9Sstevel@tonic-gate 
16827c478bd9Sstevel@tonic-gate 	ASSERT(tseg == seg);
16837c478bd9Sstevel@tonic-gate 
16847c478bd9Sstevel@tonic-gate 	/*
16857c478bd9Sstevel@tonic-gate 	 * If the segment private data field is NULL,
16867c478bd9Sstevel@tonic-gate 	 * then the segment driver is not attached yet.
16877c478bd9Sstevel@tonic-gate 	 */
16887c478bd9Sstevel@tonic-gate 	if (seg->s_data != NULL)
16897c478bd9Sstevel@tonic-gate 		SEGOP_FREE(seg);
16907c478bd9Sstevel@tonic-gate 
1691*a98e9dbfSaguzovsk 	mutex_destroy(&seg->s_pmtx);
1692*a98e9dbfSaguzovsk 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1693*a98e9dbfSaguzovsk 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
16947c478bd9Sstevel@tonic-gate 	kmem_cache_free(seg_cache, seg);
16957c478bd9Sstevel@tonic-gate }
16967c478bd9Sstevel@tonic-gate 
16977c478bd9Sstevel@tonic-gate /*ARGSUSED*/
16987c478bd9Sstevel@tonic-gate static void
16997c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add(
17007c478bd9Sstevel@tonic-gate 	void *arg,
17017c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages)
17027c478bd9Sstevel@tonic-gate {
17037c478bd9Sstevel@tonic-gate 	/* Nothing to do. */
17047c478bd9Sstevel@tonic-gate }
17057c478bd9Sstevel@tonic-gate 
1706cee1d74bSjfrank void
1707cee1d74bSjfrank seg_p_enable(void)
1708cee1d74bSjfrank {
1709*a98e9dbfSaguzovsk 	mutex_enter(&seg_pcache_mtx);
1710*a98e9dbfSaguzovsk 	ASSERT(seg_pdisabled != 0);
1711*a98e9dbfSaguzovsk 	seg_pdisabled--;
1712*a98e9dbfSaguzovsk 	mutex_exit(&seg_pcache_mtx);
1713cee1d74bSjfrank }
1714cee1d74bSjfrank 
17157c478bd9Sstevel@tonic-gate /*
1716cee1d74bSjfrank  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1717cee1d74bSjfrank  * cache.
1718cee1d74bSjfrank  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719cee1d74bSjfrank  * SEGP_FAIL if the cache could not be emptied.
17207c478bd9Sstevel@tonic-gate  */
1721cee1d74bSjfrank int
1722cee1d74bSjfrank seg_p_disable(void)
17237c478bd9Sstevel@tonic-gate {
17247c478bd9Sstevel@tonic-gate 	pgcnt_t	old_plocked;
17257c478bd9Sstevel@tonic-gate 	int stall_count = 0;
17267c478bd9Sstevel@tonic-gate 
1727*a98e9dbfSaguzovsk 	mutex_enter(&seg_pcache_mtx);
1728*a98e9dbfSaguzovsk 	seg_pdisabled++;
1729*a98e9dbfSaguzovsk 	ASSERT(seg_pdisabled != 0);
1730*a98e9dbfSaguzovsk 	mutex_exit(&seg_pcache_mtx);
17317c478bd9Sstevel@tonic-gate 
17327c478bd9Sstevel@tonic-gate 	/*
17337c478bd9Sstevel@tonic-gate 	 * Attempt to empty the cache. Terminate if seg_plocked does not
17347c478bd9Sstevel@tonic-gate 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
17357c478bd9Sstevel@tonic-gate 	 */
17367c478bd9Sstevel@tonic-gate 	while (seg_plocked != 0) {
1737*a98e9dbfSaguzovsk 		ASSERT(seg_phashsize_win != 0);
17387c478bd9Sstevel@tonic-gate 		old_plocked = seg_plocked;
1739*a98e9dbfSaguzovsk 		seg_ppurge_async(1);
17407c478bd9Sstevel@tonic-gate 		if (seg_plocked == old_plocked) {
17417c478bd9Sstevel@tonic-gate 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1742cee1d74bSjfrank 				return (SEGP_FAIL);
17437c478bd9Sstevel@tonic-gate 			}
17447c478bd9Sstevel@tonic-gate 		} else
17457c478bd9Sstevel@tonic-gate 			stall_count = 0;
17467c478bd9Sstevel@tonic-gate 		if (seg_plocked != 0)
17477c478bd9Sstevel@tonic-gate 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
17487c478bd9Sstevel@tonic-gate 	}
1749cee1d74bSjfrank 	return (SEGP_SUCCESS);
1750cee1d74bSjfrank }
1751cee1d74bSjfrank 
1752cee1d74bSjfrank /*
1753cee1d74bSjfrank  * Attempt to purge seg_pcache.  May need to return before this has
1754cee1d74bSjfrank  * completed to allow other pre_del callbacks to unlock pages. This is
1755cee1d74bSjfrank  * ok because:
1756*a98e9dbfSaguzovsk  *	1) The seg_pdisabled flag has been set so at least we won't
1757cee1d74bSjfrank  *	cache any more locks and the locks we couldn't purge
1758cee1d74bSjfrank  *	will not be held if they do get released by a subsequent
1759cee1d74bSjfrank  *	pre-delete callback.
1760cee1d74bSjfrank  *
1761cee1d74bSjfrank  *	2) The rest of the memory delete thread processing does not
1762cee1d74bSjfrank  *	depend on the changes made in this pre-delete callback. No
1763cee1d74bSjfrank  *	panics will result; the worst that will happen is that the
1764cee1d74bSjfrank  *	DR code will time out and cancel the delete.
1765cee1d74bSjfrank  */
1766cee1d74bSjfrank /*ARGSUSED*/
1767cee1d74bSjfrank static int
1768cee1d74bSjfrank seg_p_mem_config_pre_del(
1769cee1d74bSjfrank 	void *arg,
1770cee1d74bSjfrank 	pgcnt_t delta_pages)
1771cee1d74bSjfrank {
1772*a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1773*a98e9dbfSaguzovsk 		return (0);
1774*a98e9dbfSaguzovsk 	}
1775cee1d74bSjfrank 	if (seg_p_disable() != SEGP_SUCCESS)
1776cee1d74bSjfrank 		cmn_err(CE_NOTE,
1777cee1d74bSjfrank 		    "!Pre-delete couldn't purge pagelock cache - continuing");
17787c478bd9Sstevel@tonic-gate 	return (0);
17797c478bd9Sstevel@tonic-gate }
17807c478bd9Sstevel@tonic-gate 
17817c478bd9Sstevel@tonic-gate /*ARGSUSED*/
17827c478bd9Sstevel@tonic-gate static void
17837c478bd9Sstevel@tonic-gate seg_p_mem_config_post_del(
17847c478bd9Sstevel@tonic-gate 	void *arg,
17857c478bd9Sstevel@tonic-gate 	pgcnt_t delta_pages,
17867c478bd9Sstevel@tonic-gate 	int cancelled)
17877c478bd9Sstevel@tonic-gate {
1788*a98e9dbfSaguzovsk 	if (seg_phashsize_win == 0) {
1789*a98e9dbfSaguzovsk 		return;
1790*a98e9dbfSaguzovsk 	}
1791cee1d74bSjfrank 	seg_p_enable();
17927c478bd9Sstevel@tonic-gate }
17937c478bd9Sstevel@tonic-gate 
17947c478bd9Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = {
17957c478bd9Sstevel@tonic-gate 	KPHYSM_SETUP_VECTOR_VERSION,
17967c478bd9Sstevel@tonic-gate 	seg_p_mem_config_post_add,
17977c478bd9Sstevel@tonic-gate 	seg_p_mem_config_pre_del,
17987c478bd9Sstevel@tonic-gate 	seg_p_mem_config_post_del,
17997c478bd9Sstevel@tonic-gate };
18007c478bd9Sstevel@tonic-gate 
18017c478bd9Sstevel@tonic-gate static void
18027c478bd9Sstevel@tonic-gate seg_pinit_mem_config(void)
18037c478bd9Sstevel@tonic-gate {
18047c478bd9Sstevel@tonic-gate 	int ret;
18057c478bd9Sstevel@tonic-gate 
18067c478bd9Sstevel@tonic-gate 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
18077c478bd9Sstevel@tonic-gate 	/*
18087c478bd9Sstevel@tonic-gate 	 * Want to catch this in the debug kernel. At run time, if the
18097c478bd9Sstevel@tonic-gate 	 * callbacks don't get run, all will be OK as the disable just makes
18107c478bd9Sstevel@tonic-gate 	 * it more likely that the pages can be collected.
18117c478bd9Sstevel@tonic-gate 	 */
18127c478bd9Sstevel@tonic-gate 	ASSERT(ret == 0);
18137c478bd9Sstevel@tonic-gate }
18140209230bSgjelinek 
18150209230bSgjelinek /*
18160209230bSgjelinek  * Verify that segment is not a shared anonymous segment which reserves
18170209230bSgjelinek  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
18180209230bSgjelinek  * from one zone to another if any segments are shared.  This is because the
18190209230bSgjelinek  * last process to exit will credit the swap reservation.  This could lead
18200209230bSgjelinek  * to the swap being reserved by one zone, and credited to another.
18210209230bSgjelinek  */
18220209230bSgjelinek boolean_t
18230209230bSgjelinek seg_can_change_zones(struct seg *seg)
18240209230bSgjelinek {
18250209230bSgjelinek 	struct segvn_data *svd;
18260209230bSgjelinek 
18270209230bSgjelinek 	if (seg->s_ops == &segspt_shmops)
18280209230bSgjelinek 		return (B_FALSE);
18290209230bSgjelinek 
18300209230bSgjelinek 	if (seg->s_ops == &segvn_ops) {
18310209230bSgjelinek 		svd = (struct segvn_data *)seg->s_data;
18320209230bSgjelinek 		if (svd->type == MAP_SHARED &&
18330209230bSgjelinek 		    svd->amp != NULL &&
18340209230bSgjelinek 		    svd->amp->swresv > 0)
18350209230bSgjelinek 			return (B_FALSE);
18360209230bSgjelinek 	}
18370209230bSgjelinek 	return (B_TRUE);
18380209230bSgjelinek }
18390209230bSgjelinek 
18400209230bSgjelinek /*
18410209230bSgjelinek  * Return swap reserved by a segment backing a private mapping.
18420209230bSgjelinek  */
18430209230bSgjelinek size_t
18440209230bSgjelinek seg_swresv(struct seg *seg)
18450209230bSgjelinek {
18460209230bSgjelinek 	struct segvn_data *svd;
18470209230bSgjelinek 	size_t swap = 0;
18480209230bSgjelinek 
18490209230bSgjelinek 	if (seg->s_ops == &segvn_ops) {
18500209230bSgjelinek 		svd = (struct segvn_data *)seg->s_data;
18510209230bSgjelinek 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
18520209230bSgjelinek 			swap = svd->swresv;
18530209230bSgjelinek 	}
18540209230bSgjelinek 	return (swap);
18550209230bSgjelinek }
1856