1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 * Copyright (c) 2018, Joyent, Inc.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved  	*/
29
30/*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40/*
41 * VM - segment management.
42 */
43
44#include <sys/types.h>
45#include <sys/inttypes.h>
46#include <sys/t_lock.h>
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kmem.h>
50#include <sys/sysmacros.h>
51#include <sys/vmsystm.h>
52#include <sys/tuneable.h>
53#include <sys/debug.h>
54#include <sys/fs/swapnode.h>
55#include <sys/cmn_err.h>
56#include <sys/callb.h>
57#include <sys/mem_config.h>
58#include <sys/mman.h>
59
60#include <vm/hat.h>
61#include <vm/as.h>
62#include <vm/seg.h>
63#include <vm/seg_kmem.h>
64#include <vm/seg_spt.h>
65#include <vm/seg_vn.h>
66#include <vm/anon.h>
67
68/*
69 * kstats for segment advise
70 */
71segadvstat_t segadvstat = {
72	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
73	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
74};
75
76kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78
79/*
80 * entry in the segment page cache
81 */
82struct seg_pcache {
83	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
84	struct seg_pcache	*p_hprev;
85	pcache_link_t		p_plink;	/* per segment/amp list */
86	void 			*p_htag0;	/* segment/amp pointer */
87	caddr_t			p_addr;		/* base address/anon_idx */
88	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
90	struct page		**p_pp;		/* pp shadow list */
91	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
92	clock_t			p_lbolt;	/* lbolt from last use */
93	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
94	uint_t			p_active;	/* active count */
95	uchar_t			p_write;	/* true if S_WRITE */
96	uchar_t			p_ref;		/* reference byte */
97	ushort_t		p_flags;	/* bit flags */
98};
99
100struct seg_phash {
101	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
102	struct seg_pcache	*p_hprev;
103	kmutex_t		p_hmutex;	/* protects hash bucket */
104	pcache_link_t		p_halink[2];	/* active bucket linkages */
105};
106
107struct seg_phash_wired {
108	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
109	struct seg_pcache	*p_hprev;
110	kmutex_t		p_hmutex;	/* protects hash bucket */
111};
112
113/*
114 * A parameter to control a maximum number of bytes that can be
115 * purged from pcache at a time.
116 */
117#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)
118
119/*
120 * log2(fraction of pcache to reclaim at a time).
121 */
122#define	P_SHRINK_SHFT		(5)
123
124/*
125 * The following variables can be tuned via /etc/system.
126 */
127
128int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
129pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
130ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
131ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
132int	segpcache_reap_sec = 1;		/* reap check rate in secs */
133clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
134int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
135clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
136int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
137pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
138
139static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
140static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
141static kcondvar_t seg_pasync_cv;
142
143#pragma align 64(pctrl1)
144#pragma align 64(pctrl2)
145#pragma align 64(pctrl3)
146
147/*
148 * Keep frequently used variables together in one cache line.
149 */
150static struct p_ctrl1 {
151	uint_t p_disabled;		/* if not 0, caching temporarily off */
152	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
153	size_t p_hashwin_sz;		/* # of non wired buckets */
154	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
155	size_t p_hashwired_sz;		/* # of wired buckets */
156	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
158#ifdef _LP64
159	ulong_t pad[1];
160#endif /* _LP64 */
161} pctrl1;
162
163static struct p_ctrl2 {
164	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
165	pgcnt_t  p_locked_win;	/* # pages from window */
166	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
167	uchar_t	 p_ahcur;	/* current active links for insert/delete */
168	uchar_t  p_athr_on;	/* async reclaim thread is running. */
169	pcache_link_t p_ahhead[2]; /* active buckets linkages */
170} pctrl2;
171
172static struct p_ctrl3 {
173	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
174	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
175	ulong_t p_athr_full_ahb;	/* athread walk stats */
176	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
177	int	p_shrink_shft;		/* reap shift factor */
178#ifdef _LP64
179	ulong_t pad[3];
180#endif /* _LP64 */
181} pctrl3;
182
183#define	seg_pdisabled			pctrl1.p_disabled
184#define	seg_pmaxwindow			pctrl1.p_maxwin
185#define	seg_phashsize_win		pctrl1.p_hashwin_sz
186#define	seg_phashtab_win		pctrl1.p_htabwin
187#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
188#define	seg_phashtab_wired		pctrl1.p_htabwired
189#define	seg_pkmcache			pctrl1.p_kmcache
190#define	seg_pmem_mtx			pctrl2.p_mem_mtx
191#define	seg_plocked_window		pctrl2.p_locked_win
192#define	seg_plocked			pctrl2.p_locked
193#define	seg_pahcur			pctrl2.p_ahcur
194#define	seg_pathr_on			pctrl2.p_athr_on
195#define	seg_pahhead			pctrl2.p_ahhead
196#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
197#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
198#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
199#define	seg_pshrink_shift		pctrl3.p_shrink_shft
200#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages
201
202#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
203#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
204#define	P_BASESHIFT			(6)
205
206kthread_t *seg_pasync_thr;
207
208extern struct seg_ops segvn_ops;
209extern struct seg_ops segspt_shmops;
210
211#define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
212#define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
213
214#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))
215
216#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)
217
218/*
219 * htag0 argument can be a seg or amp pointer.
220 */
221#define	P_HASHBP(seg, htag0, addr, flags)				\
222	(IS_PFLAGS_WIRED((flags)) ?					\
223	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
224	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
225	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
226	    (((uintptr_t)(htag0) >> 3) ^				\
227	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
228	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))
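
/*
 * Illustrative note (not relied upon by the code): for a non wired entry with
 * the default page size (s_szc == 0, where page_get_shift() is PAGESHIFT) the
 * bucket index above reduces to
 *
 *	(((uintptr_t)htag0 >> 3) ^ ((uintptr_t)addr >> PAGESHIFT)) &
 *	    P_HASHWIN_MASK
 *
 * i.e. the tag and the page aligned address are folded together so that
 * entries from the same segment/amp spread across buckets.  Wired entries
 * ignore the address and hash on the tag alone.
 */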
229
230/*
231 * htag0 argument can be a seg or amp pointer.
232 */
233#define	P_MATCH(pcp, htag0, addr, len)					\
234	((pcp)->p_htag0 == (htag0) &&					\
235	(pcp)->p_addr == (addr) &&					\
236	(pcp)->p_len >= (len))
237
238#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
239	((pcp)->p_pp == (pp) &&						\
240	(pcp)->p_htag0 == (htag0) &&					\
241	(pcp)->p_addr == (addr) &&					\
242	(pcp)->p_len >= (len))
243
244#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
245    offsetof(struct seg_pcache, p_plink)))
246
247#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
248    offsetof(struct seg_phash, p_halink[l])))
249
250/*
251 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252 * active hash bucket lists. We maintain active bucket lists to reduce the
253 * overhead of finding active buckets during asynchronous purging since there
254 * can be 10s of millions of buckets on a large system but only a small subset
255 * of them in actual use.
256 *
 * There are two active bucket lists. The current active list (as selected by
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
 * with its list it switches to the current active list and makes the list it
 * just finished processing the new current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 * not yet on any list.  seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on the current list it is always removed. Otherwise
 * the bucket is only removed if the asynchronous purge thread is not
 * currently running or seg_premove_abuck() is called by the asynchronous
 * purge thread itself. A given bucket can only be on one of the active lists
 * at a time. Both routines must be called with the per bucket lock held; they
 * use seg_pmem_mtx to protect the list updates. seg_padd_abuck() must be
 * called after the first entry is added to the bucket chain and
 * seg_premove_abuck() must be called after the last pcp entry is deleted from
 * its chain. Holding the per bucket lock avoids a potential race in which
 * seg_premove_abuck() removes a bucket after pcp entries have been added to
 * its chain, i.e. after the caller checked that the bucket has no entries
 * (such a race would lose an active bucket from the active lists).
279 *
280 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281 * New entries are added to the end of the list since LRU is used as the
282 * purging policy.
283 */
284static void
285seg_padd_abuck(struct seg_phash *hp)
286{
287	int lix;
288
289	ASSERT(MUTEX_HELD(&hp->p_hmutex));
290	ASSERT((struct seg_phash *)hp->p_hnext != hp);
291	ASSERT((struct seg_phash *)hp->p_hprev != hp);
292	ASSERT(hp->p_hnext == hp->p_hprev);
293	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296	ASSERT(hp >= seg_phashtab_win &&
297	    hp < &seg_phashtab_win[seg_phashsize_win]);
298
299	/*
300	 * This bucket can already be on one of active lists
301	 * since seg_premove_abuck() may have failed to remove it
302	 * before.
303	 */
304	mutex_enter(&seg_pmem_mtx);
305	lix = seg_pahcur;
306	ASSERT(lix >= 0 && lix <= 1);
307	if (hp->p_halink[lix].p_lnext != NULL) {
308		ASSERT(hp->p_halink[lix].p_lprev != NULL);
309		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311		mutex_exit(&seg_pmem_mtx);
312		return;
313	}
314	ASSERT(hp->p_halink[lix].p_lprev == NULL);
315
316	/*
	 * If this bucket is still on list !lix, the async thread can't yet
	 * remove it since we hold the per bucket lock here. In this case just
	 * return; the async thread will eventually find and process this
	 * bucket.
320	 */
321	if (hp->p_halink[!lix].p_lnext != NULL) {
322		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323		mutex_exit(&seg_pmem_mtx);
324		return;
325	}
326	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327	/*
328	 * This bucket is not on any active bucket list yet.
329	 * Add the bucket to the tail of current active list.
330	 */
331	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335	mutex_exit(&seg_pmem_mtx);
336}
337
338static void
339seg_premove_abuck(struct seg_phash *hp, int athr)
340{
341	int lix;
342
343	ASSERT(MUTEX_HELD(&hp->p_hmutex));
344	ASSERT((struct seg_phash *)hp->p_hnext == hp);
345	ASSERT((struct seg_phash *)hp->p_hprev == hp);
346	ASSERT(hp >= seg_phashtab_win &&
347	    hp < &seg_phashtab_win[seg_phashsize_win]);
348
349	if (athr) {
350		ASSERT(seg_pathr_on);
351		ASSERT(seg_pahcur <= 1);
352		/*
		 * We are called by the asynchronous thread that found this
		 * bucket on the not currently active (i.e. !seg_pahcur) list.
		 * Remove it from there.  The per bucket lock we are holding
		 * makes sure seg_pinsert() can't sneak in and add pcp entries
		 * to this bucket right before we remove the bucket from its
		 * list.
358		 */
359		lix = !seg_pahcur;
360		ASSERT(hp->p_halink[lix].p_lnext != NULL);
361		ASSERT(hp->p_halink[lix].p_lprev != NULL);
362		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366		hp->p_halink[lix].p_lnext = NULL;
367		hp->p_halink[lix].p_lprev = NULL;
368		return;
369	}
370
371	mutex_enter(&seg_pmem_mtx);
372	lix = seg_pahcur;
373	ASSERT(lix >= 0 && lix <= 1);
374
375	/*
376	 * If the bucket is on currently active list just remove it from
377	 * there.
378	 */
379	if (hp->p_halink[lix].p_lnext != NULL) {
380		ASSERT(hp->p_halink[lix].p_lprev != NULL);
381		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385		hp->p_halink[lix].p_lnext = NULL;
386		hp->p_halink[lix].p_lprev = NULL;
387		mutex_exit(&seg_pmem_mtx);
388		return;
389	}
390	ASSERT(hp->p_halink[lix].p_lprev == NULL);
391
392	/*
	 * If the asynchronous thread is not running we can remove the bucket
	 * from the not currently active list. The bucket must be on this list
	 * since we already checked that it's not on the other list, and the
	 * bucket from which we just deleted the last pcp entry must still be
	 * on one of the active bucket lists.
398	 */
399	lix = !lix;
400	ASSERT(hp->p_halink[lix].p_lnext != NULL);
401	ASSERT(hp->p_halink[lix].p_lprev != NULL);
402
403	if (!seg_pathr_on) {
404		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406		hp->p_halink[lix].p_lnext = NULL;
407		hp->p_halink[lix].p_lprev = NULL;
408	}
409	mutex_exit(&seg_pmem_mtx);
410}
411
412/*
 * Check if the bucket pointed to by hp already has a pcp entry that matches
 * the requested htag0, addr and len. Set *found to 1 if a match is found and
 * to 0 otherwise.  Also delete matching entries that cover a smaller address
 * range but start at the same address as the addr argument. Return the list
 * of deleted entries, if any. This is an internal helper function called
 * from seg_pinsert() only for non wired shadow lists. The caller already
 * holds the per seg/amp list lock.
420 */
421static struct seg_pcache *
422seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423    caddr_t addr, size_t len, int *found)
424{
425	struct seg_pcache *pcp;
426	struct seg_pcache *delcallb_list = NULL;
427
428	ASSERT(MUTEX_HELD(&hp->p_hmutex));
429
430	*found = 0;
431	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432	    pcp = pcp->p_hnext) {
433		ASSERT(pcp->p_hashp == hp);
434		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435			ASSERT(!IS_PCP_WIRED(pcp));
436			if (pcp->p_len < len) {
437				pcache_link_t *plinkp;
438				if (pcp->p_active) {
439					continue;
440				}
441				plinkp = &pcp->p_plink;
442				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444				pcp->p_hprev->p_hnext = pcp->p_hnext;
445				pcp->p_hnext->p_hprev = pcp->p_hprev;
446				pcp->p_hprev = delcallb_list;
447				delcallb_list = pcp;
448			} else {
449				*found = 1;
450				break;
451			}
452		}
453	}
454	return (delcallb_list);
455}
456
457/*
 * Look up an address range in the pagelock cache. Return the shadow list and
 * bump up the active count. If amp is not NULL use amp as the lookup tag,
 * otherwise use seg as the lookup tag.
461 */
462struct page **
463seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464    enum seg_rw rw, uint_t flags)
465{
466	struct seg_pcache *pcp;
467	struct seg_phash *hp;
468	void *htag0;
469
470	ASSERT(seg != NULL);
471	ASSERT(rw == S_READ || rw == S_WRITE);
472
473	/*
	 * Skip the pagelock cache while DR is in progress or
	 * seg_pcache is off.
476	 */
477	if (seg_pdisabled) {
478		return (NULL);
479	}
480	ASSERT(seg_phashsize_win != 0);
481
482	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483	hp = P_HASHBP(seg, htag0, addr, flags);
484	mutex_enter(&hp->p_hmutex);
485	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486	    pcp = pcp->p_hnext) {
487		ASSERT(pcp->p_hashp == hp);
488		if (P_MATCH(pcp, htag0, addr, len)) {
489			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490			/*
			 * If this request wants to write pages
			 * but the write permissions starting from
			 * addr don't cover the entire length len,
			 * return lookup failure to the caller.
			 * It will check protections and fail this
			 * pagelock operation with an EACCES error.
497			 */
498			if (rw == S_WRITE && pcp->p_wlen < len) {
499				break;
500			}
501			if (pcp->p_active == UINT_MAX) {
502				break;
503			}
504			pcp->p_active++;
505			if (rw == S_WRITE && !pcp->p_write) {
506				pcp->p_write = 1;
507			}
508			mutex_exit(&hp->p_hmutex);
509			return (pcp->p_pp);
510		}
511	}
512	mutex_exit(&hp->p_hmutex);
513	return (NULL);
514}
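
/*
 * Usage sketch (illustrative only; the caller-side variable names are
 * hypothetical, but the calls match the interfaces defined in this file).
 * A segment driver's pagelock entry point typically tries the cache first
 * and only builds a new shadow list on a miss:
 *
 *	struct page **pplist;
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, flags);
 *	if (pplist != NULL) {
 *		*ppp = pplist;	(cache hit, p_active was bumped)
 *		return (0);
 *	}
 *	(otherwise lock the pages, build a shadow list and seg_pinsert() it)
 *
 * Every successful lookup must later be balanced by a seg_pinactive() call
 * with the same tag, address, length and shadow list so that p_active drops
 * back and the entry becomes reclaimable.
 */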
515
516/*
 * Mark an address range inactive. If the cache is off, the address range is
 * not in the cache, or another shadow list that covers a bigger range is
 * found, we call the segment driver to reclaim the pages. Otherwise just
 * decrement the active count and set the ref bit.  If amp is not NULL use
 * amp as the lookup tag, otherwise use seg as the lookup tag.
522 */
523void
524seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526    seg_preclaim_cbfunc_t callback)
527{
528	struct seg_pcache *pcp;
529	struct seg_phash *hp;
530	kmutex_t *pmtx = NULL;
531	pcache_link_t *pheadp;
532	void *htag0;
533	pgcnt_t npages = 0;
534	int keep = 0;
535
536	ASSERT(seg != NULL);
537	ASSERT(rw == S_READ || rw == S_WRITE);
538
539	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540
541	/*
542	 * Skip lookup if pcache is not configured.
543	 */
544	if (seg_phashsize_win == 0) {
545		goto out;
546	}
547
548	/*
549	 * Grab per seg/amp lock before hash lock if we are going to remove
550	 * inactive entry from pcache.
551	 */
552	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553		if (amp == NULL) {
554			pheadp = &seg->s_phead;
555			pmtx = &seg->s_pmtx;
556		} else {
557			pheadp = &amp->a_phead;
558			pmtx = &amp->a_pmtx;
559		}
560		mutex_enter(pmtx);
561	}
562
563	hp = P_HASHBP(seg, htag0, addr, flags);
564	mutex_enter(&hp->p_hmutex);
565again:
566	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567	    pcp = pcp->p_hnext) {
568		ASSERT(pcp->p_hashp == hp);
569		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571			ASSERT(pcp->p_active);
572			if (keep) {
573				/*
574				 * Don't remove this pcp entry
575				 * if we didn't find duplicate
576				 * shadow lists on second search.
577				 * Somebody removed those duplicates
578				 * since we dropped hash lock after first
579				 * search.
580				 */
581				ASSERT(pmtx != NULL);
582				ASSERT(!IS_PFLAGS_WIRED(flags));
583				mutex_exit(pmtx);
584				pmtx = NULL;
585			}
586			pcp->p_active--;
587			if (pcp->p_active == 0 && (pmtx != NULL ||
588			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589
590				/*
591				 * This entry is no longer active.  Remove it
592				 * now either because pcaching is temporarily
				 * disabled or there are other pcp entries that
594				 * can match this pagelock request (i.e. this
595				 * entry is a duplicate).
596				 */
597
598				ASSERT(callback == pcp->p_callback);
599				if (pmtx != NULL) {
600					pcache_link_t *plinkp = &pcp->p_plink;
601					ASSERT(!IS_PCP_WIRED(pcp));
602					ASSERT(pheadp->p_lnext != pheadp);
603					ASSERT(pheadp->p_lprev != pheadp);
604					plinkp->p_lprev->p_lnext =
605					    plinkp->p_lnext;
606					plinkp->p_lnext->p_lprev =
607					    plinkp->p_lprev;
608				}
609				pcp->p_hprev->p_hnext = pcp->p_hnext;
610				pcp->p_hnext->p_hprev = pcp->p_hprev;
611				if (!IS_PCP_WIRED(pcp) &&
612				    hp->p_hnext == (struct seg_pcache *)hp) {
613					/*
614					 * We removed the last entry from this
615					 * bucket.  Now remove the bucket from
616					 * its active list.
617					 */
618					seg_premove_abuck(hp, 0);
619				}
620				mutex_exit(&hp->p_hmutex);
621				if (pmtx != NULL) {
622					mutex_exit(pmtx);
623				}
624				len = pcp->p_len;
625				npages = btop(len);
626				if (rw != S_WRITE && pcp->p_write) {
627					rw = S_WRITE;
628				}
629				kmem_cache_free(seg_pkmcache, pcp);
630				goto out;
631			} else {
632				/*
633				 * We found a matching pcp entry but will not
634				 * free it right away even if it's no longer
635				 * active.
636				 */
637				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638					/*
639					 * Set the reference bit and mark the
640					 * time of last access to this pcp
641					 * so that asynchronous thread doesn't
642					 * free it immediately since
643					 * it may be reactivated very soon.
644					 */
645					pcp->p_lbolt = ddi_get_lbolt();
646					pcp->p_ref = 1;
647				}
648				mutex_exit(&hp->p_hmutex);
649				if (pmtx != NULL) {
650					mutex_exit(pmtx);
651				}
652				return;
653			}
654		} else if (!IS_PFLAGS_WIRED(flags) &&
655		    P_MATCH(pcp, htag0, addr, len)) {
656			/*
657			 * This is a duplicate pcp entry.  This situation may
658			 * happen if a bigger shadow list that covers our
659			 * range was added while our entry was still active.
660			 * Now we can free our pcp entry if it becomes
661			 * inactive.
662			 */
663			if (!pcp->p_active) {
664				/*
665				 * Mark this entry as referenced just in case
666				 * we'll free our own pcp entry soon.
667				 */
668				pcp->p_lbolt = ddi_get_lbolt();
669				pcp->p_ref = 1;
670			}
671			if (pmtx != NULL) {
672				/*
673				 * we are already holding pmtx and found a
674				 * duplicate.  Don't keep our own pcp entry.
675				 */
676				keep = 0;
677				continue;
678			}
679			/*
			 * We have to use mutex_tryenter to attempt to take the
			 * seg/amp list lock since we already hold the hash
			 * lock and the seg/amp list lock is above the hash
			 * lock in the lock order.  If mutex_tryenter fails,
			 * drop the hash lock, retake both locks in the correct
			 * order and re-search this hash chain.
686			 */
687			ASSERT(keep == 0);
688			if (amp == NULL) {
689				pheadp = &seg->s_phead;
690				pmtx = &seg->s_pmtx;
691			} else {
692				pheadp = &amp->a_phead;
693				pmtx = &amp->a_pmtx;
694			}
695			if (!mutex_tryenter(pmtx)) {
696				mutex_exit(&hp->p_hmutex);
697				mutex_enter(pmtx);
698				mutex_enter(&hp->p_hmutex);
699				/*
				 * If we don't find a bigger shadow list on the
				 * second search (which may happen since we
				 * dropped the bucket lock), keep the entry
				 * that matches our own shadow list.
704				 */
705				keep = 1;
706				goto again;
707			}
708		}
709	}
710	mutex_exit(&hp->p_hmutex);
711	if (pmtx != NULL) {
712		mutex_exit(pmtx);
713	}
714out:
715	(*callback)(htag0, addr, len, pp, rw, 0);
716	if (npages) {
717		mutex_enter(&seg_pmem_mtx);
718		ASSERT(seg_plocked >= npages);
719		seg_plocked -= npages;
720		if (!IS_PFLAGS_WIRED(flags)) {
721			ASSERT(seg_plocked_window >= npages);
722			seg_plocked_window -= npages;
723		}
724		mutex_exit(&seg_pmem_mtx);
725	}
726
727}
728
729#ifdef DEBUG
730static uint32_t p_insert_chk_mtbf = 0;
731#endif
732
733/*
 * seg_pinsert_check() is used by segment drivers to predict whether a call
 * to seg_pinsert() will fail and thereby avoid wasteful pre-processing.
736 */
737/*ARGSUSED*/
738int
739seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740    size_t len, uint_t flags)
741{
742	ASSERT(seg != NULL);
743
744#ifdef DEBUG
745	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746		return (SEGP_FAIL);
747	}
748#endif
749
750	if (seg_pdisabled) {
751		return (SEGP_FAIL);
752	}
753	ASSERT(seg_phashsize_win != 0);
754
755	if (IS_PFLAGS_WIRED(flags)) {
756		return (SEGP_SUCCESS);
757	}
758
759	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760		return (SEGP_FAIL);
761	}
762
763	if (freemem < desfree) {
764		return (SEGP_FAIL);
765	}
766
767	return (SEGP_SUCCESS);
768}
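
/*
 * Illustrative sketch (hypothetical caller): a driver calls the check before
 * doing the expensive work of locking pages and allocating a shadow list:
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, flags) == SEGP_FAIL) {
 *		(skip building a shadow list; use the uncached pagelock path)
 *	}
 *
 * SEGP_SUCCESS here is only a hint; seg_pinsert() may still fail because the
 * window check is repeated under seg_pmem_mtx at insert time.
 */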
769
770#ifdef DEBUG
771static uint32_t p_insert_mtbf = 0;
772#endif
773
774/*
775 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off,
 * caching is temporarily disabled, or the allowed 'window' is exceeded,
 * return SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779 *
780 * For non wired shadow lists (segvn case) include address in the hashing
781 * function to avoid linking all the entries from the same segment or amp on
782 * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
783 * pcache entries are also linked on a per segment/amp list so that all
784 * entries can be found quickly during seg/amp purge without walking the
785 * entire pcache hash table.  For wired shadow lists (segspt case) we
786 * don't use address hashing and per segment linking because the caller
787 * currently inserts only one entry per segment that covers the entire
788 * segment. If we used per segment linking even for segspt it would complicate
789 * seg_ppurge_wiredpp() locking.
790 *
 * Both the hash bucket and the per seg/amp locks need to be held before
 * adding a non wired entry to the hash and per seg/amp lists. The per
 * seg/amp lock should be taken first.
794 *
795 * This function will also remove from pcache old inactive shadow lists that
796 * overlap with this request but cover smaller range for the same start
797 * address.
798 */
799int
800seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802    seg_preclaim_cbfunc_t callback)
803{
804	struct seg_pcache *pcp;
805	struct seg_phash *hp;
806	pgcnt_t npages;
807	pcache_link_t *pheadp;
808	kmutex_t *pmtx;
809	struct seg_pcache *delcallb_list = NULL;
810
811	ASSERT(seg != NULL);
812	ASSERT(rw == S_READ || rw == S_WRITE);
813	ASSERT(rw == S_READ || wlen == len);
814	ASSERT(rw == S_WRITE || wlen <= len);
815	ASSERT(amp == NULL || wlen == len);
816
817#ifdef DEBUG
818	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819		return (SEGP_FAIL);
820	}
821#endif
822
823	if (seg_pdisabled) {
824		return (SEGP_FAIL);
825	}
826	ASSERT(seg_phashsize_win != 0);
827
828	ASSERT((len & PAGEOFFSET) == 0);
829	npages = btop(len);
830	mutex_enter(&seg_pmem_mtx);
831	if (!IS_PFLAGS_WIRED(flags)) {
832		if (seg_plocked_window + npages > seg_pmaxwindow) {
833			mutex_exit(&seg_pmem_mtx);
834			return (SEGP_FAIL);
835		}
836		seg_plocked_window += npages;
837	}
838	seg_plocked += npages;
839	mutex_exit(&seg_pmem_mtx);
840
841	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842	/*
843	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844	 */
845	if (amp == NULL) {
846		pcp->p_htag0 = (void *)seg;
847		pcp->p_flags = flags & 0xffff;
848	} else {
849		pcp->p_htag0 = (void *)amp;
850		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851	}
852	pcp->p_addr = addr;
853	pcp->p_len = len;
854	pcp->p_wlen = wlen;
855	pcp->p_pp = pp;
856	pcp->p_write = (rw == S_WRITE);
857	pcp->p_callback = callback;
858	pcp->p_active = 1;
859
860	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861	if (!IS_PFLAGS_WIRED(flags)) {
862		int found;
863		void *htag0;
864		if (amp == NULL) {
865			pheadp = &seg->s_phead;
866			pmtx = &seg->s_pmtx;
867			htag0 = (void *)seg;
868		} else {
869			pheadp = &amp->a_phead;
870			pmtx = &amp->a_pmtx;
871			htag0 = (void *)amp;
872		}
873		mutex_enter(pmtx);
874		mutex_enter(&hp->p_hmutex);
875		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876		    len, &found);
877		if (found) {
878			mutex_exit(&hp->p_hmutex);
879			mutex_exit(pmtx);
880			mutex_enter(&seg_pmem_mtx);
881			seg_plocked -= npages;
882			seg_plocked_window -= npages;
883			mutex_exit(&seg_pmem_mtx);
884			kmem_cache_free(seg_pkmcache, pcp);
885			goto out;
886		}
887		pcp->p_plink.p_lnext = pheadp->p_lnext;
888		pcp->p_plink.p_lprev = pheadp;
889		pheadp->p_lnext->p_lprev = &pcp->p_plink;
890		pheadp->p_lnext = &pcp->p_plink;
891	} else {
892		mutex_enter(&hp->p_hmutex);
893	}
894	pcp->p_hashp = hp;
895	pcp->p_hnext = hp->p_hnext;
896	pcp->p_hprev = (struct seg_pcache *)hp;
897	hp->p_hnext->p_hprev = pcp;
898	hp->p_hnext = pcp;
899	if (!IS_PFLAGS_WIRED(flags) &&
900	    hp->p_hprev == pcp) {
901		seg_padd_abuck(hp);
902	}
903	mutex_exit(&hp->p_hmutex);
904	if (!IS_PFLAGS_WIRED(flags)) {
905		mutex_exit(pmtx);
906	}
907
908out:
909	npages = 0;
910	while (delcallb_list != NULL) {
911		pcp = delcallb_list;
912		delcallb_list = pcp->p_hprev;
913		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916		npages += btop(pcp->p_len);
917		kmem_cache_free(seg_pkmcache, pcp);
918	}
919	if (npages) {
920		ASSERT(!IS_PFLAGS_WIRED(flags));
921		mutex_enter(&seg_pmem_mtx);
922		ASSERT(seg_plocked >= npages);
923		ASSERT(seg_plocked_window >= npages);
924		seg_plocked -= npages;
925		seg_plocked_window -= npages;
926		mutex_exit(&seg_pmem_mtx);
927	}
928
929	return (SEGP_SUCCESS);
930}
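
/*
 * Illustrative lifecycle sketch (the caller-side names are hypothetical):
 * the typical cached pagelock sequence built on the interfaces above is
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, flags) == SEGP_SUCCESS) {
 *		(lock the pages and build the pplist shadow list)
 *		if (seg_pinsert(seg, amp, addr, len, wlen, pplist, rw,
 *		    flags, reclaim_cb) == SEGP_FAIL) {
 *			(cache refused the entry; the caller keeps ownership
 *			of pplist and must release the pages itself)
 *		}
 *	}
 *
 * On success the entry is inserted with p_active set to 1, so the caller is
 * expected to hand the shadow list to its user and eventually call
 * seg_pinactive(), which either keeps the entry cached for reuse or invokes
 * the reclaim callback.
 */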
931
932/*
933 * purge entries from the pagelock cache if not active
934 * and not recently used.
935 */
936static void
937seg_ppurge_async(int force)
938{
939	struct seg_pcache *delcallb_list = NULL;
940	struct seg_pcache *pcp;
941	struct seg_phash *hp;
942	pgcnt_t npages = 0;
943	pgcnt_t npages_window = 0;
944	pgcnt_t	npgs_to_purge;
945	pgcnt_t npgs_purged = 0;
946	int hlinks = 0;
947	int hlix;
948	pcache_link_t *hlinkp;
949	pcache_link_t *hlnextp = NULL;
950	int lowmem;
951	int trim;
952
953	ASSERT(seg_phashsize_win != 0);
954
955	/*
956	 * if the cache is off or empty, return
957	 */
958	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
959		return;
960	}
961
962	if (!force) {
963		lowmem = 0;
964		trim = 0;
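		/*
		 * The thresholds below work out to: memory is considered low
		 * if adjusted freemem is at or below 1.25 * desfree, or at or
		 * below 7/8 of lotsfree while the cache holds at least half
		 * of the initial availrmem, or below lotsfree while the cache
		 * holds at least three quarters of the initial availrmem.
		 * Independently, the cache is trimmed once it reaches 7/8 of
		 * seg_pmaxwindow.
		 */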
965		if (freemem < lotsfree + needfree) {
966			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
967			if (fmem <= 5 * (desfree >> 2)) {
968				lowmem = 1;
969			} else if (fmem <= 7 * (lotsfree >> 3)) {
970				if (seg_plocked_window >=
971				    (availrmem_initial >> 1)) {
972					lowmem = 1;
973				}
974			} else if (fmem < lotsfree) {
975				if (seg_plocked_window >=
976				    3 * (availrmem_initial >> 2)) {
977					lowmem = 1;
978				}
979			}
980		}
981		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
982			trim = 1;
983		}
984		if (!lowmem && !trim) {
985			return;
986		}
987		npgs_to_purge = seg_plocked_window >>
988		    seg_pshrink_shift;
989		if (lowmem) {
990			npgs_to_purge = MIN(npgs_to_purge,
991			    MAX(seg_pmaxapurge_npages, desfree));
992		} else {
993			npgs_to_purge = MIN(npgs_to_purge,
994			    seg_pmaxapurge_npages);
995		}
996		if (npgs_to_purge == 0) {
997			return;
998		}
999	} else {
1000		struct seg_phash_wired *hpw;
1001
1002		ASSERT(seg_phashsize_wired != 0);
1003
1004		for (hpw = seg_phashtab_wired;
1005		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1006
1007			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1008				continue;
1009			}
1010
1011			mutex_enter(&hpw->p_hmutex);
1012
1013			for (pcp = hpw->p_hnext;
1014			    pcp != (struct seg_pcache *)hpw;
1015			    pcp = pcp->p_hnext) {
1016
1017				ASSERT(IS_PCP_WIRED(pcp));
1018				ASSERT(pcp->p_hashp ==
1019				    (struct seg_phash *)hpw);
1020
1021				if (pcp->p_active) {
1022					continue;
1023				}
1024				pcp->p_hprev->p_hnext = pcp->p_hnext;
1025				pcp->p_hnext->p_hprev = pcp->p_hprev;
1026				pcp->p_hprev = delcallb_list;
1027				delcallb_list = pcp;
1028			}
1029			mutex_exit(&hpw->p_hmutex);
1030		}
1031	}
1032
1033	mutex_enter(&seg_pmem_mtx);
1034	if (seg_pathr_on) {
1035		mutex_exit(&seg_pmem_mtx);
1036		goto runcb;
1037	}
1038	seg_pathr_on = 1;
1039	mutex_exit(&seg_pmem_mtx);
1040	ASSERT(seg_pahcur <= 1);
1041	hlix = !seg_pahcur;
1042
1043again:
1044	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1045	    hlinkp = hlnextp) {
1046
1047		hlnextp = hlinkp->p_lnext;
1048		ASSERT(hlnextp != NULL);
1049
1050		hp = hlink2phash(hlinkp, hlix);
1051		if (hp->p_hnext == (struct seg_pcache *)hp) {
1052			seg_pathr_empty_ahb++;
1053			continue;
1054		}
1055		seg_pathr_full_ahb++;
1056		mutex_enter(&hp->p_hmutex);
1057
1058		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1059		    pcp = pcp->p_hnext) {
1060			pcache_link_t *pheadp;
1061			pcache_link_t *plinkp;
1062			void *htag0;
1063			kmutex_t *pmtx;
1064
1065			ASSERT(!IS_PCP_WIRED(pcp));
1066			ASSERT(pcp->p_hashp == hp);
1067
1068			if (pcp->p_active) {
1069				continue;
1070			}
1071			if (!force && pcp->p_ref &&
1072			    PCP_AGE(pcp) < seg_pmax_pcpage) {
1073				pcp->p_ref = 0;
1074				continue;
1075			}
1076			plinkp = &pcp->p_plink;
1077			htag0 = pcp->p_htag0;
1078			if (pcp->p_flags & SEGP_AMP) {
1079				pheadp = &((amp_t *)htag0)->a_phead;
1080				pmtx = &((amp_t *)htag0)->a_pmtx;
1081			} else {
1082				pheadp = &((seg_t *)htag0)->s_phead;
1083				pmtx = &((seg_t *)htag0)->s_pmtx;
1084			}
1085			if (!mutex_tryenter(pmtx)) {
1086				continue;
1087			}
1088			ASSERT(pheadp->p_lnext != pheadp);
1089			ASSERT(pheadp->p_lprev != pheadp);
1090			plinkp->p_lprev->p_lnext =
1091			    plinkp->p_lnext;
1092			plinkp->p_lnext->p_lprev =
1093			    plinkp->p_lprev;
1094			pcp->p_hprev->p_hnext = pcp->p_hnext;
1095			pcp->p_hnext->p_hprev = pcp->p_hprev;
1096			mutex_exit(pmtx);
1097			pcp->p_hprev = delcallb_list;
1098			delcallb_list = pcp;
1099			npgs_purged += btop(pcp->p_len);
1100		}
1101		if (hp->p_hnext == (struct seg_pcache *)hp) {
1102			seg_premove_abuck(hp, 1);
1103		}
1104		mutex_exit(&hp->p_hmutex);
1105		if (npgs_purged >= seg_plocked_window) {
1106			break;
1107		}
1108		if (!force) {
1109			if (npgs_purged >= npgs_to_purge) {
1110				break;
1111			}
1112			if (!trim && !(seg_pathr_full_ahb & 15)) {
1113				ASSERT(lowmem);
1114				if (freemem >= lotsfree + needfree) {
1115					break;
1116				}
1117			}
1118		}
1119	}
1120
1121	if (hlinkp == &seg_pahhead[hlix]) {
1122		/*
1123		 * We processed the entire hlix active bucket list
1124		 * but didn't find enough pages to reclaim.
1125		 * Switch the lists and walk the other list
1126		 * if we haven't done it yet.
1127		 */
1128		mutex_enter(&seg_pmem_mtx);
1129		ASSERT(seg_pathr_on);
1130		ASSERT(seg_pahcur == !hlix);
1131		seg_pahcur = hlix;
1132		mutex_exit(&seg_pmem_mtx);
1133		if (++hlinks < 2) {
1134			hlix = !hlix;
1135			goto again;
1136		}
1137	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1138	    seg_pahhead[hlix].p_lnext != hlinkp) {
1139		ASSERT(hlinkp != NULL);
1140		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1141		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1142		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1143
1144		/*
1145		 * Reinsert the header to point to hlinkp
1146		 * so that we start from hlinkp bucket next time around.
1147		 */
1148		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1149		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1150		seg_pahhead[hlix].p_lnext = hlinkp;
1151		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1152		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1153		hlinkp->p_lprev = &seg_pahhead[hlix];
1154	}
1155
1156	mutex_enter(&seg_pmem_mtx);
1157	ASSERT(seg_pathr_on);
1158	seg_pathr_on = 0;
1159	mutex_exit(&seg_pmem_mtx);
1160
1161runcb:
1162	/*
	 * Run the delayed callback list. Segments/amps can't go away until
	 * the callback is executed since they must have a nonzero
	 * softlockcnt. That's why we don't need to hold as/seg/amp locks to
	 * execute the callback.
1166	 */
1167	while (delcallb_list != NULL) {
1168		pcp = delcallb_list;
1169		delcallb_list = pcp->p_hprev;
1170		ASSERT(!pcp->p_active);
1171		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1172		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1173		npages += btop(pcp->p_len);
1174		if (!IS_PCP_WIRED(pcp)) {
1175			npages_window += btop(pcp->p_len);
1176		}
1177		kmem_cache_free(seg_pkmcache, pcp);
1178	}
1179	if (npages) {
1180		mutex_enter(&seg_pmem_mtx);
1181		ASSERT(seg_plocked >= npages);
1182		ASSERT(seg_plocked_window >= npages_window);
1183		seg_plocked -= npages;
1184		seg_plocked_window -= npages_window;
1185		mutex_exit(&seg_pmem_mtx);
1186	}
1187}
1188
1189/*
 * Remove cached shadow list entries for one or more segments from the hash
 * table.  The segments are identified by the pp array. This is useful when
 * multiple segments are cached on behalf of a dummy segment (ISM/DISM) with
 * a common pp array.
1193 */
1194void
1195seg_ppurge_wiredpp(struct page **pp)
1196{
1197	struct seg_pcache *pcp;
1198	struct seg_phash_wired *hp;
1199	pgcnt_t npages = 0;
1200	struct	seg_pcache *delcallb_list = NULL;
1201
1202	/*
1203	 * if the cache is empty, return
1204	 */
1205	if (seg_plocked == 0) {
1206		return;
1207	}
1208	ASSERT(seg_phashsize_wired != 0);
1209
1210	for (hp = seg_phashtab_wired;
1211	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212		if (hp->p_hnext == (struct seg_pcache *)hp) {
1213			continue;
1214		}
1215		mutex_enter(&hp->p_hmutex);
1216		pcp = hp->p_hnext;
1217		while (pcp != (struct seg_pcache *)hp) {
1218			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219			ASSERT(IS_PCP_WIRED(pcp));
1220			/*
1221			 * purge entries which are not active
1222			 */
1223			if (!pcp->p_active && pcp->p_pp == pp) {
1224				ASSERT(pcp->p_htag0 != NULL);
1225				pcp->p_hprev->p_hnext = pcp->p_hnext;
1226				pcp->p_hnext->p_hprev = pcp->p_hprev;
1227				pcp->p_hprev = delcallb_list;
1228				delcallb_list = pcp;
1229			}
1230			pcp = pcp->p_hnext;
1231		}
1232		mutex_exit(&hp->p_hmutex);
1233		/*
		 * Segments can't go away until the callback is executed since
		 * they must have a nonzero softlockcnt. That's why we don't
		 * need to hold as/seg locks to execute the callback.
1237		 */
1238		while (delcallb_list != NULL) {
1239			int done;
1240			pcp = delcallb_list;
1241			delcallb_list = pcp->p_hprev;
1242			ASSERT(!pcp->p_active);
1243			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244			    pcp->p_len, pcp->p_pp,
1245			    pcp->p_write ? S_WRITE : S_READ, 1);
1246			npages += btop(pcp->p_len);
1247			ASSERT(IS_PCP_WIRED(pcp));
1248			kmem_cache_free(seg_pkmcache, pcp);
1249			if (done) {
1250				ASSERT(delcallb_list == NULL);
1251				goto out;
1252			}
1253		}
1254	}
1255
1256out:
1257	mutex_enter(&seg_pmem_mtx);
1258	ASSERT(seg_plocked >= npages);
1259	seg_plocked -= npages;
1260	mutex_exit(&seg_pmem_mtx);
1261}
1262
1263/*
 * Purge all entries for a given segment. Since we call back into the
 * segment driver directly for page reclaim, the caller needs to hold the
 * right locks.
1267 */
1268void
1269seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270{
1271	struct seg_pcache *delcallb_list = NULL;
1272	struct seg_pcache *pcp;
1273	struct seg_phash *hp;
1274	pgcnt_t npages = 0;
1275	void *htag0;
1276
1277	if (seg_plocked == 0) {
1278		return;
1279	}
1280	ASSERT(seg_phashsize_win != 0);
1281
1282	/*
1283	 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284	 * as a lookup tag.
1285	 */
1286	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287	ASSERT(htag0 != NULL);
1288	if (IS_PFLAGS_WIRED(flags)) {
1289		hp = P_HASHBP(seg, htag0, 0, flags);
1290		mutex_enter(&hp->p_hmutex);
1291		pcp = hp->p_hnext;
1292		while (pcp != (struct seg_pcache *)hp) {
1293			ASSERT(pcp->p_hashp == hp);
1294			ASSERT(IS_PCP_WIRED(pcp));
1295			if (pcp->p_htag0 == htag0) {
1296				if (pcp->p_active) {
1297					break;
1298				}
1299				pcp->p_hprev->p_hnext = pcp->p_hnext;
1300				pcp->p_hnext->p_hprev = pcp->p_hprev;
1301				pcp->p_hprev = delcallb_list;
1302				delcallb_list = pcp;
1303			}
1304			pcp = pcp->p_hnext;
1305		}
1306		mutex_exit(&hp->p_hmutex);
1307	} else {
1308		pcache_link_t *plinkp;
1309		pcache_link_t *pheadp;
1310		kmutex_t *pmtx;
1311
1312		if (amp == NULL) {
1313			ASSERT(seg != NULL);
1314			pheadp = &seg->s_phead;
1315			pmtx = &seg->s_pmtx;
1316		} else {
1317			pheadp = &amp->a_phead;
1318			pmtx = &amp->a_pmtx;
1319		}
1320		mutex_enter(pmtx);
1321		while ((plinkp = pheadp->p_lnext) != pheadp) {
1322			pcp = plink2pcache(plinkp);
1323			ASSERT(!IS_PCP_WIRED(pcp));
1324			ASSERT(pcp->p_htag0 == htag0);
1325			hp = pcp->p_hashp;
1326			mutex_enter(&hp->p_hmutex);
1327			if (pcp->p_active) {
1328				mutex_exit(&hp->p_hmutex);
1329				break;
1330			}
1331			ASSERT(plinkp->p_lprev == pheadp);
1332			pheadp->p_lnext = plinkp->p_lnext;
1333			plinkp->p_lnext->p_lprev = pheadp;
1334			pcp->p_hprev->p_hnext = pcp->p_hnext;
1335			pcp->p_hnext->p_hprev = pcp->p_hprev;
1336			pcp->p_hprev = delcallb_list;
1337			delcallb_list = pcp;
1338			if (hp->p_hnext == (struct seg_pcache *)hp) {
1339				seg_premove_abuck(hp, 0);
1340			}
1341			mutex_exit(&hp->p_hmutex);
1342		}
1343		mutex_exit(pmtx);
1344	}
1345	while (delcallb_list != NULL) {
1346		pcp = delcallb_list;
1347		delcallb_list = pcp->p_hprev;
1348		ASSERT(!pcp->p_active);
1349		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351		npages += btop(pcp->p_len);
1352		kmem_cache_free(seg_pkmcache, pcp);
1353	}
1354	mutex_enter(&seg_pmem_mtx);
1355	ASSERT(seg_plocked >= npages);
1356	seg_plocked -= npages;
1357	if (!IS_PFLAGS_WIRED(flags)) {
1358		ASSERT(seg_plocked_window >= npages);
1359		seg_plocked_window -= npages;
1360	}
1361	mutex_exit(&seg_pmem_mtx);
1362}
1363
1364static void seg_pinit_mem_config(void);
1365
1366/*
 * Set up the pagelock cache
1368 */
1369static void
1370seg_pinit(void)
1371{
1372	struct seg_phash *hp;
1373	ulong_t i;
1374	pgcnt_t physmegs;
1375
1376	seg_plocked = 0;
1377	seg_plocked_window = 0;
1378
1379	if (segpcache_enabled == 0) {
1380		seg_phashsize_win = 0;
1381		seg_phashsize_wired = 0;
1382		seg_pdisabled = 1;
1383		return;
1384	}
1385
1386	seg_pdisabled = 0;
1387	seg_pkmcache = kmem_cache_create("seg_pcache",
1388	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389	if (segpcache_pcp_maxage_ticks <= 0) {
1390		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391	}
1392	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393	seg_pathr_empty_ahb = 0;
1394	seg_pathr_full_ahb = 0;
1395	seg_pshrink_shift = segpcache_shrink_shift;
1396	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397
1398	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402
1403	physmegs = physmem >> (20 - PAGESHIFT);
1404
1405	/*
	 * If segpcache_hashsize_win was not set in /etc/system or it has an
	 * absurd value, set it to a default.
1408	 */
1409	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410		/*
1411		 * Create one bucket per 32K (or at least per 8 pages) of
1412		 * available memory.
1413		 */
1414		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416	}
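	/*
	 * Round a value that is not a power of 2 to the nearest power of 2
	 * (the midpoint rounds up) so that P_HASHWIN_MASK works as a simple
	 * bitmask.
	 */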
1417	if (!ISP2(segpcache_hashsize_win)) {
1418		ulong_t rndfac = ~(1UL <<
1419		    (highbit(segpcache_hashsize_win) - 1));
1420		rndfac &= segpcache_hashsize_win;
1421		segpcache_hashsize_win += rndfac;
1422		segpcache_hashsize_win = 1 <<
1423		    (highbit(segpcache_hashsize_win) - 1);
1424	}
1425	seg_phashsize_win = segpcache_hashsize_win;
1426	seg_phashtab_win = kmem_zalloc(
1427	    seg_phashsize_win * sizeof (struct seg_phash),
1428	    KM_SLEEP);
1429	for (i = 0; i < seg_phashsize_win; i++) {
1430		hp = &seg_phashtab_win[i];
1431		hp->p_hnext = (struct seg_pcache *)hp;
1432		hp->p_hprev = (struct seg_pcache *)hp;
1433		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434	}
1435
1436	seg_pahcur = 0;
1437	seg_pathr_on = 0;
1438	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442
1443	/*
	 * If segpcache_hashsize_wired was not set in /etc/system or it has an
	 * absurd value, set it to a default.
1446	 */
1447	if (segpcache_hashsize_wired == 0 ||
1448	    segpcache_hashsize_wired > physmem / 4) {
1449		/*
1450		 * Choose segpcache_hashsize_wired based on physmem.
		 * Create a bucket per 128K bytes, up to 256K buckets.
1452		 */
1453		if (physmegs < 20 * 1024) {
1454			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455		} else {
1456			segpcache_hashsize_wired = 256 * 1024;
1457		}
1458	}
1459	if (!ISP2(segpcache_hashsize_wired)) {
1460		segpcache_hashsize_wired = 1 <<
1461		    highbit(segpcache_hashsize_wired);
1462	}
1463	seg_phashsize_wired = segpcache_hashsize_wired;
1464	seg_phashtab_wired = kmem_zalloc(
1465	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466	for (i = 0; i < seg_phashsize_wired; i++) {
1467		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468		hp->p_hnext = (struct seg_pcache *)hp;
1469		hp->p_hprev = (struct seg_pcache *)hp;
1470		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471	}
1472
1473	if (segpcache_maxwindow == 0) {
1474		if (physmegs < 64) {
1475			/* 3% of memory */
1476			segpcache_maxwindow = availrmem >> 5;
1477		} else if (physmegs < 512) {
1478			/* 12% of memory */
1479			segpcache_maxwindow = availrmem >> 3;
1480		} else if (physmegs < 1024) {
1481			/* 25% of memory */
1482			segpcache_maxwindow = availrmem >> 2;
1483		} else if (physmegs < 2048) {
1484			/* 50% of memory */
1485			segpcache_maxwindow = availrmem >> 1;
1486		} else {
1487			/* no limit */
1488			segpcache_maxwindow = (pgcnt_t)-1;
1489		}
1490	}
1491	seg_pmaxwindow = segpcache_maxwindow;
1492	seg_pinit_mem_config();
1493}
1494
1495/*
1496 * called by pageout if memory is low
1497 */
1498void
1499seg_preap(void)
1500{
1501	/*
1502	 * if the cache is off or empty, return
1503	 */
1504	if (seg_plocked_window == 0) {
1505		return;
1506	}
1507	ASSERT(seg_phashsize_win != 0);
1508
1509	/*
1510	 * If somebody is already purging pcache
1511	 * just return.
1512	 */
1513	if (seg_pdisabled) {
1514		return;
1515	}
1516
1517	cv_signal(&seg_pasync_cv);
1518}
1519
1520/*
 * Run as a background thread and reclaim pagelock
 * pages that have not been used recently
1523 */
1524void
1525seg_pasync_thread(void)
1526{
1527	callb_cpr_t cpr_info;
1528
1529	if (seg_phashsize_win == 0) {
1530		thread_exit();
1531		/*NOTREACHED*/
1532	}
1533
1534	seg_pasync_thr = curthread;
1535
1536	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537	    callb_generic_cpr, "seg_pasync");
1538
1539	if (segpcache_reap_ticks <= 0) {
1540		segpcache_reap_ticks = segpcache_reap_sec * hz;
1541	}
1542
1543	mutex_enter(&seg_pasync_mtx);
1544	for (;;) {
1545		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547		    segpcache_reap_ticks, TR_CLOCK_TICK);
1548		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549		if (seg_pdisabled == 0) {
1550			seg_ppurge_async(0);
1551		}
1552	}
1553}
1554
1555static struct kmem_cache *seg_cache;
1556
1557/*
1558 * Initialize segment management data structures.
1559 */
1560void
1561seg_init(void)
1562{
1563	kstat_t *ksp;
1564
1565	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566	    0, NULL, NULL, NULL, NULL, NULL, 0);
1567
1568	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570	if (ksp) {
1571		ksp->ks_data = (void *)segadvstat_ptr;
1572		kstat_install(ksp);
1573	}
1574
1575	seg_pinit();
1576}
1577
1578/*
1579 * Allocate a segment to cover [base, base+size]
1580 * and attach it to the specified address space.
1581 */
1582struct seg *
1583seg_alloc(struct as *as, caddr_t base, size_t size)
1584{
1585	struct seg *new;
1586	caddr_t segbase;
1587	size_t segsize;
1588
1589	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591	    (uintptr_t)segbase;
1592
1593	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594		return ((struct seg *)NULL);	/* bad virtual addr range */
1595
1596	if (as != &kas &&
1597	    valid_usr_range(segbase, segsize, 0, as,
1598	    as->a_userlimit) != RANGE_OKAY)
1599		return ((struct seg *)NULL);	/* bad virtual addr range */
1600
1601	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602	new->s_ops = NULL;
1603	new->s_data = NULL;
1604	new->s_szc = 0;
1605	new->s_flags = 0;
1606	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607	new->s_phead.p_lnext = &new->s_phead;
1608	new->s_phead.p_lprev = &new->s_phead;
1609	if (seg_attach(as, segbase, segsize, new) < 0) {
1610		kmem_cache_free(seg_cache, new);
1611		return ((struct seg *)NULL);
1612	}
1613	/* caller must fill in ops, data */
1614	return (new);
1615}
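
/*
 * Usage sketch (illustrative; the segment driver specifics are hypothetical):
 * a caller creating a mapping does roughly
 *
 *	seg = seg_alloc(as, addr, len);
 *	if (seg == NULL)
 *		return (ENOMEM);	(bad range or overlap)
 *	(the segment create routine then fills in seg->s_ops and seg->s_data;
 *	on failure it, or the caller, must seg_free() the segment)
 *
 * since seg_alloc() only attaches a blank segment to the address space.
 */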
1616
1617/*
1618 * Attach a segment to the address space.  Used by seg_alloc()
1619 * and for kernel startup to attach to static segments.
1620 */
1621int
1622seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623{
1624	seg->s_as = as;
1625	seg->s_base = base;
1626	seg->s_size = size;
1627
1628	/*
	 * as_addseg() will add the segment at the appropriate point
1630	 * in the list. It will return -1 if there is overlap with
1631	 * an already existing segment.
1632	 */
1633	return (as_addseg(as, seg));
1634}
1635
1636/*
1637 * Unmap a segment and free it from its associated address space.
1638 * This should be called by anybody who's finished with a whole segment's
 * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
 * responsibility of the segment driver to unlink the segment
1641 * from the address space, and to free public and private data structures
1642 * associated with the segment.  (This is typically done by a call to
1643 * seg_free()).
1644 */
1645void
1646seg_unmap(struct seg *seg)
1647{
1648#ifdef DEBUG
1649	int ret;
1650#endif /* DEBUG */
1651
1652	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1653
1654	/* Shouldn't have called seg_unmap if mapping isn't yet established */
1655	ASSERT(seg->s_data != NULL);
1656
1657	/* Unmap the whole mapping */
1658#ifdef DEBUG
1659	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1660	ASSERT(ret == 0);
1661#else
1662	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1663#endif /* DEBUG */
1664}
1665
1666/*
1667 * Free the segment from its associated as. This should only be called
1668 * if a mapping to the segment has not yet been established (e.g., if
1669 * an error occurs in the middle of doing an as_map when the segment
1670 * has already been partially set up) or if it has already been deleted
1671 * (e.g., from a segment driver unmap routine if the unmap applies to the
1672 * entire segment). If the mapping is currently set up then seg_unmap() should
1673 * be called instead.
1674 */
1675void
1676seg_free(struct seg *seg)
1677{
1678	register struct as *as = seg->s_as;
1679	struct seg *tseg = as_removeseg(as, seg);
1680
1681	ASSERT(tseg == seg);
1682
1683	/*
1684	 * If the segment private data field is NULL,
1685	 * then segment driver is not attached yet.
1686	 */
1687	if (seg->s_data != NULL)
1688		SEGOP_FREE(seg);
1689
1690	mutex_destroy(&seg->s_pmtx);
1691	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693	kmem_cache_free(seg_cache, seg);
1694}
1695
1696/*ARGSUSED*/
1697static void
1698seg_p_mem_config_post_add(
1699	void *arg,
1700	pgcnt_t delta_pages)
1701{
1702	/* Nothing to do. */
1703}
1704
1705void
1706seg_p_enable(void)
1707{
1708	mutex_enter(&seg_pcache_mtx);
1709	ASSERT(seg_pdisabled != 0);
1710	seg_pdisabled--;
1711	mutex_exit(&seg_pcache_mtx);
1712}
1713
1714/*
1715 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716 * cache.
1717 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718 * SEGP_FAIL if the cache could not be emptied.
1719 */
1720int
1721seg_p_disable(void)
1722{
1723	pgcnt_t	old_plocked;
1724	int stall_count = 0;
1725
1726	mutex_enter(&seg_pcache_mtx);
1727	seg_pdisabled++;
1728	ASSERT(seg_pdisabled != 0);
1729	mutex_exit(&seg_pcache_mtx);
1730
1731	/*
1732	 * Attempt to empty the cache. Terminate if seg_plocked does not
1733	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734	 */
1735	while (seg_plocked != 0) {
1736		ASSERT(seg_phashsize_win != 0);
1737		old_plocked = seg_plocked;
1738		seg_ppurge_async(1);
1739		if (seg_plocked == old_plocked) {
1740			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741				return (SEGP_FAIL);
1742			}
1743		} else
1744			stall_count = 0;
1745		if (seg_plocked != 0)
1746			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747	}
1748	return (SEGP_SUCCESS);
1749}
1750
1751/*
1752 * Attempt to purge seg_pcache.  May need to return before this has
1753 * completed to allow other pre_del callbacks to unlock pages. This is
1754 * ok because:
1755 *	1) The seg_pdisabled flag has been set so at least we won't
 *	cache any more locks and the locks we couldn't purge
1757 *	will not be held if they do get released by a subsequent
1758 *	pre-delete callback.
1759 *
1760 *	2) The rest of the memory delete thread processing does not
1761 *	depend on the changes made in this pre-delete callback. No
1762 *	panics will result, the worst that will happen is that the
1763 *	DR code will timeout and cancel the delete.
1764 */
1765/*ARGSUSED*/
1766static int
1767seg_p_mem_config_pre_del(
1768	void *arg,
1769	pgcnt_t delta_pages)
1770{
1771	if (seg_phashsize_win == 0) {
1772		return (0);
1773	}
1774	if (seg_p_disable() != SEGP_SUCCESS)
		cmn_err(CE_NOTE,
		    "!Pre-delete couldn't purge pagelock cache - continuing");
1777	return (0);
1778}
1779
1780/*ARGSUSED*/
1781static void
1782seg_p_mem_config_post_del(
1783	void *arg,
1784	pgcnt_t delta_pages,
1785	int cancelled)
1786{
1787	if (seg_phashsize_win == 0) {
1788		return;
1789	}
1790	seg_p_enable();
1791}
1792
1793static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794	KPHYSM_SETUP_VECTOR_VERSION,
1795	seg_p_mem_config_post_add,
1796	seg_p_mem_config_pre_del,
1797	seg_p_mem_config_post_del,
1798};
1799
1800static void
1801seg_pinit_mem_config(void)
1802{
1803	int ret;
1804
1805	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806	/*
1807	 * Want to catch this in the debug kernel. At run time, if the
1808	 * callbacks don't get run all will be OK as the disable just makes
1809	 * it more likely that the pages can be collected.
1810	 */
1811	ASSERT(ret == 0);
1812}
1813
1814/*
1815 * Verify that segment is not a shared anonymous segment which reserves
 * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1817 * from one zone to another if any segments are shared.  This is because the
1818 * last process to exit will credit the swap reservation.  This could lead
1819 * to the swap being reserved by one zone, and credited to another.
1820 */
1821boolean_t
1822seg_can_change_zones(struct seg *seg)
1823{
1824	struct segvn_data *svd;
1825
1826	if (seg->s_ops == &segspt_shmops)
1827		return (B_FALSE);
1828
1829	if (seg->s_ops == &segvn_ops) {
1830		svd = (struct segvn_data *)seg->s_data;
1831		if (svd->type == MAP_SHARED &&
1832		    svd->amp != NULL &&
1833		    svd->amp->swresv > 0)
1834			return (B_FALSE);
1835	}
1836	return (B_TRUE);
1837}
1838
1839/*
1840 * Return swap reserved by a segment backing a private mapping.
1841 */
1842size_t
1843seg_swresv(struct seg *seg)
1844{
1845	struct segvn_data *svd;
1846	size_t swap = 0;
1847
1848	if (seg->s_ops == &segvn_ops) {
1849		svd = (struct segvn_data *)seg->s_data;
1850		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851			swap = svd->swresv;
1852	}
1853	return (swap);
1854}
1855
1856/*
 * Generic not-supported function for SEGOP_INHERIT
1858 */
1859/* ARGSUSED */
1860int
1861seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862{
1863	return (ENOTSUP);
1864}
1865