xref: /illumos-gate/usr/src/uts/common/vm/vm_seg.c (revision 15c07adc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2018, Joyent, Inc.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 /*
41  * VM - segment management.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/inttypes.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kmem.h>
50 #include <sys/sysmacros.h>
51 #include <sys/vmsystm.h>
52 #include <sys/tuneable.h>
53 #include <sys/debug.h>
54 #include <sys/fs/swapnode.h>
55 #include <sys/cmn_err.h>
56 #include <sys/callb.h>
57 #include <sys/mem_config.h>
58 #include <sys/mman.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kmem.h>
64 #include <vm/seg_spt.h>
65 #include <vm/seg_vn.h>
66 #include <vm/anon.h>
67 
68 /*
69  * kstats for segment advise
70  */
71 segadvstat_t segadvstat = {
72 	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
73 	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
74 };
75 
76 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78 
79 /*
80  * entry in the segment page cache
81  */
82 struct seg_pcache {
83 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
84 	struct seg_pcache	*p_hprev;
85 	pcache_link_t		p_plink;	/* per segment/amp list */
86 	void 			*p_htag0;	/* segment/amp pointer */
87 	caddr_t			p_addr;		/* base address/anon_idx */
88 	size_t			p_len;		/* total bytes */
89 	size_t			p_wlen;		/* writtable bytes at p_addr */
90 	struct page		**p_pp;		/* pp shadow list */
91 	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
92 	clock_t			p_lbolt;	/* lbolt from last use */
93 	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
94 	uint_t			p_active;	/* active count */
95 	uchar_t			p_write;	/* true if S_WRITE */
96 	uchar_t			p_ref;		/* reference byte */
97 	ushort_t		p_flags;	/* bit flags */
98 };
99 
100 struct seg_phash {
101 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
102 	struct seg_pcache	*p_hprev;
103 	kmutex_t		p_hmutex;	/* protects hash bucket */
104 	pcache_link_t		p_halink[2];	/* active bucket linkages */
105 };
106 
107 struct seg_phash_wired {
108 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
109 	struct seg_pcache	*p_hprev;
110 	kmutex_t		p_hmutex;	/* protects hash bucket */
111 };
112 
/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time (1 GB).
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time), i.e. a shift of 5
 * corresponds to 1/32.
 */
#define	P_SHRINK_SHFT		(5)
123 
124 /*
125  * The following variables can be tuned via /etc/system.
126  */
127 
128 int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
129 pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
130 ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
131 ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
132 int	segpcache_reap_sec = 1;		/* reap check rate in secs */
133 clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
134 int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
135 clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
136 int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
137 pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
138 
139 static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
140 static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
141 static kcondvar_t seg_pasync_cv;
142 
143 #pragma align 64(pctrl1)
144 #pragma align 64(pctrl2)
145 #pragma align 64(pctrl3)
146 
147 /*
148  * Keep frequently used variables together in one cache line.
149  */
150 static struct p_ctrl1 {
151 	uint_t p_disabled;		/* if not 0, caching temporarily off */
152 	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
153 	size_t p_hashwin_sz;		/* # of non wired buckets */
154 	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
155 	size_t p_hashwired_sz;		/* # of wired buckets */
156 	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157 	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
158 #ifdef _LP64
159 	ulong_t pad[1];
160 #endif /* _LP64 */
161 } pctrl1;
162 
163 static struct p_ctrl2 {
164 	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
165 	pgcnt_t  p_locked_win;	/* # pages from window */
166 	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
167 	uchar_t	 p_ahcur;	/* current active links for insert/delete */
168 	uchar_t  p_athr_on;	/* async reclaim thread is running. */
169 	pcache_link_t p_ahhead[2]; /* active buckets linkages */
170 } pctrl2;
171 
172 static struct p_ctrl3 {
173 	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
174 	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
175 	ulong_t p_athr_full_ahb;	/* athread walk stats */
176 	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
177 	int	p_shrink_shft;		/* reap shift factor */
178 #ifdef _LP64
179 	ulong_t pad[3];
180 #endif /* _LP64 */
181 } pctrl3;
182 
/*
 * Short names for the members of the pctrl1/pctrl2/pctrl3 control
 * structures.  Each macro expands to an lvalue, so it can be both read
 * and assigned.
 */
#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

/*
 * Index masks for the two hash tables.  Using "size - 1" as a mask is
 * only a valid modulo if the table sizes are powers of two.
 * P_BASESHIFT is the right shift applied to the tag pointer when hashing
 * into the wired table.
 */
#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)
205 
206 kthread_t *seg_pasync_thr;
207 
208 extern struct seg_ops segvn_ops;
209 extern struct seg_ops segspt_shmops;
210 
/*
 * Wired-ness tests: on a flags word and on a pcp entry respectively.
 */
#define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)

/*
 * Ticks elapsed since lbolt value t, as an unsigned quantity.
 */
#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))

/*
 * Ticks elapsed since pcp was last stamped (p_lbolt).
 */
#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * Map a request to its hash bucket.  htag0 argument can be a seg or amp
 * pointer.
 *
 * Wired requests hash on the tag only and use the wired table (the
 * bucket is returned cast to struct seg_phash *).  Non wired requests
 * hash on tag and address; the address is shifted right by flags >> 16
 * when SEGP_PSHIFT is set in flags, otherwise by the page shift of the
 * segment's page size code.
 *
 * Every use of the flags argument is parenthesized so that a compound
 * expression (e.g. "a | b") may be passed safely.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> (((flags) & SEGP_PSHIFT) ?		\
	    ((flags) >> 16) : page_get_shift((seg)->s_szc))))]))

/*
 * True if pcp has the given tag and start address and covers at least
 * len bytes.  htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

/*
 * Same as P_MATCH() but additionally requires the entry to carry the
 * given shadow list pointer.
 */
#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

/*
 * Recover the enclosing pcp entry from a pointer to its p_plink member.
 */
#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

/*
 * Recover the enclosing hash bucket from a pointer to its p_halink[l]
 * member.
 */
#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
    offsetof(struct seg_phash, p_halink[l])))
249 
250 /*
251  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252  * active hash bucket lists. We maintain active bucket lists to reduce the
253  * overhead of finding active buckets during asynchronous purging since there
254  * can be 10s of millions of buckets on a large system but only a small subset
255  * of them in actual use.
256  *
257  * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259  * buckets. The other list is used by asynchronous purge thread. This allows
260  * the purge thread to walk its active list without holding seg_pmem_mtx for a
261  * long time. When asynchronous thread is done with its list it switches to
262  * current active list and makes the list it just finished processing as
263  * current active list.
264  *
265  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266  * yet on any list.  seg_premove_abuck() may remove the bucket from either
267  * list. If the bucket is on current list it will be always removed. Otherwise
268  * the bucket is only removed if asynchronous purge thread is not currently
269  * running or seg_premove_abuck() is called by asynchronous purge thread
270  * itself. A given bucket can only be on one of active lists at a time. These
271  * routines should be called with per bucket lock held.  The routines use
272  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273  * the first entry is added to the bucket chain and seg_premove_abuck() must
274  * be called after the last pcp entry is deleted from its chain. Per bucket
275  * lock should be held by the callers.  This avoids a potential race condition
276  * when seg_premove_abuck() removes a bucket after pcp entries are added to
277  * its list after the caller checked that the bucket has no entries. (this
278  * race would cause a loss of an active bucket from the active lists).
279  *
280  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281  * New entries are added to the end of the list since LRU is used as the
282  * purging policy.
283  */
284 static void
seg_padd_abuck(struct seg_phash * hp)285 seg_padd_abuck(struct seg_phash *hp)
286 {
287 	int lix;
288 
289 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
290 	ASSERT((struct seg_phash *)hp->p_hnext != hp);
291 	ASSERT((struct seg_phash *)hp->p_hprev != hp);
292 	ASSERT(hp->p_hnext == hp->p_hprev);
293 	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294 	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295 	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296 	ASSERT(hp >= seg_phashtab_win &&
297 	    hp < &seg_phashtab_win[seg_phashsize_win]);
298 
299 	/*
300 	 * This bucket can already be on one of active lists
301 	 * since seg_premove_abuck() may have failed to remove it
302 	 * before.
303 	 */
304 	mutex_enter(&seg_pmem_mtx);
305 	lix = seg_pahcur;
306 	ASSERT(lix >= 0 && lix <= 1);
307 	if (hp->p_halink[lix].p_lnext != NULL) {
308 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
309 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311 		mutex_exit(&seg_pmem_mtx);
312 		return;
313 	}
314 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
315 
316 	/*
317 	 * If this bucket is still on list !lix async thread can't yet remove
318 	 * it since we hold here per bucket lock. In this case just return
319 	 * since async thread will eventually find and process this bucket.
320 	 */
321 	if (hp->p_halink[!lix].p_lnext != NULL) {
322 		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323 		mutex_exit(&seg_pmem_mtx);
324 		return;
325 	}
326 	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327 	/*
328 	 * This bucket is not on any active bucket list yet.
329 	 * Add the bucket to the tail of current active list.
330 	 */
331 	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332 	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333 	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334 	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335 	mutex_exit(&seg_pmem_mtx);
336 }
337 
338 static void
seg_premove_abuck(struct seg_phash * hp,int athr)339 seg_premove_abuck(struct seg_phash *hp, int athr)
340 {
341 	int lix;
342 
343 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
344 	ASSERT((struct seg_phash *)hp->p_hnext == hp);
345 	ASSERT((struct seg_phash *)hp->p_hprev == hp);
346 	ASSERT(hp >= seg_phashtab_win &&
347 	    hp < &seg_phashtab_win[seg_phashsize_win]);
348 
349 	if (athr) {
350 		ASSERT(seg_pathr_on);
351 		ASSERT(seg_pahcur <= 1);
352 		/*
353 		 * We are called by asynchronous thread that found this bucket
354 		 * on not currently active (i.e. !seg_pahcur) list. Remove it
355 		 * from there.  Per bucket lock we are holding makes sure
356 		 * seg_pinsert() can't sneak in and add pcp entries to this
357 		 * bucket right before we remove the bucket from its list.
358 		 */
359 		lix = !seg_pahcur;
360 		ASSERT(hp->p_halink[lix].p_lnext != NULL);
361 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
362 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366 		hp->p_halink[lix].p_lnext = NULL;
367 		hp->p_halink[lix].p_lprev = NULL;
368 		return;
369 	}
370 
371 	mutex_enter(&seg_pmem_mtx);
372 	lix = seg_pahcur;
373 	ASSERT(lix >= 0 && lix <= 1);
374 
375 	/*
376 	 * If the bucket is on currently active list just remove it from
377 	 * there.
378 	 */
379 	if (hp->p_halink[lix].p_lnext != NULL) {
380 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
381 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385 		hp->p_halink[lix].p_lnext = NULL;
386 		hp->p_halink[lix].p_lprev = NULL;
387 		mutex_exit(&seg_pmem_mtx);
388 		return;
389 	}
390 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
391 
392 	/*
393 	 * If asynchronous thread is not running we can remove the bucket from
394 	 * not currently active list. The bucket must be on this list since we
395 	 * already checked that it's not on the other list and the bucket from
396 	 * which we just deleted the last pcp entry must be still on one of the
397 	 * active bucket lists.
398 	 */
399 	lix = !lix;
400 	ASSERT(hp->p_halink[lix].p_lnext != NULL);
401 	ASSERT(hp->p_halink[lix].p_lprev != NULL);
402 
403 	if (!seg_pathr_on) {
404 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406 		hp->p_halink[lix].p_lnext = NULL;
407 		hp->p_halink[lix].p_lprev = NULL;
408 	}
409 	mutex_exit(&seg_pmem_mtx);
410 }
411 
412 /*
413  * Check if bucket pointed by hp already has a pcp entry that matches request
414  * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415  * Also delete matching entries that cover smaller address range but start
416  * at the same address as addr argument. Return the list of deleted entries if
417  * any. This is an internal helper function called from seg_pinsert() only
418  * for non wired shadow lists. The caller already holds a per seg/amp list
419  * lock.
420  */
421 static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash * hp,void * htag0,caddr_t addr,size_t len,int * found)422 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423     caddr_t addr, size_t len, int *found)
424 {
425 	struct seg_pcache *pcp;
426 	struct seg_pcache *delcallb_list = NULL;
427 
428 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
429 
430 	*found = 0;
431 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432 	    pcp = pcp->p_hnext) {
433 		ASSERT(pcp->p_hashp == hp);
434 		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435 			ASSERT(!IS_PCP_WIRED(pcp));
436 			if (pcp->p_len < len) {
437 				pcache_link_t *plinkp;
438 				if (pcp->p_active) {
439 					continue;
440 				}
441 				plinkp = &pcp->p_plink;
442 				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443 				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444 				pcp->p_hprev->p_hnext = pcp->p_hnext;
445 				pcp->p_hnext->p_hprev = pcp->p_hprev;
446 				pcp->p_hprev = delcallb_list;
447 				delcallb_list = pcp;
448 			} else {
449 				*found = 1;
450 				break;
451 			}
452 		}
453 	}
454 	return (delcallb_list);
455 }
456 
457 /*
458  * lookup an address range in pagelock cache. Return shadow list and bump up
459  * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460  * as a lookup tag.
461  */
462 struct page **
seg_plookup(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,enum seg_rw rw,uint_t flags)463 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464     enum seg_rw rw, uint_t flags)
465 {
466 	struct seg_pcache *pcp;
467 	struct seg_phash *hp;
468 	void *htag0;
469 
470 	ASSERT(seg != NULL);
471 	ASSERT(rw == S_READ || rw == S_WRITE);
472 
473 	/*
474 	 * Skip pagelock cache, while DR is in progress or
475 	 * seg_pcache is off.
476 	 */
477 	if (seg_pdisabled) {
478 		return (NULL);
479 	}
480 	ASSERT(seg_phashsize_win != 0);
481 
482 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483 	hp = P_HASHBP(seg, htag0, addr, flags);
484 	mutex_enter(&hp->p_hmutex);
485 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486 	    pcp = pcp->p_hnext) {
487 		ASSERT(pcp->p_hashp == hp);
488 		if (P_MATCH(pcp, htag0, addr, len)) {
489 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490 			/*
491 			 * If this request wants to write pages
492 			 * but write permissions starting from
493 			 * addr don't cover the entire length len
494 			 * return lookup failure back to the caller.
495 			 * It will check protections and fail this
496 			 * pagelock operation with EACCESS error.
497 			 */
498 			if (rw == S_WRITE && pcp->p_wlen < len) {
499 				break;
500 			}
501 			if (pcp->p_active == UINT_MAX) {
502 				break;
503 			}
504 			pcp->p_active++;
505 			if (rw == S_WRITE && !pcp->p_write) {
506 				pcp->p_write = 1;
507 			}
508 			mutex_exit(&hp->p_hmutex);
509 			return (pcp->p_pp);
510 		}
511 	}
512 	mutex_exit(&hp->p_hmutex);
513 	return (NULL);
514 }
515 
516 /*
517  * mark address range inactive. If the cache is off or the address range is
518  * not in the cache or another shadow list that covers bigger range is found
519  * we call the segment driver to reclaim the pages. Otherwise just decrement
520  * active count and set ref bit.  If amp is not NULL use amp as a lookup tag
521  * otherwise use seg as a lookup tag.
522  */
523 void
seg_pinactive(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,struct page ** pp,enum seg_rw rw,uint_t flags,seg_preclaim_cbfunc_t callback)524 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526     seg_preclaim_cbfunc_t callback)
527 {
528 	struct seg_pcache *pcp;
529 	struct seg_phash *hp;
530 	kmutex_t *pmtx = NULL;
531 	pcache_link_t *pheadp;
532 	void *htag0;
533 	pgcnt_t npages = 0;
534 	int keep = 0;
535 
536 	ASSERT(seg != NULL);
537 	ASSERT(rw == S_READ || rw == S_WRITE);
538 
539 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540 
541 	/*
542 	 * Skip lookup if pcache is not configured.
543 	 */
544 	if (seg_phashsize_win == 0) {
545 		goto out;
546 	}
547 
548 	/*
549 	 * Grab per seg/amp lock before hash lock if we are going to remove
550 	 * inactive entry from pcache.
551 	 */
552 	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553 		if (amp == NULL) {
554 			pheadp = &seg->s_phead;
555 			pmtx = &seg->s_pmtx;
556 		} else {
557 			pheadp = &amp->a_phead;
558 			pmtx = &amp->a_pmtx;
559 		}
560 		mutex_enter(pmtx);
561 	}
562 
563 	hp = P_HASHBP(seg, htag0, addr, flags);
564 	mutex_enter(&hp->p_hmutex);
565 again:
566 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567 	    pcp = pcp->p_hnext) {
568 		ASSERT(pcp->p_hashp == hp);
569 		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571 			ASSERT(pcp->p_active);
572 			if (keep) {
573 				/*
574 				 * Don't remove this pcp entry
575 				 * if we didn't find duplicate
576 				 * shadow lists on second search.
577 				 * Somebody removed those duplicates
578 				 * since we dropped hash lock after first
579 				 * search.
580 				 */
581 				ASSERT(pmtx != NULL);
582 				ASSERT(!IS_PFLAGS_WIRED(flags));
583 				mutex_exit(pmtx);
584 				pmtx = NULL;
585 			}
586 			pcp->p_active--;
587 			if (pcp->p_active == 0 && (pmtx != NULL ||
588 			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589 
590 				/*
591 				 * This entry is no longer active.  Remove it
592 				 * now either because pcaching is temporarily
593 				 * disabled or there're other pcp entries that
594 				 * can match this pagelock request (i.e. this
595 				 * entry is a duplicate).
596 				 */
597 
598 				ASSERT(callback == pcp->p_callback);
599 				if (pmtx != NULL) {
600 					pcache_link_t *plinkp = &pcp->p_plink;
601 					ASSERT(!IS_PCP_WIRED(pcp));
602 					ASSERT(pheadp->p_lnext != pheadp);
603 					ASSERT(pheadp->p_lprev != pheadp);
604 					plinkp->p_lprev->p_lnext =
605 					    plinkp->p_lnext;
606 					plinkp->p_lnext->p_lprev =
607 					    plinkp->p_lprev;
608 				}
609 				pcp->p_hprev->p_hnext = pcp->p_hnext;
610 				pcp->p_hnext->p_hprev = pcp->p_hprev;
611 				if (!IS_PCP_WIRED(pcp) &&
612 				    hp->p_hnext == (struct seg_pcache *)hp) {
613 					/*
614 					 * We removed the last entry from this
615 					 * bucket.  Now remove the bucket from
616 					 * its active list.
617 					 */
618 					seg_premove_abuck(hp, 0);
619 				}
620 				mutex_exit(&hp->p_hmutex);
621 				if (pmtx != NULL) {
622 					mutex_exit(pmtx);
623 				}
624 				len = pcp->p_len;
625 				npages = btop(len);
626 				if (rw != S_WRITE && pcp->p_write) {
627 					rw = S_WRITE;
628 				}
629 				kmem_cache_free(seg_pkmcache, pcp);
630 				goto out;
631 			} else {
632 				/*
633 				 * We found a matching pcp entry but will not
634 				 * free it right away even if it's no longer
635 				 * active.
636 				 */
637 				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638 					/*
639 					 * Set the reference bit and mark the
640 					 * time of last access to this pcp
641 					 * so that asynchronous thread doesn't
642 					 * free it immediately since
643 					 * it may be reactivated very soon.
644 					 */
645 					pcp->p_lbolt = ddi_get_lbolt();
646 					pcp->p_ref = 1;
647 				}
648 				mutex_exit(&hp->p_hmutex);
649 				if (pmtx != NULL) {
650 					mutex_exit(pmtx);
651 				}
652 				return;
653 			}
654 		} else if (!IS_PFLAGS_WIRED(flags) &&
655 		    P_MATCH(pcp, htag0, addr, len)) {
656 			/*
657 			 * This is a duplicate pcp entry.  This situation may
658 			 * happen if a bigger shadow list that covers our
659 			 * range was added while our entry was still active.
660 			 * Now we can free our pcp entry if it becomes
661 			 * inactive.
662 			 */
663 			if (!pcp->p_active) {
664 				/*
665 				 * Mark this entry as referenced just in case
666 				 * we'll free our own pcp entry soon.
667 				 */
668 				pcp->p_lbolt = ddi_get_lbolt();
669 				pcp->p_ref = 1;
670 			}
671 			if (pmtx != NULL) {
672 				/*
673 				 * we are already holding pmtx and found a
674 				 * duplicate.  Don't keep our own pcp entry.
675 				 */
676 				keep = 0;
677 				continue;
678 			}
679 			/*
680 			 * We have to use mutex_tryenter to attempt to lock
681 			 * seg/amp list lock since we already hold hash lock
682 			 * and seg/amp list lock is above hash lock in lock
683 			 * order.  If mutex_tryenter fails drop hash lock and
684 			 * retake both locks in correct order and research
685 			 * this hash chain.
686 			 */
687 			ASSERT(keep == 0);
688 			if (amp == NULL) {
689 				pheadp = &seg->s_phead;
690 				pmtx = &seg->s_pmtx;
691 			} else {
692 				pheadp = &amp->a_phead;
693 				pmtx = &amp->a_pmtx;
694 			}
695 			if (!mutex_tryenter(pmtx)) {
696 				mutex_exit(&hp->p_hmutex);
697 				mutex_enter(pmtx);
698 				mutex_enter(&hp->p_hmutex);
699 				/*
700 				 * If we don't find bigger shadow list on
701 				 * second search (it may happen since we
702 				 * dropped bucket lock) keep the entry that
703 				 * matches our own shadow list.
704 				 */
705 				keep = 1;
706 				goto again;
707 			}
708 		}
709 	}
710 	mutex_exit(&hp->p_hmutex);
711 	if (pmtx != NULL) {
712 		mutex_exit(pmtx);
713 	}
714 out:
715 	(*callback)(htag0, addr, len, pp, rw, 0);
716 	if (npages) {
717 		mutex_enter(&seg_pmem_mtx);
718 		ASSERT(seg_plocked >= npages);
719 		seg_plocked -= npages;
720 		if (!IS_PFLAGS_WIRED(flags)) {
721 			ASSERT(seg_plocked_window >= npages);
722 			seg_plocked_window -= npages;
723 		}
724 		mutex_exit(&seg_pmem_mtx);
725 	}
726 
727 }
728 
729 #ifdef DEBUG
730 static uint32_t p_insert_chk_mtbf = 0;
731 #endif
732 
733 /*
734  * The seg_pinsert_check() is used by segment drivers to predict whether
735  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
736  */
737 /*ARGSUSED*/
738 int
seg_pinsert_check(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,uint_t flags)739 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740     size_t len, uint_t flags)
741 {
742 	ASSERT(seg != NULL);
743 
744 #ifdef DEBUG
745 	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746 		return (SEGP_FAIL);
747 	}
748 #endif
749 
750 	if (seg_pdisabled) {
751 		return (SEGP_FAIL);
752 	}
753 	ASSERT(seg_phashsize_win != 0);
754 
755 	if (IS_PFLAGS_WIRED(flags)) {
756 		return (SEGP_SUCCESS);
757 	}
758 
759 	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760 		return (SEGP_FAIL);
761 	}
762 
763 	if (freemem < desfree) {
764 		return (SEGP_FAIL);
765 	}
766 
767 	return (SEGP_SUCCESS);
768 }
769 
770 #ifdef DEBUG
771 static uint32_t p_insert_mtbf = 0;
772 #endif
773 
774 /*
775  * Insert address range with shadow list into pagelock cache if there's no
776  * shadow list already cached for this address range. If the cache is off or
777  * caching is temporarily disabled or the allowed 'window' is exceeded return
778  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779  *
780  * For non wired shadow lists (segvn case) include address in the hashing
781  * function to avoid linking all the entries from the same segment or amp on
782  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
783  * pcache entries are also linked on a per segment/amp list so that all
784  * entries can be found quickly during seg/amp purge without walking the
785  * entire pcache hash table.  For wired shadow lists (segspt case) we
786  * don't use address hashing and per segment linking because the caller
787  * currently inserts only one entry per segment that covers the entire
788  * segment. If we used per segment linking even for segspt it would complicate
789  * seg_ppurge_wiredpp() locking.
790  *
791  * Both hash bucket and per seg/amp locks need to be held before adding a non
792  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793  * first.
794  *
795  * This function will also remove from pcache old inactive shadow lists that
796  * overlap with this request but cover smaller range for the same start
797  * address.
798  */
799 int
seg_pinsert(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,size_t wlen,struct page ** pp,enum seg_rw rw,uint_t flags,seg_preclaim_cbfunc_t callback)800 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802     seg_preclaim_cbfunc_t callback)
803 {
804 	struct seg_pcache *pcp;
805 	struct seg_phash *hp;
806 	pgcnt_t npages;
807 	pcache_link_t *pheadp;
808 	kmutex_t *pmtx;
809 	struct seg_pcache *delcallb_list = NULL;
810 
811 	ASSERT(seg != NULL);
812 	ASSERT(rw == S_READ || rw == S_WRITE);
813 	ASSERT(rw == S_READ || wlen == len);
814 	ASSERT(rw == S_WRITE || wlen <= len);
815 	ASSERT(amp == NULL || wlen == len);
816 
817 #ifdef DEBUG
818 	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819 		return (SEGP_FAIL);
820 	}
821 #endif
822 
823 	if (seg_pdisabled) {
824 		return (SEGP_FAIL);
825 	}
826 	ASSERT(seg_phashsize_win != 0);
827 
828 	ASSERT((len & PAGEOFFSET) == 0);
829 	npages = btop(len);
830 	mutex_enter(&seg_pmem_mtx);
831 	if (!IS_PFLAGS_WIRED(flags)) {
832 		if (seg_plocked_window + npages > seg_pmaxwindow) {
833 			mutex_exit(&seg_pmem_mtx);
834 			return (SEGP_FAIL);
835 		}
836 		seg_plocked_window += npages;
837 	}
838 	seg_plocked += npages;
839 	mutex_exit(&seg_pmem_mtx);
840 
841 	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842 	/*
843 	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844 	 */
845 	if (amp == NULL) {
846 		pcp->p_htag0 = (void *)seg;
847 		pcp->p_flags = flags & 0xffff;
848 	} else {
849 		pcp->p_htag0 = (void *)amp;
850 		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851 	}
852 	pcp->p_addr = addr;
853 	pcp->p_len = len;
854 	pcp->p_wlen = wlen;
855 	pcp->p_pp = pp;
856 	pcp->p_write = (rw == S_WRITE);
857 	pcp->p_callback = callback;
858 	pcp->p_active = 1;
859 
860 	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861 	if (!IS_PFLAGS_WIRED(flags)) {
862 		int found;
863 		void *htag0;
864 		if (amp == NULL) {
865 			pheadp = &seg->s_phead;
866 			pmtx = &seg->s_pmtx;
867 			htag0 = (void *)seg;
868 		} else {
869 			pheadp = &amp->a_phead;
870 			pmtx = &amp->a_pmtx;
871 			htag0 = (void *)amp;
872 		}
873 		mutex_enter(pmtx);
874 		mutex_enter(&hp->p_hmutex);
875 		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876 		    len, &found);
877 		if (found) {
878 			mutex_exit(&hp->p_hmutex);
879 			mutex_exit(pmtx);
880 			mutex_enter(&seg_pmem_mtx);
881 			seg_plocked -= npages;
882 			seg_plocked_window -= npages;
883 			mutex_exit(&seg_pmem_mtx);
884 			kmem_cache_free(seg_pkmcache, pcp);
885 			goto out;
886 		}
887 		pcp->p_plink.p_lnext = pheadp->p_lnext;
888 		pcp->p_plink.p_lprev = pheadp;
889 		pheadp->p_lnext->p_lprev = &pcp->p_plink;
890 		pheadp->p_lnext = &pcp->p_plink;
891 	} else {
892 		mutex_enter(&hp->p_hmutex);
893 	}
894 	pcp->p_hashp = hp;
895 	pcp->p_hnext = hp->p_hnext;
896 	pcp->p_hprev = (struct seg_pcache *)hp;
897 	hp->p_hnext->p_hprev = pcp;
898 	hp->p_hnext = pcp;
899 	if (!IS_PFLAGS_WIRED(flags) &&
900 	    hp->p_hprev == pcp) {
901 		seg_padd_abuck(hp);
902 	}
903 	mutex_exit(&hp->p_hmutex);
904 	if (!IS_PFLAGS_WIRED(flags)) {
905 		mutex_exit(pmtx);
906 	}
907 
908 out:
909 	npages = 0;
910 	while (delcallb_list != NULL) {
911 		pcp = delcallb_list;
912 		delcallb_list = pcp->p_hprev;
913 		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916 		npages += btop(pcp->p_len);
917 		kmem_cache_free(seg_pkmcache, pcp);
918 	}
919 	if (npages) {
920 		ASSERT(!IS_PFLAGS_WIRED(flags));
921 		mutex_enter(&seg_pmem_mtx);
922 		ASSERT(seg_plocked >= npages);
923 		ASSERT(seg_plocked_window >= npages);
924 		seg_plocked -= npages;
925 		seg_plocked_window -= npages;
926 		mutex_exit(&seg_pmem_mtx);
927 	}
928 
929 	return (SEGP_SUCCESS);
930 }
931 
/*
 * Purge entries from the pagelock cache that are not active.
 *
 * force == 0: only non-wired entries are considered, and only when the
 * checks below decide that memory is low or that seg_plocked_window has
 * reached 7/8 of seg_pmaxwindow.  Entries with p_ref set whose PCP_AGE()
 * is still below seg_pmax_pcpage are not purged; their p_ref is cleared
 * instead.
 *
 * force != 0: every inactive entry in the wired hash table is unlinked
 * first, and the walk of the non-wired buckets ignores p_ref and age.
 *
 * Unlinked entries are chained through p_hprev on a local list and their
 * callbacks are run at the end, after all hash bucket mutexes are dropped.
 */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t	npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}

	if (!force) {
		lowmem = 0;
		trim = 0;
		/*
		 * Decide whether free memory is short enough to purge.  The
		 * less free memory remains, the smaller the window needs to
		 * be before lowmem is declared: at or below 5/4 of desfree
		 * it is unconditional, at or below 7/8 of lotsfree the window
		 * must hold at least half of availrmem_initial, and below
		 * lotsfree it must hold at least three quarters of it.
		 */
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		/* Trim once the window reaches 7/8 of its maximum size. */
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		/*
		 * Aim for a 1/2^seg_pshrink_shift share of the window, capped
		 * at seg_pmaxapurge_npages (or at desfree, if that is larger,
		 * when memory is low).
		 */
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		/*
		 * Forced purge: unlink every inactive entry from the wired
		 * hash table.
		 */
		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			/* Unlocked peek so empty buckets cost no mutex. */
			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			/*
			 * Unlinking overwrites only p_hprev, so p_hnext of a
			 * removed entry still leads to the rest of the chain.
			 */
			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	/*
	 * seg_pathr_on, protected by seg_pmem_mtx, marks a walk of the active
	 * bucket lists in progress.  If another thread is already walking,
	 * skip the walk and only run the callbacks collected so far.
	 */
	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	/* Start with the active bucket list that is not the current one. */
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		/*
		 * Fetch the successor up front; presumably needed because
		 * seg_premove_abuck() below can take this bucket off the
		 * list -- confirm against its definition.
		 */
		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			/*
			 * Recently referenced and still young: clear the
			 * reference bit and leave the entry for a later pass.
			 */
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			/* SEGP_AMP says whether htag0 is an amp or a seg. */
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			/*
			 * Other paths in this file take pmtx before the bucket
			 * mutex.  The bucket mutex is already held here, so
			 * only try for pmtx and skip the entry if it is busy.
			 */
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			/* Unlink from the per-seg/amp list and the hash chain. */
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		/* Stop once as many pages as the whole window are collected. */
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			/*
			 * When purging only because of low memory, re-check
			 * free memory every 16 non-empty buckets and stop
			 * early if the shortage is over.
			 */
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	/* The walk is finished; let the next caller walk the lists. */
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		/* Only non-wired entries are counted in the window total. */
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	/* Give the released pages back to the global counters. */
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}
1188 
1189 /*
1190  * Remove cached pages for segment(s) entries from hashtable.  The segments
1191  * are identified by pp array. This is useful for multiple seg's cached on
1192  * behalf of dummy segment (ISM/DISM) with common pp array.
1193  */
1194 void
seg_ppurge_wiredpp(struct page ** pp)1195 seg_ppurge_wiredpp(struct page **pp)
1196 {
1197 	struct seg_pcache *pcp;
1198 	struct seg_phash_wired *hp;
1199 	pgcnt_t npages = 0;
1200 	struct	seg_pcache *delcallb_list = NULL;
1201 
1202 	/*
1203 	 * if the cache is empty, return
1204 	 */
1205 	if (seg_plocked == 0) {
1206 		return;
1207 	}
1208 	ASSERT(seg_phashsize_wired != 0);
1209 
1210 	for (hp = seg_phashtab_wired;
1211 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1213 			continue;
1214 		}
1215 		mutex_enter(&hp->p_hmutex);
1216 		pcp = hp->p_hnext;
1217 		while (pcp != (struct seg_pcache *)hp) {
1218 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219 			ASSERT(IS_PCP_WIRED(pcp));
1220 			/*
1221 			 * purge entries which are not active
1222 			 */
1223 			if (!pcp->p_active && pcp->p_pp == pp) {
1224 				ASSERT(pcp->p_htag0 != NULL);
1225 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1226 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1227 				pcp->p_hprev = delcallb_list;
1228 				delcallb_list = pcp;
1229 			}
1230 			pcp = pcp->p_hnext;
1231 		}
1232 		mutex_exit(&hp->p_hmutex);
1233 		/*
1234 		 * segments can't go away until callback is executed since
1235 		 * they must have non 0 softlockcnt. That's why we don't
1236 		 * need to hold as/seg locks to execute the callback.
1237 		 */
1238 		while (delcallb_list != NULL) {
1239 			int done;
1240 			pcp = delcallb_list;
1241 			delcallb_list = pcp->p_hprev;
1242 			ASSERT(!pcp->p_active);
1243 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244 			    pcp->p_len, pcp->p_pp,
1245 			    pcp->p_write ? S_WRITE : S_READ, 1);
1246 			npages += btop(pcp->p_len);
1247 			ASSERT(IS_PCP_WIRED(pcp));
1248 			kmem_cache_free(seg_pkmcache, pcp);
1249 			if (done) {
1250 				ASSERT(delcallb_list == NULL);
1251 				goto out;
1252 			}
1253 		}
1254 	}
1255 
1256 out:
1257 	mutex_enter(&seg_pmem_mtx);
1258 	ASSERT(seg_plocked >= npages);
1259 	seg_plocked -= npages;
1260 	mutex_exit(&seg_pmem_mtx);
1261 }
1262 
1263 /*
1264  * purge all entries for a given segment. Since we
1265  * callback into the segment driver directly for page
1266  * reclaim the caller needs to hold the right locks.
1267  */
1268 void
seg_ppurge(struct seg * seg,struct anon_map * amp,uint_t flags)1269 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270 {
1271 	struct seg_pcache *delcallb_list = NULL;
1272 	struct seg_pcache *pcp;
1273 	struct seg_phash *hp;
1274 	pgcnt_t npages = 0;
1275 	void *htag0;
1276 
1277 	if (seg_plocked == 0) {
1278 		return;
1279 	}
1280 	ASSERT(seg_phashsize_win != 0);
1281 
1282 	/*
1283 	 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284 	 * as a lookup tag.
1285 	 */
1286 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287 	ASSERT(htag0 != NULL);
1288 	if (IS_PFLAGS_WIRED(flags)) {
1289 		hp = P_HASHBP(seg, htag0, 0, flags);
1290 		mutex_enter(&hp->p_hmutex);
1291 		pcp = hp->p_hnext;
1292 		while (pcp != (struct seg_pcache *)hp) {
1293 			ASSERT(pcp->p_hashp == hp);
1294 			ASSERT(IS_PCP_WIRED(pcp));
1295 			if (pcp->p_htag0 == htag0) {
1296 				if (pcp->p_active) {
1297 					break;
1298 				}
1299 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1300 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1301 				pcp->p_hprev = delcallb_list;
1302 				delcallb_list = pcp;
1303 			}
1304 			pcp = pcp->p_hnext;
1305 		}
1306 		mutex_exit(&hp->p_hmutex);
1307 	} else {
1308 		pcache_link_t *plinkp;
1309 		pcache_link_t *pheadp;
1310 		kmutex_t *pmtx;
1311 
1312 		if (amp == NULL) {
1313 			ASSERT(seg != NULL);
1314 			pheadp = &seg->s_phead;
1315 			pmtx = &seg->s_pmtx;
1316 		} else {
1317 			pheadp = &amp->a_phead;
1318 			pmtx = &amp->a_pmtx;
1319 		}
1320 		mutex_enter(pmtx);
1321 		while ((plinkp = pheadp->p_lnext) != pheadp) {
1322 			pcp = plink2pcache(plinkp);
1323 			ASSERT(!IS_PCP_WIRED(pcp));
1324 			ASSERT(pcp->p_htag0 == htag0);
1325 			hp = pcp->p_hashp;
1326 			mutex_enter(&hp->p_hmutex);
1327 			if (pcp->p_active) {
1328 				mutex_exit(&hp->p_hmutex);
1329 				break;
1330 			}
1331 			ASSERT(plinkp->p_lprev == pheadp);
1332 			pheadp->p_lnext = plinkp->p_lnext;
1333 			plinkp->p_lnext->p_lprev = pheadp;
1334 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1335 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1336 			pcp->p_hprev = delcallb_list;
1337 			delcallb_list = pcp;
1338 			if (hp->p_hnext == (struct seg_pcache *)hp) {
1339 				seg_premove_abuck(hp, 0);
1340 			}
1341 			mutex_exit(&hp->p_hmutex);
1342 		}
1343 		mutex_exit(pmtx);
1344 	}
1345 	while (delcallb_list != NULL) {
1346 		pcp = delcallb_list;
1347 		delcallb_list = pcp->p_hprev;
1348 		ASSERT(!pcp->p_active);
1349 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351 		npages += btop(pcp->p_len);
1352 		kmem_cache_free(seg_pkmcache, pcp);
1353 	}
1354 	mutex_enter(&seg_pmem_mtx);
1355 	ASSERT(seg_plocked >= npages);
1356 	seg_plocked -= npages;
1357 	if (!IS_PFLAGS_WIRED(flags)) {
1358 		ASSERT(seg_plocked_window >= npages);
1359 		seg_plocked_window -= npages;
1360 	}
1361 	mutex_exit(&seg_pmem_mtx);
1362 }
1363 
/* Forward declaration; seg_pinit() calls this before its definition. */
static void seg_pinit_mem_config(void);
1365 
1366 /*
1367  * setup the pagelock cache
1368  */
1369 static void
seg_pinit(void)1370 seg_pinit(void)
1371 {
1372 	struct seg_phash *hp;
1373 	ulong_t i;
1374 	pgcnt_t physmegs;
1375 
1376 	seg_plocked = 0;
1377 	seg_plocked_window = 0;
1378 
1379 	if (segpcache_enabled == 0) {
1380 		seg_phashsize_win = 0;
1381 		seg_phashsize_wired = 0;
1382 		seg_pdisabled = 1;
1383 		return;
1384 	}
1385 
1386 	seg_pdisabled = 0;
1387 	seg_pkmcache = kmem_cache_create("seg_pcache",
1388 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389 	if (segpcache_pcp_maxage_ticks <= 0) {
1390 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391 	}
1392 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393 	seg_pathr_empty_ahb = 0;
1394 	seg_pathr_full_ahb = 0;
1395 	seg_pshrink_shift = segpcache_shrink_shift;
1396 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397 
1398 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402 
1403 	physmegs = physmem >> (20 - PAGESHIFT);
1404 
1405 	/*
1406 	 * If segpcache_hashsize_win was not set in /etc/system or it has
1407 	 * absurd value set it to a default.
1408 	 */
1409 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410 		/*
1411 		 * Create one bucket per 32K (or at least per 8 pages) of
1412 		 * available memory.
1413 		 */
1414 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416 	}
1417 	if (!ISP2(segpcache_hashsize_win)) {
1418 		ulong_t rndfac = ~(1UL <<
1419 		    (highbit(segpcache_hashsize_win) - 1));
1420 		rndfac &= segpcache_hashsize_win;
1421 		segpcache_hashsize_win += rndfac;
1422 		segpcache_hashsize_win = 1 <<
1423 		    (highbit(segpcache_hashsize_win) - 1);
1424 	}
1425 	seg_phashsize_win = segpcache_hashsize_win;
1426 	seg_phashtab_win = kmem_zalloc(
1427 	    seg_phashsize_win * sizeof (struct seg_phash),
1428 	    KM_SLEEP);
1429 	for (i = 0; i < seg_phashsize_win; i++) {
1430 		hp = &seg_phashtab_win[i];
1431 		hp->p_hnext = (struct seg_pcache *)hp;
1432 		hp->p_hprev = (struct seg_pcache *)hp;
1433 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434 	}
1435 
1436 	seg_pahcur = 0;
1437 	seg_pathr_on = 0;
1438 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442 
1443 	/*
1444 	 * If segpcache_hashsize_wired was not set in /etc/system or it has
1445 	 * absurd value set it to a default.
1446 	 */
1447 	if (segpcache_hashsize_wired == 0 ||
1448 	    segpcache_hashsize_wired > physmem / 4) {
1449 		/*
1450 		 * Choose segpcache_hashsize_wired based on physmem.
1451 		 * Create a bucket per 128K bytes upto 256K buckets.
1452 		 */
1453 		if (physmegs < 20 * 1024) {
1454 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455 		} else {
1456 			segpcache_hashsize_wired = 256 * 1024;
1457 		}
1458 	}
1459 	if (!ISP2(segpcache_hashsize_wired)) {
1460 		segpcache_hashsize_wired = 1 <<
1461 		    highbit(segpcache_hashsize_wired);
1462 	}
1463 	seg_phashsize_wired = segpcache_hashsize_wired;
1464 	seg_phashtab_wired = kmem_zalloc(
1465 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466 	for (i = 0; i < seg_phashsize_wired; i++) {
1467 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468 		hp->p_hnext = (struct seg_pcache *)hp;
1469 		hp->p_hprev = (struct seg_pcache *)hp;
1470 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471 	}
1472 
1473 	if (segpcache_maxwindow == 0) {
1474 		if (physmegs < 64) {
1475 			/* 3% of memory */
1476 			segpcache_maxwindow = availrmem >> 5;
1477 		} else if (physmegs < 512) {
1478 			/* 12% of memory */
1479 			segpcache_maxwindow = availrmem >> 3;
1480 		} else if (physmegs < 1024) {
1481 			/* 25% of memory */
1482 			segpcache_maxwindow = availrmem >> 2;
1483 		} else if (physmegs < 2048) {
1484 			/* 50% of memory */
1485 			segpcache_maxwindow = availrmem >> 1;
1486 		} else {
1487 			/* no limit */
1488 			segpcache_maxwindow = (pgcnt_t)-1;
1489 		}
1490 	}
1491 	seg_pmaxwindow = segpcache_maxwindow;
1492 	seg_pinit_mem_config();
1493 }
1494 
1495 /*
1496  * called by pageout if memory is low
1497  */
1498 void
seg_preap(void)1499 seg_preap(void)
1500 {
1501 	/*
1502 	 * if the cache is off or empty, return
1503 	 */
1504 	if (seg_plocked_window == 0) {
1505 		return;
1506 	}
1507 	ASSERT(seg_phashsize_win != 0);
1508 
1509 	/*
1510 	 * If somebody is already purging pcache
1511 	 * just return.
1512 	 */
1513 	if (seg_pdisabled) {
1514 		return;
1515 	}
1516 
1517 	cv_signal(&seg_pasync_cv);
1518 }
1519 
1520 /*
1521  * run as a backgroud thread and reclaim pagelock
1522  * pages which have not been used recently
1523  */
1524 void
seg_pasync_thread(void)1525 seg_pasync_thread(void)
1526 {
1527 	callb_cpr_t cpr_info;
1528 
1529 	if (seg_phashsize_win == 0) {
1530 		thread_exit();
1531 		/*NOTREACHED*/
1532 	}
1533 
1534 	seg_pasync_thr = curthread;
1535 
1536 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537 	    callb_generic_cpr, "seg_pasync");
1538 
1539 	if (segpcache_reap_ticks <= 0) {
1540 		segpcache_reap_ticks = segpcache_reap_sec * hz;
1541 	}
1542 
1543 	mutex_enter(&seg_pasync_mtx);
1544 	for (;;) {
1545 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546 		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547 		    segpcache_reap_ticks, TR_CLOCK_TICK);
1548 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549 		if (seg_pdisabled == 0) {
1550 			seg_ppurge_async(0);
1551 		}
1552 	}
1553 }
1554 
/* kmem cache for struct seg; created in seg_init(), used by seg_alloc(). */
static struct kmem_cache *seg_cache;
1556 
1557 /*
1558  * Initialize segment management data structures.
1559  */
1560 void
seg_init(void)1561 seg_init(void)
1562 {
1563 	kstat_t *ksp;
1564 
1565 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566 	    0, NULL, NULL, NULL, NULL, NULL, 0);
1567 
1568 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570 	if (ksp) {
1571 		ksp->ks_data = (void *)segadvstat_ptr;
1572 		kstat_install(ksp);
1573 	}
1574 
1575 	seg_pinit();
1576 }
1577 
1578 /*
1579  * Allocate a segment to cover [base, base+size]
1580  * and attach it to the specified address space.
1581  */
1582 struct seg *
seg_alloc(struct as * as,caddr_t base,size_t size)1583 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 {
1585 	struct seg *new;
1586 	caddr_t segbase;
1587 	size_t segsize;
1588 
1589 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591 	    (uintptr_t)segbase;
1592 
1593 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594 		return ((struct seg *)NULL);	/* bad virtual addr range */
1595 
1596 	if (as != &kas &&
1597 	    valid_usr_range(segbase, segsize, 0, as,
1598 	    as->a_userlimit) != RANGE_OKAY)
1599 		return ((struct seg *)NULL);	/* bad virtual addr range */
1600 
1601 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602 	new->s_ops = NULL;
1603 	new->s_data = NULL;
1604 	new->s_szc = 0;
1605 	new->s_flags = 0;
1606 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607 	new->s_phead.p_lnext = &new->s_phead;
1608 	new->s_phead.p_lprev = &new->s_phead;
1609 	if (seg_attach(as, segbase, segsize, new) < 0) {
1610 		kmem_cache_free(seg_cache, new);
1611 		return ((struct seg *)NULL);
1612 	}
1613 	/* caller must fill in ops, data */
1614 	return (new);
1615 }
1616 
1617 /*
1618  * Attach a segment to the address space.  Used by seg_alloc()
1619  * and for kernel startup to attach to static segments.
1620  */
1621 int
seg_attach(struct as * as,caddr_t base,size_t size,struct seg * seg)1622 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 {
1624 	seg->s_as = as;
1625 	seg->s_base = base;
1626 	seg->s_size = size;
1627 
1628 	/*
1629 	 * as_addseg() will add the segment at the appropraite point
1630 	 * in the list. It will return -1 if there is overlap with
1631 	 * an already existing segment.
1632 	 */
1633 	return (as_addseg(as, seg));
1634 }
1635 
1636 /*
1637  * Unmap a segment and free it from its associated address space.
1638  * This should be called by anybody who's finished with a whole segment's
1639  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
1640  * responsibility of the segment driver to unlink the the segment
1641  * from the address space, and to free public and private data structures
1642  * associated with the segment.  (This is typically done by a call to
1643  * seg_free()).
1644  */
1645 void
seg_unmap(struct seg * seg)1646 seg_unmap(struct seg *seg)
1647 {
1648 #ifdef DEBUG
1649 	int ret;
1650 #endif /* DEBUG */
1651 
1652 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1653 
1654 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
1655 	ASSERT(seg->s_data != NULL);
1656 
1657 	/* Unmap the whole mapping */
1658 #ifdef DEBUG
1659 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1660 	ASSERT(ret == 0);
1661 #else
1662 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1663 #endif /* DEBUG */
1664 }
1665 
1666 /*
1667  * Free the segment from its associated as. This should only be called
1668  * if a mapping to the segment has not yet been established (e.g., if
1669  * an error occurs in the middle of doing an as_map when the segment
1670  * has already been partially set up) or if it has already been deleted
1671  * (e.g., from a segment driver unmap routine if the unmap applies to the
1672  * entire segment). If the mapping is currently set up then seg_unmap() should
1673  * be called instead.
1674  */
1675 void
seg_free(struct seg * seg)1676 seg_free(struct seg *seg)
1677 {
1678 	register struct as *as = seg->s_as;
1679 	struct seg *tseg = as_removeseg(as, seg);
1680 
1681 	ASSERT(tseg == seg);
1682 
1683 	/*
1684 	 * If the segment private data field is NULL,
1685 	 * then segment driver is not attached yet.
1686 	 */
1687 	if (seg->s_data != NULL)
1688 		SEGOP_FREE(seg);
1689 
1690 	mutex_destroy(&seg->s_pmtx);
1691 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693 	kmem_cache_free(seg_cache, seg);
1694 }
1695 
1696 /*ARGSUSED*/
1697 static void
seg_p_mem_config_post_add(void * arg,pgcnt_t delta_pages)1698 seg_p_mem_config_post_add(
1699 	void *arg,
1700 	pgcnt_t delta_pages)
1701 {
1702 	/* Nothing to do. */
1703 }
1704 
1705 void
seg_p_enable(void)1706 seg_p_enable(void)
1707 {
1708 	mutex_enter(&seg_pcache_mtx);
1709 	ASSERT(seg_pdisabled != 0);
1710 	seg_pdisabled--;
1711 	mutex_exit(&seg_pcache_mtx);
1712 }
1713 
1714 /*
1715  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716  * cache.
1717  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718  * SEGP_FAIL if the cache could not be emptied.
1719  */
1720 int
seg_p_disable(void)1721 seg_p_disable(void)
1722 {
1723 	pgcnt_t	old_plocked;
1724 	int stall_count = 0;
1725 
1726 	mutex_enter(&seg_pcache_mtx);
1727 	seg_pdisabled++;
1728 	ASSERT(seg_pdisabled != 0);
1729 	mutex_exit(&seg_pcache_mtx);
1730 
1731 	/*
1732 	 * Attempt to empty the cache. Terminate if seg_plocked does not
1733 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734 	 */
1735 	while (seg_plocked != 0) {
1736 		ASSERT(seg_phashsize_win != 0);
1737 		old_plocked = seg_plocked;
1738 		seg_ppurge_async(1);
1739 		if (seg_plocked == old_plocked) {
1740 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741 				return (SEGP_FAIL);
1742 			}
1743 		} else
1744 			stall_count = 0;
1745 		if (seg_plocked != 0)
1746 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747 	}
1748 	return (SEGP_SUCCESS);
1749 }
1750 
1751 /*
1752  * Attempt to purge seg_pcache.  May need to return before this has
1753  * completed to allow other pre_del callbacks to unlock pages. This is
1754  * ok because:
1755  *	1) The seg_pdisabled flag has been set so at least we won't
1756  *	cache anymore locks and the locks we couldn't purge
1757  *	will not be held if they do get released by a subsequent
1758  *	pre-delete callback.
1759  *
1760  *	2) The rest of the memory delete thread processing does not
1761  *	depend on the changes made in this pre-delete callback. No
1762  *	panics will result, the worst that will happen is that the
1763  *	DR code will timeout and cancel the delete.
1764  */
1765 /*ARGSUSED*/
1766 static int
seg_p_mem_config_pre_del(void * arg,pgcnt_t delta_pages)1767 seg_p_mem_config_pre_del(
1768 	void *arg,
1769 	pgcnt_t delta_pages)
1770 {
1771 	if (seg_phashsize_win == 0) {
1772 		return (0);
1773 	}
1774 	if (seg_p_disable() != SEGP_SUCCESS)
1775 		cmn_err(CE_NOTE,
1776 		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
1777 	return (0);
1778 }
1779 
1780 /*ARGSUSED*/
1781 static void
seg_p_mem_config_post_del(void * arg,pgcnt_t delta_pages,int cancelled)1782 seg_p_mem_config_post_del(
1783 	void *arg,
1784 	pgcnt_t delta_pages,
1785 	int cancelled)
1786 {
1787 	if (seg_phashsize_win == 0) {
1788 		return;
1789 	}
1790 	seg_p_enable();
1791 }
1792 
/*
 * Memory configuration callbacks of the pagelock cache, handed to
 * kphysm_setup_func_register() by seg_pinit_mem_config().
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};
1799 
1800 static void
seg_pinit_mem_config(void)1801 seg_pinit_mem_config(void)
1802 {
1803 	int ret;
1804 
1805 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806 	/*
1807 	 * Want to catch this in the debug kernel. At run time, if the
1808 	 * callbacks don't get run all will be OK as the disable just makes
1809 	 * it more likely that the pages can be collected.
1810 	 */
1811 	ASSERT(ret == 0);
1812 }
1813 
1814 /*
1815  * Verify that segment is not a shared anonymous segment which reserves
1816  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1817  * from one zone to another if any segments are shared.  This is because the
1818  * last process to exit will credit the swap reservation.  This could lead
1819  * to the swap being reserved by one zone, and credited to another.
1820  */
1821 boolean_t
seg_can_change_zones(struct seg * seg)1822 seg_can_change_zones(struct seg *seg)
1823 {
1824 	struct segvn_data *svd;
1825 
1826 	if (seg->s_ops == &segspt_shmops)
1827 		return (B_FALSE);
1828 
1829 	if (seg->s_ops == &segvn_ops) {
1830 		svd = (struct segvn_data *)seg->s_data;
1831 		if (svd->type == MAP_SHARED &&
1832 		    svd->amp != NULL &&
1833 		    svd->amp->swresv > 0)
1834 			return (B_FALSE);
1835 	}
1836 	return (B_TRUE);
1837 }
1838 
1839 /*
1840  * Return swap reserved by a segment backing a private mapping.
1841  */
1842 size_t
seg_swresv(struct seg * seg)1843 seg_swresv(struct seg *seg)
1844 {
1845 	struct segvn_data *svd;
1846 	size_t swap = 0;
1847 
1848 	if (seg->s_ops == &segvn_ops) {
1849 		svd = (struct segvn_data *)seg->s_data;
1850 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851 			swap = svd->swresv;
1852 	}
1853 	return (swap);
1854 }
1855 
1856 /*
1857  * General not supported function for SEGOP_INHERIT
1858  */
1859 /* ARGSUSED */
1860 int
seg_inherit_notsup(struct seg * seg,caddr_t addr,size_t len,uint_t op)1861 seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862 {
1863 	return (ENOTSUP);
1864 }
1865