17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
50209230bSgjelinek * Common Development and Distribution License (the "License").
60209230bSgjelinek * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate /*
22d3d50737SRafael Vanoni * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
237c478bd9Sstevel@tonic-gate * Use is subject to license terms.
24*15c07adcSJohn Levon * Copyright (c) 2018, Joyent, Inc.
257c478bd9Sstevel@tonic-gate */
267c478bd9Sstevel@tonic-gate
277c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
287c478bd9Sstevel@tonic-gate /* All Rights Reserved */
297c478bd9Sstevel@tonic-gate
307c478bd9Sstevel@tonic-gate /*
317c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988
327c478bd9Sstevel@tonic-gate * The Regents of the University of California
337c478bd9Sstevel@tonic-gate * All Rights Reserved
347c478bd9Sstevel@tonic-gate *
357c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from
367c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its
377c478bd9Sstevel@tonic-gate * contributors.
387c478bd9Sstevel@tonic-gate */
397c478bd9Sstevel@tonic-gate
407c478bd9Sstevel@tonic-gate /*
417c478bd9Sstevel@tonic-gate * VM - segment management.
427c478bd9Sstevel@tonic-gate */
437c478bd9Sstevel@tonic-gate
447c478bd9Sstevel@tonic-gate #include <sys/types.h>
457c478bd9Sstevel@tonic-gate #include <sys/inttypes.h>
467c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
477c478bd9Sstevel@tonic-gate #include <sys/param.h>
487c478bd9Sstevel@tonic-gate #include <sys/systm.h>
497c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
50a98e9dbfSaguzovsk #include <sys/sysmacros.h>
517c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>
52a98e9dbfSaguzovsk #include <sys/tuneable.h>
537c478bd9Sstevel@tonic-gate #include <sys/debug.h>
54a98e9dbfSaguzovsk #include <sys/fs/swapnode.h>
557c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
567c478bd9Sstevel@tonic-gate #include <sys/callb.h>
577c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
580209230bSgjelinek #include <sys/mman.h>
597c478bd9Sstevel@tonic-gate
607c478bd9Sstevel@tonic-gate #include <vm/hat.h>
617c478bd9Sstevel@tonic-gate #include <vm/as.h>
627c478bd9Sstevel@tonic-gate #include <vm/seg.h>
637c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
640209230bSgjelinek #include <vm/seg_spt.h>
650209230bSgjelinek #include <vm/seg_vn.h>
66a98e9dbfSaguzovsk #include <vm/anon.h>
67a98e9dbfSaguzovsk
687c478bd9Sstevel@tonic-gate /*
697c478bd9Sstevel@tonic-gate * kstats for segment advise
707c478bd9Sstevel@tonic-gate */
/* MADV_FREE hit/miss counters, exported through the kstat framework. */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

/* Flat view of the counters above plus their count, for kstat registration. */
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
787c478bd9Sstevel@tonic-gate
797c478bd9Sstevel@tonic-gate /*
807c478bd9Sstevel@tonic-gate * entry in the segment page cache
817c478bd9Sstevel@tonic-gate */
/*
 * entry in the segment page cache: one cached shadow page list for a
 * pagelocked address range of a segment (or anon map).
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;	/* (also reused as the link of a */
					/* private reclaim list while an */
					/* entry is off its hash chain -- */
					/* see seg_plookup_checkdup()) */
	pcache_link_t p_plink;		/* per segment/amp list */
	void *p_htag0;			/* segment/amp pointer */
	caddr_t p_addr;			/* base address/anon_idx */
	size_t p_len;			/* total bytes */
	size_t p_wlen;			/* writtable bytes at p_addr */
	struct page **p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
	clock_t p_lbolt;		/* lbolt from last use */
	struct seg_phash *p_hashp;	/* our pcache hash bucket */
	uint_t p_active;		/* active count */
	uchar_t p_write;		/* true if S_WRITE */
	uchar_t p_ref;			/* reference byte */
	ushort_t p_flags;		/* bit flags */
};
997c478bd9Sstevel@tonic-gate
/*
 * pcache hash bucket for non wired entries. p_hnext/p_hprev must stay the
 * first two members: chain walkers use the bucket itself as the list
 * sentinel by casting it to (struct seg_pcache *) (see seg_plookup()).
 */
struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	kmutex_t p_hmutex;		/* protects hash bucket */
	pcache_link_t p_halink[2];	/* active bucket linkages */
};

/*
 * Bucket for wired (SEGP_FORCE_WIRED) entries: same leading layout but no
 * active list linkage -- wired buckets are never placed on the active
 * bucket lists (seg_padd_abuck() asserts its bucket is a windowed one).
 */
struct seg_phash_wired {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	kmutex_t p_hmutex;		/* protects hash bucket */
};
1127c478bd9Sstevel@tonic-gate
/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time (1G).
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 * NOTE(review): the zero defaults are presumably replaced by computed
 * values at pcache init time -- confirm against the init code (not shown).
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
138a98e9dbfSaguzovsk
static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

/*
 * The three pctrl structs below are each aligned to a 64 byte boundary so
 * that variables protected by different locks (or read-mostly vs. frequently
 * written) live on separate cache lines.
 */
#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 * pctrl1 holds read-mostly configuration established at init time.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];			/* pad struct to a full cache line */
#endif /* _LP64 */
} pctrl1;

/* pctrl2 holds counters and lists updated under p_mem_mtx. */
static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t p_locked_win;	/* # pages from window */
	pgcnt_t p_locked;	/* # of pages cached by pagelock */
	uchar_t p_ahcur;	/* current active links for insert/delete */
	uchar_t p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

/* pctrl3 holds purge/reap state used mainly by the async thread. */
static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];			/* pad struct to a full cache line */
#endif /* _LP64 */
} pctrl3;

/*
 * Convenience aliases so the rest of the file can use flat seg_p* names
 * while the storage stays grouped by cache line as above.
 */
#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

/* Bucket index masks; both hash table sizes are powers of two. */
#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)
kthread_t *seg_pasync_thr;	/* asynchronous reclaim thread */

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

/* Wired (SEGP_FORCE_WIRED) requests/entries use the wired hash table. */
#define	IS_PFLAGS_WIRED(flags)	((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp)	IS_PFLAGS_WIRED((pcp)->p_flags)

/* Age (in ticks) of timestamp t relative to the current lbolt. */
#define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * htag0 argument can be a seg or amp pointer.
 * Wired requests hash by tag only; windowed requests fold in the address
 * shifted by either the page-size shift encoded in the upper flag bits
 * (when SEGP_PSHIFT is set) or the segment's large-page shift.
 */
#define	P_HASHBP(seg, htag0, addr, flags)			\
	(IS_PFLAGS_WIRED((flags)) ?				\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :		\
	    (&seg_phashtab_win[P_HASHWIN_MASK &			\
	    (((uintptr_t)(htag0) >> 3) ^			\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?	\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))

/*
 * htag0 argument can be a seg or amp pointer.
 * A cached entry matches if it starts at the same address and covers at
 * least the requested length.
 */
#define	P_MATCH(pcp, htag0, addr, len)				\
	((pcp)->p_htag0 == (htag0) &&				\
	(pcp)->p_addr == (addr) &&				\
	(pcp)->p_len >= (len))

/* As P_MATCH, but additionally requires the same shadow list pointer. */
#define	P_MATCH_PP(pcp, htag0, addr, len, pp)			\
	((pcp)->p_pp == (pp) &&					\
	(pcp)->p_htag0 == (htag0) &&				\
	(pcp)->p_addr == (addr) &&				\
	(pcp)->p_len >= (len))

/* Recover containing structures from embedded pcache_link_t members. */
#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) - \
    offsetof(struct seg_phash, p_halink[l])))
2497c478bd9Sstevel@tonic-gate
2507c478bd9Sstevel@tonic-gate /*
251a98e9dbfSaguzovsk * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252a98e9dbfSaguzovsk * active hash bucket lists. We maintain active bucket lists to reduce the
253a98e9dbfSaguzovsk * overhead of finding active buckets during asynchronous purging since there
254a98e9dbfSaguzovsk * can be 10s of millions of buckets on a large system but only a small subset
255a98e9dbfSaguzovsk * of them in actual use.
256a98e9dbfSaguzovsk *
257a98e9dbfSaguzovsk * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258a98e9dbfSaguzovsk * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259a98e9dbfSaguzovsk * buckets. The other list is used by asynchronous purge thread. This allows
260a98e9dbfSaguzovsk * the purge thread to walk its active list without holding seg_pmem_mtx for a
261a98e9dbfSaguzovsk * long time. When asynchronous thread is done with its list it switches to
262a98e9dbfSaguzovsk * current active list and makes the list it just finished processing as
263a98e9dbfSaguzovsk * current active list.
264a98e9dbfSaguzovsk *
265a98e9dbfSaguzovsk * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266a98e9dbfSaguzovsk * yet on any list. seg_premove_abuck() may remove the bucket from either
267a98e9dbfSaguzovsk * list. If the bucket is on current list it will be always removed. Otherwise
268a98e9dbfSaguzovsk * the bucket is only removed if asynchronous purge thread is not currently
269a98e9dbfSaguzovsk * running or seg_premove_abuck() is called by asynchronous purge thread
270a98e9dbfSaguzovsk * itself. A given bucket can only be on one of active lists at a time. These
271a98e9dbfSaguzovsk * routines should be called with per bucket lock held. The routines use
272a98e9dbfSaguzovsk * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273a98e9dbfSaguzovsk * the first entry is added to the bucket chain and seg_premove_abuck() must
274a98e9dbfSaguzovsk * be called after the last pcp entry is deleted from its chain. Per bucket
275a98e9dbfSaguzovsk * lock should be held by the callers. This avoids a potential race condition
276a98e9dbfSaguzovsk * when seg_premove_abuck() removes a bucket after pcp entries are added to
277a98e9dbfSaguzovsk * its list after the caller checked that the bucket has no entries. (this
278a98e9dbfSaguzovsk * race would cause a loss of an active bucket from the active lists).
279a98e9dbfSaguzovsk *
280a98e9dbfSaguzovsk * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281a98e9dbfSaguzovsk * New entries are added to the end of the list since LRU is used as the
282a98e9dbfSaguzovsk * purging policy.
283a98e9dbfSaguzovsk */
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	/*
	 * Caller holds the per bucket lock and the bucket chain must hold
	 * exactly one non wired entry (p_hnext == p_hprev, neither equal to
	 * the bucket sentinel): we are called right after the first pcp
	 * entry was added to the chain. The bucket must be a windowed one;
	 * wired buckets are never tracked on the active lists.
	 */
	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket can already be on one of active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * before.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		/* Already on the current active list; nothing to do. */
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix async thread can't yet remove
	 * it since we hold here per bucket lock. In this case just return
	 * since async thread will eventually find and process this bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of current active list
	 * (tail insertion implements the LRU purge order).
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}
337a98e9dbfSaguzovsk
static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	/*
	 * Caller holds the per bucket lock and the bucket chain must be
	 * empty (sentinel points to itself): we are called right after the
	 * last pcp entry was removed from the chain. athr is non zero when
	 * the caller is the asynchronous purge thread itself.
	 */
	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by asynchronous thread that found this bucket
		 * on not currently active (i.e. !seg_pahcur) list. Remove it
		 * from there. Per bucket lock we are holding makes sure
		 * seg_pinsert() can't sneak in and add pcp entries to this
		 * bucket right before we remove the bucket from its list.
		 *
		 * NOTE(review): this path updates the inactive list without
		 * taking seg_pmem_mtx; presumably safe because only the
		 * purge thread walks that list (see the block comment above
		 * seg_padd_abuck()) -- confirm before relying on it.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If asynchronous thread is not running we can remove the bucket from
	 * not currently active list. The bucket must be on this list since we
	 * already checked that it's not on the other list and the bucket from
	 * which we just deleted the last pcp entry must be still on one of the
	 * active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	/*
	 * When the async thread is running we deliberately leave the bucket
	 * on its list; the thread will find the (now empty) bucket and the
	 * next seg_padd_abuck() call tolerates the leftover linkage.
	 */
	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}
411a98e9dbfSaguzovsk
412a98e9dbfSaguzovsk /*
413a98e9dbfSaguzovsk * Check if bucket pointed by hp already has a pcp entry that matches request
414a98e9dbfSaguzovsk * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415a98e9dbfSaguzovsk * Also delete matching entries that cover smaller address range but start
416a98e9dbfSaguzovsk * at the same address as addr argument. Return the list of deleted entries if
417a98e9dbfSaguzovsk * any. This is an internal helper function called from seg_pinsert() only
418a98e9dbfSaguzovsk * for non wired shadow lists. The caller already holds a per seg/amp list
419a98e9dbfSaguzovsk * lock.
420a98e9dbfSaguzovsk */
421a98e9dbfSaguzovsk static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash * hp,void * htag0,caddr_t addr,size_t len,int * found)422a98e9dbfSaguzovsk seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423a98e9dbfSaguzovsk caddr_t addr, size_t len, int *found)
424a98e9dbfSaguzovsk {
425a98e9dbfSaguzovsk struct seg_pcache *pcp;
426a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL;
427a98e9dbfSaguzovsk
428a98e9dbfSaguzovsk ASSERT(MUTEX_HELD(&hp->p_hmutex));
429a98e9dbfSaguzovsk
430a98e9dbfSaguzovsk *found = 0;
431a98e9dbfSaguzovsk for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432a98e9dbfSaguzovsk pcp = pcp->p_hnext) {
433a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp);
434a98e9dbfSaguzovsk if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp));
436a98e9dbfSaguzovsk if (pcp->p_len < len) {
437a98e9dbfSaguzovsk pcache_link_t *plinkp;
438a98e9dbfSaguzovsk if (pcp->p_active) {
439a98e9dbfSaguzovsk continue;
440a98e9dbfSaguzovsk }
441a98e9dbfSaguzovsk plinkp = &pcp->p_plink;
442a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext;
445a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev;
446a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list;
447a98e9dbfSaguzovsk delcallb_list = pcp;
448a98e9dbfSaguzovsk } else {
449a98e9dbfSaguzovsk *found = 1;
450a98e9dbfSaguzovsk break;
451a98e9dbfSaguzovsk }
452a98e9dbfSaguzovsk }
453a98e9dbfSaguzovsk }
454a98e9dbfSaguzovsk return (delcallb_list);
455a98e9dbfSaguzovsk }
456a98e9dbfSaguzovsk
457a98e9dbfSaguzovsk /*
458a98e9dbfSaguzovsk * lookup an address range in pagelock cache. Return shadow list and bump up
459a98e9dbfSaguzovsk * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460a98e9dbfSaguzovsk * as a lookup tag.
4617c478bd9Sstevel@tonic-gate */
4627c478bd9Sstevel@tonic-gate struct page **
seg_plookup(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,enum seg_rw rw,uint_t flags)463a98e9dbfSaguzovsk seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464a98e9dbfSaguzovsk enum seg_rw rw, uint_t flags)
4657c478bd9Sstevel@tonic-gate {
4667c478bd9Sstevel@tonic-gate struct seg_pcache *pcp;
4677c478bd9Sstevel@tonic-gate struct seg_phash *hp;
468a98e9dbfSaguzovsk void *htag0;
469a98e9dbfSaguzovsk
470a98e9dbfSaguzovsk ASSERT(seg != NULL);
471a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE);
4727c478bd9Sstevel@tonic-gate
4737c478bd9Sstevel@tonic-gate /*
4747c478bd9Sstevel@tonic-gate * Skip pagelock cache, while DR is in progress or
4757c478bd9Sstevel@tonic-gate * seg_pcache is off.
4767c478bd9Sstevel@tonic-gate */
477a98e9dbfSaguzovsk if (seg_pdisabled) {
4787c478bd9Sstevel@tonic-gate return (NULL);
4797c478bd9Sstevel@tonic-gate }
480a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0);
4817c478bd9Sstevel@tonic-gate
482a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, addr, flags);
4847c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex);
4857c478bd9Sstevel@tonic-gate for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
4867c478bd9Sstevel@tonic-gate pcp = pcp->p_hnext) {
487a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp);
488a98e9dbfSaguzovsk if (P_MATCH(pcp, htag0, addr, len)) {
489a98e9dbfSaguzovsk ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490a98e9dbfSaguzovsk /*
491a98e9dbfSaguzovsk * If this request wants to write pages
492a98e9dbfSaguzovsk * but write permissions starting from
493a98e9dbfSaguzovsk * addr don't cover the entire length len
494a98e9dbfSaguzovsk * return lookup failure back to the caller.
495a98e9dbfSaguzovsk * It will check protections and fail this
496a98e9dbfSaguzovsk * pagelock operation with EACCESS error.
497a98e9dbfSaguzovsk */
498a98e9dbfSaguzovsk if (rw == S_WRITE && pcp->p_wlen < len) {
499a98e9dbfSaguzovsk break;
500a98e9dbfSaguzovsk }
501a98e9dbfSaguzovsk if (pcp->p_active == UINT_MAX) {
502a98e9dbfSaguzovsk break;
503a98e9dbfSaguzovsk }
5047c478bd9Sstevel@tonic-gate pcp->p_active++;
505a98e9dbfSaguzovsk if (rw == S_WRITE && !pcp->p_write) {
506a98e9dbfSaguzovsk pcp->p_write = 1;
507a98e9dbfSaguzovsk }
5087c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex);
5097c478bd9Sstevel@tonic-gate return (pcp->p_pp);
5107c478bd9Sstevel@tonic-gate }
5117c478bd9Sstevel@tonic-gate }
5127c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex);
5137c478bd9Sstevel@tonic-gate return (NULL);
5147c478bd9Sstevel@tonic-gate }
5157c478bd9Sstevel@tonic-gate
5167c478bd9Sstevel@tonic-gate /*
517a98e9dbfSaguzovsk * mark address range inactive. If the cache is off or the address range is
518a98e9dbfSaguzovsk * not in the cache or another shadow list that covers bigger range is found
519a98e9dbfSaguzovsk * we call the segment driver to reclaim the pages. Otherwise just decrement
520a98e9dbfSaguzovsk * active count and set ref bit. If amp is not NULL use amp as a lookup tag
521a98e9dbfSaguzovsk * otherwise use seg as a lookup tag.
5227c478bd9Sstevel@tonic-gate */
5237c478bd9Sstevel@tonic-gate void
seg_pinactive(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,struct page ** pp,enum seg_rw rw,uint_t flags,seg_preclaim_cbfunc_t callback)524a98e9dbfSaguzovsk seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525a98e9dbfSaguzovsk size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback)
5277c478bd9Sstevel@tonic-gate {
5287c478bd9Sstevel@tonic-gate struct seg_pcache *pcp;
5297c478bd9Sstevel@tonic-gate struct seg_phash *hp;
530a98e9dbfSaguzovsk kmutex_t *pmtx = NULL;
531a98e9dbfSaguzovsk pcache_link_t *pheadp;
532a98e9dbfSaguzovsk void *htag0;
533a98e9dbfSaguzovsk pgcnt_t npages = 0;
534a98e9dbfSaguzovsk int keep = 0;
5357c478bd9Sstevel@tonic-gate
536a98e9dbfSaguzovsk ASSERT(seg != NULL);
537a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE);
538a98e9dbfSaguzovsk
539a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540a98e9dbfSaguzovsk
541a98e9dbfSaguzovsk /*
542a98e9dbfSaguzovsk * Skip lookup if pcache is not configured.
543a98e9dbfSaguzovsk */
544a98e9dbfSaguzovsk if (seg_phashsize_win == 0) {
545a98e9dbfSaguzovsk goto out;
546a98e9dbfSaguzovsk }
547a98e9dbfSaguzovsk
548a98e9dbfSaguzovsk /*
549a98e9dbfSaguzovsk * Grab per seg/amp lock before hash lock if we are going to remove
550a98e9dbfSaguzovsk * inactive entry from pcache.
551a98e9dbfSaguzovsk */
552a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553a98e9dbfSaguzovsk if (amp == NULL) {
554a98e9dbfSaguzovsk pheadp = &seg->s_phead;
555a98e9dbfSaguzovsk pmtx = &seg->s_pmtx;
556a98e9dbfSaguzovsk } else {
557a98e9dbfSaguzovsk pheadp = &->a_phead;
558a98e9dbfSaguzovsk pmtx = &->a_pmtx;
559a98e9dbfSaguzovsk }
560a98e9dbfSaguzovsk mutex_enter(pmtx);
5617c478bd9Sstevel@tonic-gate }
562a98e9dbfSaguzovsk
563a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, addr, flags);
5647c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex);
565a98e9dbfSaguzovsk again:
5667c478bd9Sstevel@tonic-gate for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
5677c478bd9Sstevel@tonic-gate pcp = pcp->p_hnext) {
568a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp);
569a98e9dbfSaguzovsk if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570a98e9dbfSaguzovsk ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571a98e9dbfSaguzovsk ASSERT(pcp->p_active);
572a98e9dbfSaguzovsk if (keep) {
573a98e9dbfSaguzovsk /*
574a98e9dbfSaguzovsk * Don't remove this pcp entry
575a98e9dbfSaguzovsk * if we didn't find duplicate
576a98e9dbfSaguzovsk * shadow lists on second search.
577a98e9dbfSaguzovsk * Somebody removed those duplicates
578a98e9dbfSaguzovsk * since we dropped hash lock after first
579a98e9dbfSaguzovsk * search.
580a98e9dbfSaguzovsk */
581a98e9dbfSaguzovsk ASSERT(pmtx != NULL);
582a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags));
583a98e9dbfSaguzovsk mutex_exit(pmtx);
584a98e9dbfSaguzovsk pmtx = NULL;
585a98e9dbfSaguzovsk }
5867c478bd9Sstevel@tonic-gate pcp->p_active--;
587a98e9dbfSaguzovsk if (pcp->p_active == 0 && (pmtx != NULL ||
588a98e9dbfSaguzovsk (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589a98e9dbfSaguzovsk
590a98e9dbfSaguzovsk /*
591a98e9dbfSaguzovsk * This entry is no longer active. Remove it
592a98e9dbfSaguzovsk * now either because pcaching is temporarily
593a98e9dbfSaguzovsk * disabled or there're other pcp entries that
594a98e9dbfSaguzovsk * can match this pagelock request (i.e. this
595a98e9dbfSaguzovsk * entry is a duplicate).
596a98e9dbfSaguzovsk */
5977c478bd9Sstevel@tonic-gate
5987c478bd9Sstevel@tonic-gate ASSERT(callback == pcp->p_callback);
599a98e9dbfSaguzovsk if (pmtx != NULL) {
600a98e9dbfSaguzovsk pcache_link_t *plinkp = &pcp->p_plink;
601a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp));
602a98e9dbfSaguzovsk ASSERT(pheadp->p_lnext != pheadp);
603a98e9dbfSaguzovsk ASSERT(pheadp->p_lprev != pheadp);
604a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext =
605a98e9dbfSaguzovsk plinkp->p_lnext;
606a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev =
607a98e9dbfSaguzovsk plinkp->p_lprev;
608a98e9dbfSaguzovsk }
6097c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext;
6107c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev;
611a98e9dbfSaguzovsk if (!IS_PCP_WIRED(pcp) &&
612a98e9dbfSaguzovsk hp->p_hnext == (struct seg_pcache *)hp) {
613a98e9dbfSaguzovsk /*
614a98e9dbfSaguzovsk * We removed the last entry from this
615a98e9dbfSaguzovsk * bucket. Now remove the bucket from
616a98e9dbfSaguzovsk * its active list.
617a98e9dbfSaguzovsk */
618a98e9dbfSaguzovsk seg_premove_abuck(hp, 0);
619a98e9dbfSaguzovsk }
6207c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex);
621a98e9dbfSaguzovsk if (pmtx != NULL) {
622a98e9dbfSaguzovsk mutex_exit(pmtx);
6237c478bd9Sstevel@tonic-gate }
624a98e9dbfSaguzovsk len = pcp->p_len;
625a98e9dbfSaguzovsk npages = btop(len);
626a98e9dbfSaguzovsk if (rw != S_WRITE && pcp->p_write) {
627a98e9dbfSaguzovsk rw = S_WRITE;
628a98e9dbfSaguzovsk }
629a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp);
6307c478bd9Sstevel@tonic-gate goto out;
631a98e9dbfSaguzovsk } else {
632a98e9dbfSaguzovsk /*
633a98e9dbfSaguzovsk * We found a matching pcp entry but will not
634a98e9dbfSaguzovsk * free it right away even if it's no longer
635a98e9dbfSaguzovsk * active.
636a98e9dbfSaguzovsk */
637a98e9dbfSaguzovsk if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638a98e9dbfSaguzovsk /*
639a98e9dbfSaguzovsk * Set the reference bit and mark the
640a98e9dbfSaguzovsk * time of last access to this pcp
641a98e9dbfSaguzovsk * so that asynchronous thread doesn't
642a98e9dbfSaguzovsk * free it immediately since
643a98e9dbfSaguzovsk * it may be reactivated very soon.
644a98e9dbfSaguzovsk */
645d3d50737SRafael Vanoni pcp->p_lbolt = ddi_get_lbolt();
646a98e9dbfSaguzovsk pcp->p_ref = 1;
647a98e9dbfSaguzovsk }
648a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
649a98e9dbfSaguzovsk if (pmtx != NULL) {
650a98e9dbfSaguzovsk mutex_exit(pmtx);
651a98e9dbfSaguzovsk }
652a98e9dbfSaguzovsk return;
653a98e9dbfSaguzovsk }
654a98e9dbfSaguzovsk } else if (!IS_PFLAGS_WIRED(flags) &&
655a98e9dbfSaguzovsk P_MATCH(pcp, htag0, addr, len)) {
656a98e9dbfSaguzovsk /*
657a98e9dbfSaguzovsk * This is a duplicate pcp entry. This situation may
658a98e9dbfSaguzovsk * happen if a bigger shadow list that covers our
659a98e9dbfSaguzovsk * range was added while our entry was still active.
660a98e9dbfSaguzovsk * Now we can free our pcp entry if it becomes
661a98e9dbfSaguzovsk * inactive.
662a98e9dbfSaguzovsk */
663a98e9dbfSaguzovsk if (!pcp->p_active) {
664a98e9dbfSaguzovsk /*
665a98e9dbfSaguzovsk * Mark this entry as referenced just in case
666a98e9dbfSaguzovsk * we'll free our own pcp entry soon.
667a98e9dbfSaguzovsk */
668d3d50737SRafael Vanoni pcp->p_lbolt = ddi_get_lbolt();
669a98e9dbfSaguzovsk pcp->p_ref = 1;
670a98e9dbfSaguzovsk }
671a98e9dbfSaguzovsk if (pmtx != NULL) {
672a98e9dbfSaguzovsk /*
673a98e9dbfSaguzovsk * we are already holding pmtx and found a
674a98e9dbfSaguzovsk * duplicate. Don't keep our own pcp entry.
675a98e9dbfSaguzovsk */
676a98e9dbfSaguzovsk keep = 0;
677a98e9dbfSaguzovsk continue;
678a98e9dbfSaguzovsk }
679a98e9dbfSaguzovsk /*
680a98e9dbfSaguzovsk * We have to use mutex_tryenter to attempt to lock
681a98e9dbfSaguzovsk * seg/amp list lock since we already hold hash lock
682a98e9dbfSaguzovsk * and seg/amp list lock is above hash lock in lock
683a98e9dbfSaguzovsk * order. If mutex_tryenter fails drop hash lock and
684a98e9dbfSaguzovsk * retake both locks in correct order and research
685a98e9dbfSaguzovsk * this hash chain.
686a98e9dbfSaguzovsk */
687a98e9dbfSaguzovsk ASSERT(keep == 0);
688a98e9dbfSaguzovsk if (amp == NULL) {
689a98e9dbfSaguzovsk pheadp = &seg->s_phead;
690a98e9dbfSaguzovsk pmtx = &seg->s_pmtx;
691a98e9dbfSaguzovsk } else {
692a98e9dbfSaguzovsk pheadp = &->a_phead;
693a98e9dbfSaguzovsk pmtx = &->a_pmtx;
694a98e9dbfSaguzovsk }
695a98e9dbfSaguzovsk if (!mutex_tryenter(pmtx)) {
696a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
697a98e9dbfSaguzovsk mutex_enter(pmtx);
698a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex);
699a98e9dbfSaguzovsk /*
700a98e9dbfSaguzovsk * If we don't find bigger shadow list on
701a98e9dbfSaguzovsk * second search (it may happen since we
702a98e9dbfSaguzovsk * dropped bucket lock) keep the entry that
703a98e9dbfSaguzovsk * matches our own shadow list.
704a98e9dbfSaguzovsk */
705a98e9dbfSaguzovsk keep = 1;
706a98e9dbfSaguzovsk goto again;
7077c478bd9Sstevel@tonic-gate }
7087c478bd9Sstevel@tonic-gate }
7097c478bd9Sstevel@tonic-gate }
7107c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex);
711a98e9dbfSaguzovsk if (pmtx != NULL) {
712a98e9dbfSaguzovsk mutex_exit(pmtx);
713a98e9dbfSaguzovsk }
7147c478bd9Sstevel@tonic-gate out:
715a98e9dbfSaguzovsk (*callback)(htag0, addr, len, pp, rw, 0);
716a98e9dbfSaguzovsk if (npages) {
717a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx);
718a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages);
719a98e9dbfSaguzovsk seg_plocked -= npages;
720a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) {
721a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages);
722a98e9dbfSaguzovsk seg_plocked_window -= npages;
723a98e9dbfSaguzovsk }
724a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
725a98e9dbfSaguzovsk }
726a98e9dbfSaguzovsk
7277c478bd9Sstevel@tonic-gate }
7287c478bd9Sstevel@tonic-gate
#ifdef DEBUG
/*
 * Fault-injection knob ("mean time between failures") for DEBUG kernels:
 * when set non-zero, seg_pinsert_check() randomly reports SEGP_FAIL about
 * once every p_insert_chk_mtbf calls to exercise callers' failure paths.
 */
static uint32_t p_insert_chk_mtbf = 0;
#endif
732a98e9dbfSaguzovsk
7337c478bd9Sstevel@tonic-gate /*
7347c478bd9Sstevel@tonic-gate * The seg_pinsert_check() is used by segment drivers to predict whether
7357c478bd9Sstevel@tonic-gate * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
7367c478bd9Sstevel@tonic-gate */
737a98e9dbfSaguzovsk /*ARGSUSED*/
7387c478bd9Sstevel@tonic-gate int
seg_pinsert_check(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,uint_t flags)739a98e9dbfSaguzovsk seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740a98e9dbfSaguzovsk size_t len, uint_t flags)
7417c478bd9Sstevel@tonic-gate {
742a98e9dbfSaguzovsk ASSERT(seg != NULL);
7437c478bd9Sstevel@tonic-gate
744a98e9dbfSaguzovsk #ifdef DEBUG
745a98e9dbfSaguzovsk if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
7467c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
7477c478bd9Sstevel@tonic-gate }
748a98e9dbfSaguzovsk #endif
749a98e9dbfSaguzovsk
750a98e9dbfSaguzovsk if (seg_pdisabled) {
7517c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
7527c478bd9Sstevel@tonic-gate }
753a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0);
754a98e9dbfSaguzovsk
755a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) {
756a98e9dbfSaguzovsk return (SEGP_SUCCESS);
757a98e9dbfSaguzovsk }
758a98e9dbfSaguzovsk
759a98e9dbfSaguzovsk if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
7607c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
7617c478bd9Sstevel@tonic-gate }
7627c478bd9Sstevel@tonic-gate
763a98e9dbfSaguzovsk if (freemem < desfree) {
764a98e9dbfSaguzovsk return (SEGP_FAIL);
7657c478bd9Sstevel@tonic-gate }
766a98e9dbfSaguzovsk
7677c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS);
7687c478bd9Sstevel@tonic-gate }
7697c478bd9Sstevel@tonic-gate
#ifdef DEBUG
/*
 * Fault-injection knob for DEBUG kernels: when set non-zero, seg_pinsert()
 * randomly fails about once every p_insert_mtbf calls so that segment
 * drivers' SEGP_FAIL handling gets exercised.
 */
static uint32_t p_insert_mtbf = 0;
#endif
7737c478bd9Sstevel@tonic-gate
7747c478bd9Sstevel@tonic-gate /*
775a98e9dbfSaguzovsk * Insert address range with shadow list into pagelock cache if there's no
776a98e9dbfSaguzovsk * shadow list already cached for this address range. If the cache is off or
777a98e9dbfSaguzovsk * caching is temporarily disabled or the allowed 'window' is exceeded return
778a98e9dbfSaguzovsk * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779a98e9dbfSaguzovsk *
780a98e9dbfSaguzovsk * For non wired shadow lists (segvn case) include address in the hashing
781a98e9dbfSaguzovsk * function to avoid linking all the entries from the same segment or amp on
782a98e9dbfSaguzovsk * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
783a98e9dbfSaguzovsk * pcache entries are also linked on a per segment/amp list so that all
784a98e9dbfSaguzovsk * entries can be found quickly during seg/amp purge without walking the
785a98e9dbfSaguzovsk * entire pcache hash table. For wired shadow lists (segspt case) we
786a98e9dbfSaguzovsk * don't use address hashing and per segment linking because the caller
787a98e9dbfSaguzovsk * currently inserts only one entry per segment that covers the entire
788a98e9dbfSaguzovsk * segment. If we used per segment linking even for segspt it would complicate
789a98e9dbfSaguzovsk * seg_ppurge_wiredpp() locking.
790a98e9dbfSaguzovsk *
791a98e9dbfSaguzovsk * Both hash bucket and per seg/amp locks need to be held before adding a non
792a98e9dbfSaguzovsk * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793a98e9dbfSaguzovsk * first.
794a98e9dbfSaguzovsk *
795a98e9dbfSaguzovsk * This function will also remove from pcache old inactive shadow lists that
796a98e9dbfSaguzovsk * overlap with this request but cover smaller range for the same start
797a98e9dbfSaguzovsk * address.
7987c478bd9Sstevel@tonic-gate */
7997c478bd9Sstevel@tonic-gate int
seg_pinsert(struct seg * seg,struct anon_map * amp,caddr_t addr,size_t len,size_t wlen,struct page ** pp,enum seg_rw rw,uint_t flags,seg_preclaim_cbfunc_t callback)800a98e9dbfSaguzovsk seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801a98e9dbfSaguzovsk size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback)
8037c478bd9Sstevel@tonic-gate {
8047c478bd9Sstevel@tonic-gate struct seg_pcache *pcp;
8057c478bd9Sstevel@tonic-gate struct seg_phash *hp;
8067c478bd9Sstevel@tonic-gate pgcnt_t npages;
807a98e9dbfSaguzovsk pcache_link_t *pheadp;
808a98e9dbfSaguzovsk kmutex_t *pmtx;
809a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL;
8107c478bd9Sstevel@tonic-gate
811a98e9dbfSaguzovsk ASSERT(seg != NULL);
812a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE);
813a98e9dbfSaguzovsk ASSERT(rw == S_READ || wlen == len);
814a98e9dbfSaguzovsk ASSERT(rw == S_WRITE || wlen <= len);
815a98e9dbfSaguzovsk ASSERT(amp == NULL || wlen == len);
816a98e9dbfSaguzovsk
817a98e9dbfSaguzovsk #ifdef DEBUG
818a98e9dbfSaguzovsk if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
8197c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
8207c478bd9Sstevel@tonic-gate }
821a98e9dbfSaguzovsk #endif
822a98e9dbfSaguzovsk
823a98e9dbfSaguzovsk if (seg_pdisabled) {
8247c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
8257c478bd9Sstevel@tonic-gate }
826a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0);
827a98e9dbfSaguzovsk
8287c478bd9Sstevel@tonic-gate ASSERT((len & PAGEOFFSET) == 0);
829a98e9dbfSaguzovsk npages = btop(len);
830a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx);
831a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) {
832a98e9dbfSaguzovsk if (seg_plocked_window + npages > seg_pmaxwindow) {
833a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
8347c478bd9Sstevel@tonic-gate return (SEGP_FAIL);
8357c478bd9Sstevel@tonic-gate }
836a98e9dbfSaguzovsk seg_plocked_window += npages;
8377c478bd9Sstevel@tonic-gate }
8387c478bd9Sstevel@tonic-gate seg_plocked += npages;
839a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
8407c478bd9Sstevel@tonic-gate
841a98e9dbfSaguzovsk pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842a98e9dbfSaguzovsk /*
843a98e9dbfSaguzovsk * If amp is not NULL set htag0 to amp otherwise set it to seg.
844a98e9dbfSaguzovsk */
845a98e9dbfSaguzovsk if (amp == NULL) {
846a98e9dbfSaguzovsk pcp->p_htag0 = (void *)seg;
847a98e9dbfSaguzovsk pcp->p_flags = flags & 0xffff;
848a98e9dbfSaguzovsk } else {
849a98e9dbfSaguzovsk pcp->p_htag0 = (void *)amp;
850a98e9dbfSaguzovsk pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851a98e9dbfSaguzovsk }
8527c478bd9Sstevel@tonic-gate pcp->p_addr = addr;
8537c478bd9Sstevel@tonic-gate pcp->p_len = len;
854a98e9dbfSaguzovsk pcp->p_wlen = wlen;
8557c478bd9Sstevel@tonic-gate pcp->p_pp = pp;
856a98e9dbfSaguzovsk pcp->p_write = (rw == S_WRITE);
8577c478bd9Sstevel@tonic-gate pcp->p_callback = callback;
8587c478bd9Sstevel@tonic-gate pcp->p_active = 1;
8597c478bd9Sstevel@tonic-gate
860a98e9dbfSaguzovsk hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) {
862a98e9dbfSaguzovsk int found;
863a98e9dbfSaguzovsk void *htag0;
864a98e9dbfSaguzovsk if (amp == NULL) {
865a98e9dbfSaguzovsk pheadp = &seg->s_phead;
866a98e9dbfSaguzovsk pmtx = &seg->s_pmtx;
867a98e9dbfSaguzovsk htag0 = (void *)seg;
868a98e9dbfSaguzovsk } else {
869a98e9dbfSaguzovsk pheadp = &->a_phead;
870a98e9dbfSaguzovsk pmtx = &->a_pmtx;
871a98e9dbfSaguzovsk htag0 = (void *)amp;
872a98e9dbfSaguzovsk }
873a98e9dbfSaguzovsk mutex_enter(pmtx);
874a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex);
875a98e9dbfSaguzovsk delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876a98e9dbfSaguzovsk len, &found);
877a98e9dbfSaguzovsk if (found) {
878a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
879a98e9dbfSaguzovsk mutex_exit(pmtx);
880a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx);
881a98e9dbfSaguzovsk seg_plocked -= npages;
882a98e9dbfSaguzovsk seg_plocked_window -= npages;
883a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
884a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp);
885a98e9dbfSaguzovsk goto out;
886a98e9dbfSaguzovsk }
887a98e9dbfSaguzovsk pcp->p_plink.p_lnext = pheadp->p_lnext;
888a98e9dbfSaguzovsk pcp->p_plink.p_lprev = pheadp;
889a98e9dbfSaguzovsk pheadp->p_lnext->p_lprev = &pcp->p_plink;
890a98e9dbfSaguzovsk pheadp->p_lnext = &pcp->p_plink;
891a98e9dbfSaguzovsk } else {
892a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex);
893a98e9dbfSaguzovsk }
894a98e9dbfSaguzovsk pcp->p_hashp = hp;
8957c478bd9Sstevel@tonic-gate pcp->p_hnext = hp->p_hnext;
8967c478bd9Sstevel@tonic-gate pcp->p_hprev = (struct seg_pcache *)hp;
8977c478bd9Sstevel@tonic-gate hp->p_hnext->p_hprev = pcp;
8987c478bd9Sstevel@tonic-gate hp->p_hnext = pcp;
899a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) &&
900a98e9dbfSaguzovsk hp->p_hprev == pcp) {
901a98e9dbfSaguzovsk seg_padd_abuck(hp);
902a98e9dbfSaguzovsk }
9037c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex);
904a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) {
905a98e9dbfSaguzovsk mutex_exit(pmtx);
906a98e9dbfSaguzovsk }
907a98e9dbfSaguzovsk
908a98e9dbfSaguzovsk out:
909a98e9dbfSaguzovsk npages = 0;
910a98e9dbfSaguzovsk while (delcallb_list != NULL) {
911a98e9dbfSaguzovsk pcp = delcallb_list;
912a98e9dbfSaguzovsk delcallb_list = pcp->p_hprev;
913a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916a98e9dbfSaguzovsk npages += btop(pcp->p_len);
917a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp);
918a98e9dbfSaguzovsk }
919a98e9dbfSaguzovsk if (npages) {
920a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags));
921a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx);
922a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages);
923a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages);
924a98e9dbfSaguzovsk seg_plocked -= npages;
925a98e9dbfSaguzovsk seg_plocked_window -= npages;
926a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
927a98e9dbfSaguzovsk }
928a98e9dbfSaguzovsk
9297c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS);
9307c478bd9Sstevel@tonic-gate }
9317c478bd9Sstevel@tonic-gate
9327c478bd9Sstevel@tonic-gate /*
933a98e9dbfSaguzovsk * purge entries from the pagelock cache if not active
934a98e9dbfSaguzovsk * and not recently used.
9357c478bd9Sstevel@tonic-gate */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}

	if (!force) {
		/*
		 * Decide whether to purge at all and how much.  "lowmem" is
		 * set from increasingly strict freemem thresholds (scaled by
		 * how large the pcache window already is); "trim" is set when
		 * the window is above 7/8 of its maximum.
		 */
		lowmem = 0;
		trim = 0;
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		/*
		 * Target a fixed fraction of the window per pass, capped by
		 * seg_pmaxapurge_npages (the cap is raised to desfree when
		 * memory is low).
		 */
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		/*
		 * force: scan the entire wired hashtable and collect every
		 * inactive wired entry for reclaim.
		 */
		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			/* skip empty buckets without taking the lock */
			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				/*
				 * Unlink from the bucket and chain onto
				 * delcallb_list through p_hprev; p_hnext
				 * still points into the bucket so the loop
				 * iteration above remains valid.
				 */
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	/*
	 * Only one async purge walk at a time; if another is already in
	 * progress just run the callbacks for what we collected above.
	 */
	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	/*
	 * There are two active-bucket lists; seg_pahcur selects the one new
	 * buckets are currently added to.  Walk the other (older) one first.
	 */
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			/*
			 * Give recently referenced entries a second chance
			 * unless forced: clear the reference bit and move on.
			 */
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			/*
			 * The per seg/amp lock is above the hash lock in
			 * lock order, so with the hash lock held we may only
			 * try-enter it; skip the entry on contention.
			 */
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			/* unlink from the per seg/amp list ... */
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			/* ... and from the hash bucket */
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			/* bucket drained: drop it from the active list */
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		/* stop once enough pages have been collected */
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			/*
			 * In the lowmem (non-trim) case recheck freemem every
			 * 16 non-empty buckets and stop early if pressure has
			 * subsided.
			 */
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}
11887c478bd9Sstevel@tonic-gate
11897c478bd9Sstevel@tonic-gate /*
1190a98e9dbfSaguzovsk * Remove cached pages for segment(s) entries from hashtable. The segments
1191a98e9dbfSaguzovsk * are identified by pp array. This is useful for multiple seg's cached on
1192a98e9dbfSaguzovsk * behalf of dummy segment (ISM/DISM) with common pp array.
11937c478bd9Sstevel@tonic-gate */
void
seg_ppurge_wiredpp(struct page **pp)
{
	struct seg_pcache *pcp;
	struct seg_phash_wired *hp;
	pgcnt_t npages = 0;
	struct seg_pcache *delcallb_list = NULL;

	/*
	 * if the cache is empty, return
	 */
	if (seg_plocked == 0) {
		return;
	}
	ASSERT(seg_phashsize_wired != 0);

	/*
	 * Walk every bucket of the wired hashtable; matching entries are
	 * identified by their shared pp array rather than by seg/amp tag.
	 */
	for (hp = seg_phashtab_wired;
	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
		/* skip empty buckets without taking the bucket lock */
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			continue;
		}
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
			ASSERT(IS_PCP_WIRED(pcp));
			/*
			 * purge entries which are not active
			 */
			if (!pcp->p_active && pcp->p_pp == pp) {
				ASSERT(pcp->p_htag0 != NULL);
				/*
				 * Unlink from the bucket and chain onto
				 * delcallb_list via p_hprev; p_hnext still
				 * points into the bucket so the advance
				 * below remains valid.
				 */
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		/*
		 * segments can't go away until callback is executed since
		 * they must have non 0 softlockcnt. That's why we don't
		 * need to hold as/seg locks to execute the callback.
		 */
		while (delcallb_list != NULL) {
			int done;
			pcp = delcallb_list;
			delcallb_list = pcp->p_hprev;
			ASSERT(!pcp->p_active);
			/*
			 * A non-zero return from the callback means this was
			 * the last entry for the pp array, so the scan can
			 * stop early.
			 */
			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
			    pcp->p_len, pcp->p_pp,
			    pcp->p_write ? S_WRITE : S_READ, 1);
			npages += btop(pcp->p_len);
			ASSERT(IS_PCP_WIRED(pcp));
			kmem_cache_free(seg_pkmcache, pcp);
			if (done) {
				ASSERT(delcallb_list == NULL);
				goto out;
			}
		}
	}

out:
	/* release the page accounting for everything reclaimed above */
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	mutex_exit(&seg_pmem_mtx);
}
12627c478bd9Sstevel@tonic-gate
12637c478bd9Sstevel@tonic-gate /*
12647c478bd9Sstevel@tonic-gate * purge all entries for a given segment. Since we
12657c478bd9Sstevel@tonic-gate * callback into the segment driver directly for page
12667c478bd9Sstevel@tonic-gate * reclaim the caller needs to hold the right locks.
12677c478bd9Sstevel@tonic-gate */
12687c478bd9Sstevel@tonic-gate void
seg_ppurge(struct seg * seg,struct anon_map * amp,uint_t flags)1269a98e9dbfSaguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
12707c478bd9Sstevel@tonic-gate {
12717c478bd9Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL;
12727c478bd9Sstevel@tonic-gate struct seg_pcache *pcp;
12737c478bd9Sstevel@tonic-gate struct seg_phash *hp;
12747c478bd9Sstevel@tonic-gate pgcnt_t npages = 0;
1275a98e9dbfSaguzovsk void *htag0;
12767c478bd9Sstevel@tonic-gate
1277a98e9dbfSaguzovsk if (seg_plocked == 0) {
12787c478bd9Sstevel@tonic-gate return;
12797c478bd9Sstevel@tonic-gate }
1280a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0);
1281a98e9dbfSaguzovsk
1282a98e9dbfSaguzovsk /*
1283a98e9dbfSaguzovsk * If amp is not NULL use amp as a lookup tag otherwise use seg
1284a98e9dbfSaguzovsk * as a lookup tag.
1285a98e9dbfSaguzovsk */
1286a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287a98e9dbfSaguzovsk ASSERT(htag0 != NULL);
1288a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) {
1289a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, 0, flags);
1290a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex);
1291a98e9dbfSaguzovsk pcp = hp->p_hnext;
1292a98e9dbfSaguzovsk while (pcp != (struct seg_pcache *)hp) {
1293a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp);
1294a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp));
1295a98e9dbfSaguzovsk if (pcp->p_htag0 == htag0) {
1296a98e9dbfSaguzovsk if (pcp->p_active) {
1297a98e9dbfSaguzovsk break;
1298a98e9dbfSaguzovsk }
1299a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext;
1300a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev;
1301a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list;
1302a98e9dbfSaguzovsk delcallb_list = pcp;
1303a98e9dbfSaguzovsk }
1304a98e9dbfSaguzovsk pcp = pcp->p_hnext;
1305a98e9dbfSaguzovsk }
1306a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
1307a98e9dbfSaguzovsk } else {
1308a98e9dbfSaguzovsk pcache_link_t *plinkp;
1309a98e9dbfSaguzovsk pcache_link_t *pheadp;
1310a98e9dbfSaguzovsk kmutex_t *pmtx;
1311a98e9dbfSaguzovsk
1312a98e9dbfSaguzovsk if (amp == NULL) {
1313a98e9dbfSaguzovsk ASSERT(seg != NULL);
1314a98e9dbfSaguzovsk pheadp = &seg->s_phead;
1315a98e9dbfSaguzovsk pmtx = &seg->s_pmtx;
1316a98e9dbfSaguzovsk } else {
1317a98e9dbfSaguzovsk pheadp = &->a_phead;
1318a98e9dbfSaguzovsk pmtx = &->a_pmtx;
1319a98e9dbfSaguzovsk }
1320a98e9dbfSaguzovsk mutex_enter(pmtx);
1321a98e9dbfSaguzovsk while ((plinkp = pheadp->p_lnext) != pheadp) {
1322a98e9dbfSaguzovsk pcp = plink2pcache(plinkp);
1323a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp));
1324a98e9dbfSaguzovsk ASSERT(pcp->p_htag0 == htag0);
1325a98e9dbfSaguzovsk hp = pcp->p_hashp;
1326a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex);
13277c478bd9Sstevel@tonic-gate if (pcp->p_active) {
1328a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
13297c478bd9Sstevel@tonic-gate break;
13307c478bd9Sstevel@tonic-gate }
1331a98e9dbfSaguzovsk ASSERT(plinkp->p_lprev == pheadp);
1332a98e9dbfSaguzovsk pheadp->p_lnext = plinkp->p_lnext;
1333a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = pheadp;
13347c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext;
13357c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev;
13367c478bd9Sstevel@tonic-gate pcp->p_hprev = delcallb_list;
13377c478bd9Sstevel@tonic-gate delcallb_list = pcp;
1338a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) {
1339a98e9dbfSaguzovsk seg_premove_abuck(hp, 0);
1340a98e9dbfSaguzovsk }
1341a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex);
13427c478bd9Sstevel@tonic-gate }
1343a98e9dbfSaguzovsk mutex_exit(pmtx);
13447c478bd9Sstevel@tonic-gate }
13457c478bd9Sstevel@tonic-gate while (delcallb_list != NULL) {
13467c478bd9Sstevel@tonic-gate pcp = delcallb_list;
13477c478bd9Sstevel@tonic-gate delcallb_list = pcp->p_hprev;
1348a98e9dbfSaguzovsk ASSERT(!pcp->p_active);
1349a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350a98e9dbfSaguzovsk pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351a98e9dbfSaguzovsk npages += btop(pcp->p_len);
1352a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp);
13537c478bd9Sstevel@tonic-gate }
1354a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx);
1355a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages);
13567c478bd9Sstevel@tonic-gate seg_plocked -= npages;
1357a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) {
1358a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages);
1359a98e9dbfSaguzovsk seg_plocked_window -= npages;
1360a98e9dbfSaguzovsk }
1361a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx);
13627c478bd9Sstevel@tonic-gate }
13637c478bd9Sstevel@tonic-gate
/* Forward declaration; defined below, after the DR callback vector. */
static void seg_pinit_mem_config(void);

/*
 * setup the pagelock cache: initialize counters, locks, and both hash
 * tables (windowed and wired), sizing them from /etc/system tunables
 * or from physical memory when the tunables are unset/absurd.
 */
static void
seg_pinit(void)
{
	struct seg_phash *hp;
	ulong_t i;
	pgcnt_t physmegs;

	/* No pages are cached until entries are inserted. */
	seg_plocked = 0;
	seg_plocked_window = 0;

	if (segpcache_enabled == 0) {
		/*
		 * Cache administratively disabled: leave both hash
		 * tables unallocated and mark pcache disabled so the
		 * rest of the code takes the early bail-out paths.
		 */
		seg_phashsize_win = 0;
		seg_phashsize_wired = 0;
		seg_pdisabled = 1;
		return;
	}

	seg_pdisabled = 0;
	seg_pkmcache = kmem_cache_create("seg_pcache",
	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (segpcache_pcp_maxage_ticks <= 0) {
		/* Convert the seconds tunable into clock ticks. */
		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
	}
	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
	seg_pathr_empty_ahb = 0;
	seg_pathr_full_ahb = 0;
	seg_pshrink_shift = segpcache_shrink_shift;
	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);

	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);

	physmegs = physmem >> (20 - PAGESHIFT);

	/*
	 * If segpcache_hashsize_win was not set in /etc/system or it has
	 * absurd value set it to a default.
	 */
	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
		/*
		 * Create one bucket per 32K (or at least per 8 pages) of
		 * available memory.
		 */
		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
	}
	if (!ISP2(segpcache_hashsize_win)) {
		/* Force the bucket count to a power of two. */
		ulong_t rndfac = ~(1UL <<
		    (highbit(segpcache_hashsize_win) - 1));
		rndfac &= segpcache_hashsize_win;
		segpcache_hashsize_win += rndfac;
		segpcache_hashsize_win = 1 <<
		    (highbit(segpcache_hashsize_win) - 1);
	}
	seg_phashsize_win = segpcache_hashsize_win;
	seg_phashtab_win = kmem_zalloc(
	    seg_phashsize_win * sizeof (struct seg_phash),
	    KM_SLEEP);
	for (i = 0; i < seg_phashsize_win; i++) {
		/* Each bucket starts as an empty circular list. */
		hp = &seg_phashtab_win[i];
		hp->p_hnext = (struct seg_pcache *)hp;
		hp->p_hprev = (struct seg_pcache *)hp;
		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
	}

	/* Active-bucket lists used by the async reclaim thread. */
	seg_pahcur = 0;
	seg_pathr_on = 0;
	seg_pahhead[0].p_lnext = &seg_pahhead[0];
	seg_pahhead[0].p_lprev = &seg_pahhead[0];
	seg_pahhead[1].p_lnext = &seg_pahhead[1];
	seg_pahhead[1].p_lprev = &seg_pahhead[1];

	/*
	 * If segpcache_hashsize_wired was not set in /etc/system or it has
	 * absurd value set it to a default.
	 */
	if (segpcache_hashsize_wired == 0 ||
	    segpcache_hashsize_wired > physmem / 4) {
		/*
		 * Choose segpcache_hashsize_wired based on physmem.
		 * Create a bucket per 128K bytes up to 256K buckets.
		 */
		if (physmegs < 20 * 1024) {
			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
		} else {
			segpcache_hashsize_wired = 256 * 1024;
		}
	}
	if (!ISP2(segpcache_hashsize_wired)) {
		/* Round to a power of two (see highbit(9F)). */
		segpcache_hashsize_wired = 1 <<
		    highbit(segpcache_hashsize_wired);
	}
	seg_phashsize_wired = segpcache_hashsize_wired;
	seg_phashtab_wired = kmem_zalloc(
	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
	for (i = 0; i < seg_phashsize_wired; i++) {
		hp = (struct seg_phash *)&seg_phashtab_wired[i];
		hp->p_hnext = (struct seg_pcache *)hp;
		hp->p_hprev = (struct seg_pcache *)hp;
		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
	}

	/*
	 * Scale the cache's maximum page window with available memory
	 * when the tunable was not set explicitly.
	 */
	if (segpcache_maxwindow == 0) {
		if (physmegs < 64) {
			/* 3% of memory */
			segpcache_maxwindow = availrmem >> 5;
		} else if (physmegs < 512) {
			/* 12% of memory */
			segpcache_maxwindow = availrmem >> 3;
		} else if (physmegs < 1024) {
			/* 25% of memory */
			segpcache_maxwindow = availrmem >> 2;
		} else if (physmegs < 2048) {
			/* 50% of memory */
			segpcache_maxwindow = availrmem >> 1;
		} else {
			/* no limit */
			segpcache_maxwindow = (pgcnt_t)-1;
		}
	}
	seg_pmaxwindow = segpcache_maxwindow;
	seg_pinit_mem_config();
}
14947c478bd9Sstevel@tonic-gate
14957c478bd9Sstevel@tonic-gate /*
14967c478bd9Sstevel@tonic-gate * called by pageout if memory is low
14977c478bd9Sstevel@tonic-gate */
14987c478bd9Sstevel@tonic-gate void
seg_preap(void)14997c478bd9Sstevel@tonic-gate seg_preap(void)
15007c478bd9Sstevel@tonic-gate {
15017c478bd9Sstevel@tonic-gate /*
1502a98e9dbfSaguzovsk * if the cache is off or empty, return
15037c478bd9Sstevel@tonic-gate */
1504a98e9dbfSaguzovsk if (seg_plocked_window == 0) {
15057c478bd9Sstevel@tonic-gate return;
15067c478bd9Sstevel@tonic-gate }
1507a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0);
15087c478bd9Sstevel@tonic-gate
1509a98e9dbfSaguzovsk /*
1510a98e9dbfSaguzovsk * If somebody is already purging pcache
1511a98e9dbfSaguzovsk * just return.
1512a98e9dbfSaguzovsk */
1513a98e9dbfSaguzovsk if (seg_pdisabled) {
1514a98e9dbfSaguzovsk return;
1515a98e9dbfSaguzovsk }
1516a98e9dbfSaguzovsk
1517a98e9dbfSaguzovsk cv_signal(&seg_pasync_cv);
1518a98e9dbfSaguzovsk }
15197c478bd9Sstevel@tonic-gate
/*
 * run as a background thread and reclaim pagelock
 * pages which have not been used recently
 */
void
seg_pasync_thread(void)
{
	callb_cpr_t cpr_info;

	/* If pcache was never set up there is nothing for us to do. */
	if (seg_phashsize_win == 0) {
		thread_exit();
		/*NOTREACHED*/
	}

	seg_pasync_thr = curthread;

	/* Register with the CPR (suspend/resume) framework. */
	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
	    callb_generic_cpr, "seg_pasync");

	if (segpcache_reap_ticks <= 0) {
		/* Convert the seconds tunable into clock ticks. */
		segpcache_reap_ticks = segpcache_reap_sec * hz;
	}

	mutex_enter(&seg_pasync_mtx);
	for (;;) {
		/*
		 * Sleep until the reap interval elapses or somebody
		 * (e.g. seg_preap()) signals us, then purge unless the
		 * cache is currently disabled.  Loops forever.
		 */
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
		    segpcache_reap_ticks, TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
		if (seg_pdisabled == 0) {
			seg_ppurge_async(0);
		}
	}
}
15547c478bd9Sstevel@tonic-gate
15557c478bd9Sstevel@tonic-gate static struct kmem_cache *seg_cache;
15567c478bd9Sstevel@tonic-gate
15577c478bd9Sstevel@tonic-gate /*
15587c478bd9Sstevel@tonic-gate * Initialize segment management data structures.
15597c478bd9Sstevel@tonic-gate */
15607c478bd9Sstevel@tonic-gate void
seg_init(void)15617c478bd9Sstevel@tonic-gate seg_init(void)
15627c478bd9Sstevel@tonic-gate {
15637c478bd9Sstevel@tonic-gate kstat_t *ksp;
15647c478bd9Sstevel@tonic-gate
1565a98e9dbfSaguzovsk seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566a98e9dbfSaguzovsk 0, NULL, NULL, NULL, NULL, NULL, 0);
15677c478bd9Sstevel@tonic-gate
15687c478bd9Sstevel@tonic-gate ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569c6f08383Sjj segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
15707c478bd9Sstevel@tonic-gate if (ksp) {
15717c478bd9Sstevel@tonic-gate ksp->ks_data = (void *)segadvstat_ptr;
15727c478bd9Sstevel@tonic-gate kstat_install(ksp);
15737c478bd9Sstevel@tonic-gate }
15747c478bd9Sstevel@tonic-gate
15757c478bd9Sstevel@tonic-gate seg_pinit();
15767c478bd9Sstevel@tonic-gate }
15777c478bd9Sstevel@tonic-gate
15787c478bd9Sstevel@tonic-gate /*
15797c478bd9Sstevel@tonic-gate * Allocate a segment to cover [base, base+size]
15807c478bd9Sstevel@tonic-gate * and attach it to the specified address space.
15817c478bd9Sstevel@tonic-gate */
15827c478bd9Sstevel@tonic-gate struct seg *
seg_alloc(struct as * as,caddr_t base,size_t size)15837c478bd9Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size)
15847c478bd9Sstevel@tonic-gate {
15857c478bd9Sstevel@tonic-gate struct seg *new;
15867c478bd9Sstevel@tonic-gate caddr_t segbase;
15877c478bd9Sstevel@tonic-gate size_t segsize;
15887c478bd9Sstevel@tonic-gate
15897c478bd9Sstevel@tonic-gate segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
15907c478bd9Sstevel@tonic-gate segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
15917c478bd9Sstevel@tonic-gate (uintptr_t)segbase;
15927c478bd9Sstevel@tonic-gate
15937c478bd9Sstevel@tonic-gate if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
15947c478bd9Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */
15957c478bd9Sstevel@tonic-gate
15967c478bd9Sstevel@tonic-gate if (as != &kas &&
15977c478bd9Sstevel@tonic-gate valid_usr_range(segbase, segsize, 0, as,
15987c478bd9Sstevel@tonic-gate as->a_userlimit) != RANGE_OKAY)
15997c478bd9Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */
16007c478bd9Sstevel@tonic-gate
16017c478bd9Sstevel@tonic-gate new = kmem_cache_alloc(seg_cache, KM_SLEEP);
16027c478bd9Sstevel@tonic-gate new->s_ops = NULL;
16037c478bd9Sstevel@tonic-gate new->s_data = NULL;
16047c478bd9Sstevel@tonic-gate new->s_szc = 0;
16057c478bd9Sstevel@tonic-gate new->s_flags = 0;
1606a98e9dbfSaguzovsk mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607a98e9dbfSaguzovsk new->s_phead.p_lnext = &new->s_phead;
1608a98e9dbfSaguzovsk new->s_phead.p_lprev = &new->s_phead;
16097c478bd9Sstevel@tonic-gate if (seg_attach(as, segbase, segsize, new) < 0) {
16107c478bd9Sstevel@tonic-gate kmem_cache_free(seg_cache, new);
16117c478bd9Sstevel@tonic-gate return ((struct seg *)NULL);
16127c478bd9Sstevel@tonic-gate }
16137c478bd9Sstevel@tonic-gate /* caller must fill in ops, data */
16147c478bd9Sstevel@tonic-gate return (new);
16157c478bd9Sstevel@tonic-gate }
16167c478bd9Sstevel@tonic-gate
16177c478bd9Sstevel@tonic-gate /*
16187c478bd9Sstevel@tonic-gate * Attach a segment to the address space. Used by seg_alloc()
16197c478bd9Sstevel@tonic-gate * and for kernel startup to attach to static segments.
16207c478bd9Sstevel@tonic-gate */
16217c478bd9Sstevel@tonic-gate int
seg_attach(struct as * as,caddr_t base,size_t size,struct seg * seg)16227c478bd9Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
16237c478bd9Sstevel@tonic-gate {
16247c478bd9Sstevel@tonic-gate seg->s_as = as;
16257c478bd9Sstevel@tonic-gate seg->s_base = base;
16267c478bd9Sstevel@tonic-gate seg->s_size = size;
16277c478bd9Sstevel@tonic-gate
16287c478bd9Sstevel@tonic-gate /*
16297c478bd9Sstevel@tonic-gate * as_addseg() will add the segment at the appropraite point
16307c478bd9Sstevel@tonic-gate * in the list. It will return -1 if there is overlap with
16317c478bd9Sstevel@tonic-gate * an already existing segment.
16327c478bd9Sstevel@tonic-gate */
16337c478bd9Sstevel@tonic-gate return (as_addseg(as, seg));
16347c478bd9Sstevel@tonic-gate }
16357c478bd9Sstevel@tonic-gate
16367c478bd9Sstevel@tonic-gate /*
16377c478bd9Sstevel@tonic-gate * Unmap a segment and free it from its associated address space.
16387c478bd9Sstevel@tonic-gate * This should be called by anybody who's finished with a whole segment's
16397c478bd9Sstevel@tonic-gate * mapping. Just calls SEGOP_UNMAP() on the whole mapping . It is the
16407c478bd9Sstevel@tonic-gate * responsibility of the segment driver to unlink the the segment
16417c478bd9Sstevel@tonic-gate * from the address space, and to free public and private data structures
16427c478bd9Sstevel@tonic-gate * associated with the segment. (This is typically done by a call to
16437c478bd9Sstevel@tonic-gate * seg_free()).
16447c478bd9Sstevel@tonic-gate */
16457c478bd9Sstevel@tonic-gate void
seg_unmap(struct seg * seg)16467c478bd9Sstevel@tonic-gate seg_unmap(struct seg *seg)
16477c478bd9Sstevel@tonic-gate {
16487c478bd9Sstevel@tonic-gate #ifdef DEBUG
16497c478bd9Sstevel@tonic-gate int ret;
16507c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16517c478bd9Sstevel@tonic-gate
1652dc32d872SJosef 'Jeff' Sipek ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
16537c478bd9Sstevel@tonic-gate
16547c478bd9Sstevel@tonic-gate /* Shouldn't have called seg_unmap if mapping isn't yet established */
16557c478bd9Sstevel@tonic-gate ASSERT(seg->s_data != NULL);
16567c478bd9Sstevel@tonic-gate
16577c478bd9Sstevel@tonic-gate /* Unmap the whole mapping */
16587c478bd9Sstevel@tonic-gate #ifdef DEBUG
16597c478bd9Sstevel@tonic-gate ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16607c478bd9Sstevel@tonic-gate ASSERT(ret == 0);
16617c478bd9Sstevel@tonic-gate #else
16627c478bd9Sstevel@tonic-gate SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16637c478bd9Sstevel@tonic-gate #endif /* DEBUG */
16647c478bd9Sstevel@tonic-gate }
16657c478bd9Sstevel@tonic-gate
16667c478bd9Sstevel@tonic-gate /*
16677c478bd9Sstevel@tonic-gate * Free the segment from its associated as. This should only be called
16687c478bd9Sstevel@tonic-gate * if a mapping to the segment has not yet been established (e.g., if
16697c478bd9Sstevel@tonic-gate * an error occurs in the middle of doing an as_map when the segment
16707c478bd9Sstevel@tonic-gate * has already been partially set up) or if it has already been deleted
16717c478bd9Sstevel@tonic-gate * (e.g., from a segment driver unmap routine if the unmap applies to the
16727c478bd9Sstevel@tonic-gate * entire segment). If the mapping is currently set up then seg_unmap() should
16737c478bd9Sstevel@tonic-gate * be called instead.
16747c478bd9Sstevel@tonic-gate */
16757c478bd9Sstevel@tonic-gate void
seg_free(struct seg * seg)16767c478bd9Sstevel@tonic-gate seg_free(struct seg *seg)
16777c478bd9Sstevel@tonic-gate {
16787c478bd9Sstevel@tonic-gate register struct as *as = seg->s_as;
16797c478bd9Sstevel@tonic-gate struct seg *tseg = as_removeseg(as, seg);
16807c478bd9Sstevel@tonic-gate
16817c478bd9Sstevel@tonic-gate ASSERT(tseg == seg);
16827c478bd9Sstevel@tonic-gate
16837c478bd9Sstevel@tonic-gate /*
16847c478bd9Sstevel@tonic-gate * If the segment private data field is NULL,
16857c478bd9Sstevel@tonic-gate * then segment driver is not attached yet.
16867c478bd9Sstevel@tonic-gate */
16877c478bd9Sstevel@tonic-gate if (seg->s_data != NULL)
16887c478bd9Sstevel@tonic-gate SEGOP_FREE(seg);
16897c478bd9Sstevel@tonic-gate
1690a98e9dbfSaguzovsk mutex_destroy(&seg->s_pmtx);
1691a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
16937c478bd9Sstevel@tonic-gate kmem_cache_free(seg_cache, seg);
16947c478bd9Sstevel@tonic-gate }
16957c478bd9Sstevel@tonic-gate
16967c478bd9Sstevel@tonic-gate /*ARGSUSED*/
16977c478bd9Sstevel@tonic-gate static void
seg_p_mem_config_post_add(void * arg,pgcnt_t delta_pages)16987c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add(
16997c478bd9Sstevel@tonic-gate void *arg,
17007c478bd9Sstevel@tonic-gate pgcnt_t delta_pages)
17017c478bd9Sstevel@tonic-gate {
17027c478bd9Sstevel@tonic-gate /* Nothing to do. */
17037c478bd9Sstevel@tonic-gate }
17047c478bd9Sstevel@tonic-gate
1705cee1d74bSjfrank void
seg_p_enable(void)1706cee1d74bSjfrank seg_p_enable(void)
1707cee1d74bSjfrank {
1708a98e9dbfSaguzovsk mutex_enter(&seg_pcache_mtx);
1709a98e9dbfSaguzovsk ASSERT(seg_pdisabled != 0);
1710a98e9dbfSaguzovsk seg_pdisabled--;
1711a98e9dbfSaguzovsk mutex_exit(&seg_pcache_mtx);
1712cee1d74bSjfrank }
1713cee1d74bSjfrank
/*
 * seg_p_disable - disables seg_pcache, and then attempts to empty the
 * cache.
 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
 * SEGP_FAIL if the cache could not be emptied.
 */
int
seg_p_disable(void)
{
	pgcnt_t old_plocked;
	int stall_count = 0;

	/*
	 * Bump the disable count first so no new entries are cached
	 * while we drain; seg_p_enable() decrements it again.
	 */
	mutex_enter(&seg_pcache_mtx);
	seg_pdisabled++;
	ASSERT(seg_pdisabled != 0);
	mutex_exit(&seg_pcache_mtx);

	/*
	 * Attempt to empty the cache. Terminate if seg_plocked does not
	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
	 */
	while (seg_plocked != 0) {
		ASSERT(seg_phashsize_win != 0);
		old_plocked = seg_plocked;
		seg_ppurge_async(1);
		if (seg_plocked == old_plocked) {
			/* No forward progress this round. */
			if (stall_count++ > SEGP_STALL_THRESHOLD) {
				return (SEGP_FAIL);
			}
		} else
			stall_count = 0;
		/* Give active users a chance to release their pages. */
		if (seg_plocked != 0)
			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
	}
	return (SEGP_SUCCESS);
}
1750cee1d74bSjfrank
1751cee1d74bSjfrank /*
1752cee1d74bSjfrank * Attempt to purge seg_pcache. May need to return before this has
1753cee1d74bSjfrank * completed to allow other pre_del callbacks to unlock pages. This is
1754cee1d74bSjfrank * ok because:
1755a98e9dbfSaguzovsk * 1) The seg_pdisabled flag has been set so at least we won't
1756cee1d74bSjfrank * cache anymore locks and the locks we couldn't purge
1757cee1d74bSjfrank * will not be held if they do get released by a subsequent
1758cee1d74bSjfrank * pre-delete callback.
1759cee1d74bSjfrank *
1760cee1d74bSjfrank * 2) The rest of the memory delete thread processing does not
1761cee1d74bSjfrank * depend on the changes made in this pre-delete callback. No
1762cee1d74bSjfrank * panics will result, the worst that will happen is that the
1763cee1d74bSjfrank * DR code will timeout and cancel the delete.
1764cee1d74bSjfrank */
1765cee1d74bSjfrank /*ARGSUSED*/
1766cee1d74bSjfrank static int
seg_p_mem_config_pre_del(void * arg,pgcnt_t delta_pages)1767cee1d74bSjfrank seg_p_mem_config_pre_del(
1768cee1d74bSjfrank void *arg,
1769cee1d74bSjfrank pgcnt_t delta_pages)
1770cee1d74bSjfrank {
1771a98e9dbfSaguzovsk if (seg_phashsize_win == 0) {
1772a98e9dbfSaguzovsk return (0);
1773a98e9dbfSaguzovsk }
1774cee1d74bSjfrank if (seg_p_disable() != SEGP_SUCCESS)
1775cee1d74bSjfrank cmn_err(CE_NOTE,
1776cee1d74bSjfrank "!Pre-delete couldn't purge"" pagelock cache - continuing");
17777c478bd9Sstevel@tonic-gate return (0);
17787c478bd9Sstevel@tonic-gate }
17797c478bd9Sstevel@tonic-gate
17807c478bd9Sstevel@tonic-gate /*ARGSUSED*/
17817c478bd9Sstevel@tonic-gate static void
seg_p_mem_config_post_del(void * arg,pgcnt_t delta_pages,int cancelled)17827c478bd9Sstevel@tonic-gate seg_p_mem_config_post_del(
17837c478bd9Sstevel@tonic-gate void *arg,
17847c478bd9Sstevel@tonic-gate pgcnt_t delta_pages,
17857c478bd9Sstevel@tonic-gate int cancelled)
17867c478bd9Sstevel@tonic-gate {
1787a98e9dbfSaguzovsk if (seg_phashsize_win == 0) {
1788a98e9dbfSaguzovsk return;
1789a98e9dbfSaguzovsk }
1790cee1d74bSjfrank seg_p_enable();
17917c478bd9Sstevel@tonic-gate }
17927c478bd9Sstevel@tonic-gate
/*
 * Callback vector registered with the memory DR framework so the
 * pagelock cache is drained before a memory delete and re-enabled
 * once the delete completes (or is cancelled).
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};
17997c478bd9Sstevel@tonic-gate
18007c478bd9Sstevel@tonic-gate static void
seg_pinit_mem_config(void)18017c478bd9Sstevel@tonic-gate seg_pinit_mem_config(void)
18027c478bd9Sstevel@tonic-gate {
18037c478bd9Sstevel@tonic-gate int ret;
18047c478bd9Sstevel@tonic-gate
18057c478bd9Sstevel@tonic-gate ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
18067c478bd9Sstevel@tonic-gate /*
18077c478bd9Sstevel@tonic-gate * Want to catch this in the debug kernel. At run time, if the
18087c478bd9Sstevel@tonic-gate * callbacks don't get run all will be OK as the disable just makes
18097c478bd9Sstevel@tonic-gate * it more likely that the pages can be collected.
18107c478bd9Sstevel@tonic-gate */
18117c478bd9Sstevel@tonic-gate ASSERT(ret == 0);
18127c478bd9Sstevel@tonic-gate }
18130209230bSgjelinek
18140209230bSgjelinek /*
18150209230bSgjelinek * Verify that segment is not a shared anonymous segment which reserves
18160209230bSgjelinek * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
18170209230bSgjelinek * from one zone to another if any segments are shared. This is because the
18180209230bSgjelinek * last process to exit will credit the swap reservation. This could lead
18190209230bSgjelinek * to the swap being reserved by one zone, and credited to another.
18200209230bSgjelinek */
18210209230bSgjelinek boolean_t
seg_can_change_zones(struct seg * seg)18220209230bSgjelinek seg_can_change_zones(struct seg *seg)
18230209230bSgjelinek {
18240209230bSgjelinek struct segvn_data *svd;
18250209230bSgjelinek
18260209230bSgjelinek if (seg->s_ops == &segspt_shmops)
18270209230bSgjelinek return (B_FALSE);
18280209230bSgjelinek
18290209230bSgjelinek if (seg->s_ops == &segvn_ops) {
18300209230bSgjelinek svd = (struct segvn_data *)seg->s_data;
18310209230bSgjelinek if (svd->type == MAP_SHARED &&
18320209230bSgjelinek svd->amp != NULL &&
18330209230bSgjelinek svd->amp->swresv > 0)
1834*15c07adcSJohn Levon return (B_FALSE);
18350209230bSgjelinek }
18360209230bSgjelinek return (B_TRUE);
18370209230bSgjelinek }
18380209230bSgjelinek
18390209230bSgjelinek /*
18400209230bSgjelinek * Return swap reserved by a segment backing a private mapping.
18410209230bSgjelinek */
18420209230bSgjelinek size_t
seg_swresv(struct seg * seg)18430209230bSgjelinek seg_swresv(struct seg *seg)
18440209230bSgjelinek {
18450209230bSgjelinek struct segvn_data *svd;
18460209230bSgjelinek size_t swap = 0;
18470209230bSgjelinek
18480209230bSgjelinek if (seg->s_ops == &segvn_ops) {
18490209230bSgjelinek svd = (struct segvn_data *)seg->s_data;
18500209230bSgjelinek if (svd->type == MAP_PRIVATE && svd->swresv > 0)
18510209230bSgjelinek swap = svd->swresv;
18520209230bSgjelinek }
18530209230bSgjelinek return (swap);
18540209230bSgjelinek }
18559d12795fSRobert Mustacchi
18569d12795fSRobert Mustacchi /*
18579d12795fSRobert Mustacchi * General not supported function for SEGOP_INHERIT
18589d12795fSRobert Mustacchi */
18599d12795fSRobert Mustacchi /* ARGSUSED */
18609d12795fSRobert Mustacchi int
seg_inherit_notsup(struct seg * seg,caddr_t addr,size_t len,uint_t op)18619d12795fSRobert Mustacchi seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
18629d12795fSRobert Mustacchi {
18639d12795fSRobert Mustacchi return (ENOTSUP);
18649d12795fSRobert Mustacchi }
1865