17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 50209230bSgjelinek * Common Development and Distribution License (the "License"). 60209230bSgjelinek * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 22*d3d50737SRafael Vanoni * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 237c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
247c478bd9Sstevel@tonic-gate */ 257c478bd9Sstevel@tonic-gate 267c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 277c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate /* 307c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 317c478bd9Sstevel@tonic-gate * The Regents of the University of California 327c478bd9Sstevel@tonic-gate * All Rights Reserved 337c478bd9Sstevel@tonic-gate * 347c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 357c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 367c478bd9Sstevel@tonic-gate * contributors. 377c478bd9Sstevel@tonic-gate */ 387c478bd9Sstevel@tonic-gate 397c478bd9Sstevel@tonic-gate /* 407c478bd9Sstevel@tonic-gate * VM - segment management. 417c478bd9Sstevel@tonic-gate */ 427c478bd9Sstevel@tonic-gate 437c478bd9Sstevel@tonic-gate #include <sys/types.h> 447c478bd9Sstevel@tonic-gate #include <sys/inttypes.h> 457c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 467c478bd9Sstevel@tonic-gate #include <sys/param.h> 477c478bd9Sstevel@tonic-gate #include <sys/systm.h> 487c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 49a98e9dbfSaguzovsk #include <sys/sysmacros.h> 507c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 51a98e9dbfSaguzovsk #include <sys/tuneable.h> 527c478bd9Sstevel@tonic-gate #include <sys/debug.h> 53a98e9dbfSaguzovsk #include <sys/fs/swapnode.h> 547c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 557c478bd9Sstevel@tonic-gate #include <sys/callb.h> 567c478bd9Sstevel@tonic-gate #include <sys/mem_config.h> 570209230bSgjelinek #include <sys/mman.h> 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate #include <vm/hat.h> 607c478bd9Sstevel@tonic-gate #include <vm/as.h> 617c478bd9Sstevel@tonic-gate #include <vm/seg.h> 627c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 630209230bSgjelinek #include <vm/seg_spt.h> 
640209230bSgjelinek #include <vm/seg_vn.h> 65a98e9dbfSaguzovsk #include <vm/anon.h> 66a98e9dbfSaguzovsk 677c478bd9Sstevel@tonic-gate /* 687c478bd9Sstevel@tonic-gate * kstats for segment advise 697c478bd9Sstevel@tonic-gate */ 707c478bd9Sstevel@tonic-gate segadvstat_t segadvstat = { 717c478bd9Sstevel@tonic-gate { "MADV_FREE_hit", KSTAT_DATA_ULONG }, 727c478bd9Sstevel@tonic-gate { "MADV_FREE_miss", KSTAT_DATA_ULONG }, 737c478bd9Sstevel@tonic-gate }; 747c478bd9Sstevel@tonic-gate 757c478bd9Sstevel@tonic-gate kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; 767c478bd9Sstevel@tonic-gate uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate /* 797c478bd9Sstevel@tonic-gate * entry in the segment page cache 807c478bd9Sstevel@tonic-gate */ 817c478bd9Sstevel@tonic-gate struct seg_pcache { 82a98e9dbfSaguzovsk struct seg_pcache *p_hnext; /* list for hashed blocks */ 83a98e9dbfSaguzovsk struct seg_pcache *p_hprev; 84a98e9dbfSaguzovsk pcache_link_t p_plink; /* per segment/amp list */ 85a98e9dbfSaguzovsk void *p_htag0; /* segment/amp pointer */ 86a98e9dbfSaguzovsk caddr_t p_addr; /* base address/anon_idx */ 87a98e9dbfSaguzovsk size_t p_len; /* total bytes */ 88a98e9dbfSaguzovsk size_t p_wlen; /* writtable bytes at p_addr */ 89a98e9dbfSaguzovsk struct page **p_pp; /* pp shadow list */ 90a98e9dbfSaguzovsk seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */ 91a98e9dbfSaguzovsk clock_t p_lbolt; /* lbolt from last use */ 92a98e9dbfSaguzovsk struct seg_phash *p_hashp; /* our pcache hash bucket */ 93a98e9dbfSaguzovsk uint_t p_active; /* active count */ 94a98e9dbfSaguzovsk uchar_t p_write; /* true if S_WRITE */ 95a98e9dbfSaguzovsk uchar_t p_ref; /* reference byte */ 96a98e9dbfSaguzovsk ushort_t p_flags; /* bit flags */ 977c478bd9Sstevel@tonic-gate }; 987c478bd9Sstevel@tonic-gate 997c478bd9Sstevel@tonic-gate struct seg_phash { 100a98e9dbfSaguzovsk struct seg_pcache *p_hnext; /* list for 
hashed blocks */ 101a98e9dbfSaguzovsk struct seg_pcache *p_hprev; 102a98e9dbfSaguzovsk kmutex_t p_hmutex; /* protects hash bucket */ 103a98e9dbfSaguzovsk pcache_link_t p_halink[2]; /* active bucket linkages */ 104a98e9dbfSaguzovsk }; 105a98e9dbfSaguzovsk 106a98e9dbfSaguzovsk struct seg_phash_wired { 107a98e9dbfSaguzovsk struct seg_pcache *p_hnext; /* list for hashed blocks */ 108a98e9dbfSaguzovsk struct seg_pcache *p_hprev; 109a98e9dbfSaguzovsk kmutex_t p_hmutex; /* protects hash bucket */ 1107c478bd9Sstevel@tonic-gate }; 1117c478bd9Sstevel@tonic-gate 112a98e9dbfSaguzovsk /* 113a98e9dbfSaguzovsk * A parameter to control a maximum number of bytes that can be 114a98e9dbfSaguzovsk * purged from pcache at a time. 115a98e9dbfSaguzovsk */ 116a98e9dbfSaguzovsk #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024) 117a98e9dbfSaguzovsk 118a98e9dbfSaguzovsk /* 119a98e9dbfSaguzovsk * log2(fraction of pcache to reclaim at a time). 120a98e9dbfSaguzovsk */ 121a98e9dbfSaguzovsk #define P_SHRINK_SHFT (5) 122a98e9dbfSaguzovsk 123a98e9dbfSaguzovsk /* 124a98e9dbfSaguzovsk * The following variables can be tuned via /etc/system. 
125a98e9dbfSaguzovsk */ 126a98e9dbfSaguzovsk 127a98e9dbfSaguzovsk int segpcache_enabled = 1; /* if 1, shadow lists are cached */ 128a98e9dbfSaguzovsk pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */ 129a98e9dbfSaguzovsk ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */ 130a98e9dbfSaguzovsk ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */ 131a98e9dbfSaguzovsk int segpcache_reap_sec = 1; /* reap check rate in secs */ 132a98e9dbfSaguzovsk clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */ 133a98e9dbfSaguzovsk int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */ 134a98e9dbfSaguzovsk clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */ 135a98e9dbfSaguzovsk int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */ 136a98e9dbfSaguzovsk pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */ 137a98e9dbfSaguzovsk 138a98e9dbfSaguzovsk static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */ 139a98e9dbfSaguzovsk static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */ 140a98e9dbfSaguzovsk static kcondvar_t seg_pasync_cv; 141a98e9dbfSaguzovsk 142a98e9dbfSaguzovsk #pragma align 64(pctrl1) 143a98e9dbfSaguzovsk #pragma align 64(pctrl2) 144a98e9dbfSaguzovsk #pragma align 64(pctrl3) 145a98e9dbfSaguzovsk 146a98e9dbfSaguzovsk /* 147a98e9dbfSaguzovsk * Keep frequently used variables together in one cache line. 
148a98e9dbfSaguzovsk */ 149a98e9dbfSaguzovsk static struct p_ctrl1 { 150a98e9dbfSaguzovsk uint_t p_disabled; /* if not 0, caching temporarily off */ 151a98e9dbfSaguzovsk pgcnt_t p_maxwin; /* max # of pages that can be cached */ 152a98e9dbfSaguzovsk size_t p_hashwin_sz; /* # of non wired buckets */ 153a98e9dbfSaguzovsk struct seg_phash *p_htabwin; /* hash table for non wired entries */ 154a98e9dbfSaguzovsk size_t p_hashwired_sz; /* # of wired buckets */ 155a98e9dbfSaguzovsk struct seg_phash_wired *p_htabwired; /* hash table for wired entries */ 156a98e9dbfSaguzovsk kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */ 157a98e9dbfSaguzovsk #ifdef _LP64 158a98e9dbfSaguzovsk ulong_t pad[1]; 159a98e9dbfSaguzovsk #endif /* _LP64 */ 160a98e9dbfSaguzovsk } pctrl1; 161a98e9dbfSaguzovsk 162a98e9dbfSaguzovsk static struct p_ctrl2 { 163a98e9dbfSaguzovsk kmutex_t p_mem_mtx; /* protects window counter and p_halinks */ 164a98e9dbfSaguzovsk pgcnt_t p_locked_win; /* # pages from window */ 165a98e9dbfSaguzovsk pgcnt_t p_locked; /* # of pages cached by pagelock */ 166a98e9dbfSaguzovsk uchar_t p_ahcur; /* current active links for insert/delete */ 167a98e9dbfSaguzovsk uchar_t p_athr_on; /* async reclaim thread is running. 
*/ 168a98e9dbfSaguzovsk pcache_link_t p_ahhead[2]; /* active buckets linkages */ 169a98e9dbfSaguzovsk } pctrl2; 170a98e9dbfSaguzovsk 171a98e9dbfSaguzovsk static struct p_ctrl3 { 172a98e9dbfSaguzovsk clock_t p_pcp_maxage; /* max pcp age in ticks */ 173a98e9dbfSaguzovsk ulong_t p_athr_empty_ahb; /* athread walk stats */ 174a98e9dbfSaguzovsk ulong_t p_athr_full_ahb; /* athread walk stats */ 175a98e9dbfSaguzovsk pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */ 176a98e9dbfSaguzovsk int p_shrink_shft; /* reap shift factor */ 177a98e9dbfSaguzovsk #ifdef _LP64 178a98e9dbfSaguzovsk ulong_t pad[3]; 179a98e9dbfSaguzovsk #endif /* _LP64 */ 180a98e9dbfSaguzovsk } pctrl3; 181a98e9dbfSaguzovsk 182a98e9dbfSaguzovsk #define seg_pdisabled pctrl1.p_disabled 183a98e9dbfSaguzovsk #define seg_pmaxwindow pctrl1.p_maxwin 184a98e9dbfSaguzovsk #define seg_phashsize_win pctrl1.p_hashwin_sz 185a98e9dbfSaguzovsk #define seg_phashtab_win pctrl1.p_htabwin 186a98e9dbfSaguzovsk #define seg_phashsize_wired pctrl1.p_hashwired_sz 187a98e9dbfSaguzovsk #define seg_phashtab_wired pctrl1.p_htabwired 188a98e9dbfSaguzovsk #define seg_pkmcache pctrl1.p_kmcache 189a98e9dbfSaguzovsk #define seg_pmem_mtx pctrl2.p_mem_mtx 190a98e9dbfSaguzovsk #define seg_plocked_window pctrl2.p_locked_win 191a98e9dbfSaguzovsk #define seg_plocked pctrl2.p_locked 192a98e9dbfSaguzovsk #define seg_pahcur pctrl2.p_ahcur 193a98e9dbfSaguzovsk #define seg_pathr_on pctrl2.p_athr_on 194a98e9dbfSaguzovsk #define seg_pahhead pctrl2.p_ahhead 195a98e9dbfSaguzovsk #define seg_pmax_pcpage pctrl3.p_pcp_maxage 196a98e9dbfSaguzovsk #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb 197a98e9dbfSaguzovsk #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb 198a98e9dbfSaguzovsk #define seg_pshrink_shift pctrl3.p_shrink_shft 199a98e9dbfSaguzovsk #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages 200a98e9dbfSaguzovsk 201a98e9dbfSaguzovsk #define P_HASHWIN_MASK (seg_phashsize_win - 1) 202a98e9dbfSaguzovsk #define P_HASHWIRED_MASK 
(seg_phashsize_wired - 1) 203a98e9dbfSaguzovsk #define P_BASESHIFT (6) 204a98e9dbfSaguzovsk 205a98e9dbfSaguzovsk kthread_t *seg_pasync_thr; 206a98e9dbfSaguzovsk 207a98e9dbfSaguzovsk extern struct seg_ops segvn_ops; 208a98e9dbfSaguzovsk extern struct seg_ops segspt_shmops; 209a98e9dbfSaguzovsk 210a98e9dbfSaguzovsk #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED) 211a98e9dbfSaguzovsk #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags) 2127c478bd9Sstevel@tonic-gate 213*d3d50737SRafael Vanoni #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t))) 2147c478bd9Sstevel@tonic-gate 215a98e9dbfSaguzovsk #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt) 2167c478bd9Sstevel@tonic-gate 217a98e9dbfSaguzovsk /* 218a98e9dbfSaguzovsk * htag0 argument can be a seg or amp pointer. 219a98e9dbfSaguzovsk */ 220a98e9dbfSaguzovsk #define P_HASHBP(seg, htag0, addr, flags) \ 221a98e9dbfSaguzovsk (IS_PFLAGS_WIRED((flags)) ? \ 222a98e9dbfSaguzovsk ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \ 223a98e9dbfSaguzovsk ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \ 224a98e9dbfSaguzovsk (&seg_phashtab_win[P_HASHWIN_MASK & \ 225a98e9dbfSaguzovsk (((uintptr_t)(htag0) >> 3) ^ \ 226a98e9dbfSaguzovsk ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \ 227a98e9dbfSaguzovsk (flags >> 16) : page_get_shift((seg)->s_szc))))])) 2287c478bd9Sstevel@tonic-gate 229a98e9dbfSaguzovsk /* 230a98e9dbfSaguzovsk * htag0 argument can be a seg or amp pointer. 
231a98e9dbfSaguzovsk */ 232a98e9dbfSaguzovsk #define P_MATCH(pcp, htag0, addr, len) \ 233a98e9dbfSaguzovsk ((pcp)->p_htag0 == (htag0) && \ 234a98e9dbfSaguzovsk (pcp)->p_addr == (addr) && \ 235a98e9dbfSaguzovsk (pcp)->p_len >= (len)) 2367c478bd9Sstevel@tonic-gate 237a98e9dbfSaguzovsk #define P_MATCH_PP(pcp, htag0, addr, len, pp) \ 238a98e9dbfSaguzovsk ((pcp)->p_pp == (pp) && \ 239a98e9dbfSaguzovsk (pcp)->p_htag0 == (htag0) && \ 240a98e9dbfSaguzovsk (pcp)->p_addr == (addr) && \ 241a98e9dbfSaguzovsk (pcp)->p_len >= (len)) 2427c478bd9Sstevel@tonic-gate 243a98e9dbfSaguzovsk #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \ 244a98e9dbfSaguzovsk offsetof(struct seg_pcache, p_plink))) 2457c478bd9Sstevel@tonic-gate 246a98e9dbfSaguzovsk #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \ 247a98e9dbfSaguzovsk offsetof(struct seg_phash, p_halink[l]))) 2487c478bd9Sstevel@tonic-gate 2497c478bd9Sstevel@tonic-gate /* 250a98e9dbfSaguzovsk * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from 251a98e9dbfSaguzovsk * active hash bucket lists. We maintain active bucket lists to reduce the 252a98e9dbfSaguzovsk * overhead of finding active buckets during asynchronous purging since there 253a98e9dbfSaguzovsk * can be 10s of millions of buckets on a large system but only a small subset 254a98e9dbfSaguzovsk * of them in actual use. 255a98e9dbfSaguzovsk * 256a98e9dbfSaguzovsk * There're 2 active bucket lists. Current active list (as per seg_pahcur) is 257a98e9dbfSaguzovsk * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete 258a98e9dbfSaguzovsk * buckets. The other list is used by asynchronous purge thread. This allows 259a98e9dbfSaguzovsk * the purge thread to walk its active list without holding seg_pmem_mtx for a 260a98e9dbfSaguzovsk * long time. 
When asynchronous thread is done with its list it switches to 261a98e9dbfSaguzovsk * current active list and makes the list it just finished processing as 262a98e9dbfSaguzovsk * current active list. 263a98e9dbfSaguzovsk * 264a98e9dbfSaguzovsk * seg_padd_abuck() only adds the bucket to current list if the bucket is not 265a98e9dbfSaguzovsk * yet on any list. seg_premove_abuck() may remove the bucket from either 266a98e9dbfSaguzovsk * list. If the bucket is on current list it will be always removed. Otherwise 267a98e9dbfSaguzovsk * the bucket is only removed if asynchronous purge thread is not currently 268a98e9dbfSaguzovsk * running or seg_premove_abuck() is called by asynchronous purge thread 269a98e9dbfSaguzovsk * itself. A given bucket can only be on one of active lists at a time. These 270a98e9dbfSaguzovsk * routines should be called with per bucket lock held. The routines use 271a98e9dbfSaguzovsk * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after 272a98e9dbfSaguzovsk * the first entry is added to the bucket chain and seg_premove_abuck() must 273a98e9dbfSaguzovsk * be called after the last pcp entry is deleted from its chain. Per bucket 274a98e9dbfSaguzovsk * lock should be held by the callers. This avoids a potential race condition 275a98e9dbfSaguzovsk * when seg_premove_abuck() removes a bucket after pcp entries are added to 276a98e9dbfSaguzovsk * its list after the caller checked that the bucket has no entries. (this 277a98e9dbfSaguzovsk * race would cause a loss of an active bucket from the active lists). 278a98e9dbfSaguzovsk * 279a98e9dbfSaguzovsk * Both lists are circular doubly linked lists anchored at seg_pahhead heads. 280a98e9dbfSaguzovsk * New entries are added to the end of the list since LRU is used as the 281a98e9dbfSaguzovsk * purging policy. 
282a98e9dbfSaguzovsk */ 283a98e9dbfSaguzovsk static void 284a98e9dbfSaguzovsk seg_padd_abuck(struct seg_phash *hp) 285a98e9dbfSaguzovsk { 286a98e9dbfSaguzovsk int lix; 287a98e9dbfSaguzovsk 288a98e9dbfSaguzovsk ASSERT(MUTEX_HELD(&hp->p_hmutex)); 289a98e9dbfSaguzovsk ASSERT((struct seg_phash *)hp->p_hnext != hp); 290a98e9dbfSaguzovsk ASSERT((struct seg_phash *)hp->p_hprev != hp); 291a98e9dbfSaguzovsk ASSERT(hp->p_hnext == hp->p_hprev); 292a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(hp->p_hnext)); 293a98e9dbfSaguzovsk ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp); 294a98e9dbfSaguzovsk ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp); 295a98e9dbfSaguzovsk ASSERT(hp >= seg_phashtab_win && 296a98e9dbfSaguzovsk hp < &seg_phashtab_win[seg_phashsize_win]); 297a98e9dbfSaguzovsk 298a98e9dbfSaguzovsk /* 299a98e9dbfSaguzovsk * This bucket can already be on one of active lists 300a98e9dbfSaguzovsk * since seg_premove_abuck() may have failed to remove it 301a98e9dbfSaguzovsk * before. 302a98e9dbfSaguzovsk */ 303a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 304a98e9dbfSaguzovsk lix = seg_pahcur; 305a98e9dbfSaguzovsk ASSERT(lix >= 0 && lix <= 1); 306a98e9dbfSaguzovsk if (hp->p_halink[lix].p_lnext != NULL) { 307a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev != NULL); 308a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lnext == NULL); 309a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lprev == NULL); 310a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 311a98e9dbfSaguzovsk return; 312a98e9dbfSaguzovsk } 313a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev == NULL); 314a98e9dbfSaguzovsk 315a98e9dbfSaguzovsk /* 316a98e9dbfSaguzovsk * If this bucket is still on list !lix async thread can't yet remove 317a98e9dbfSaguzovsk * it since we hold here per bucket lock. In this case just return 318a98e9dbfSaguzovsk * since async thread will eventually find and process this bucket. 
319a98e9dbfSaguzovsk */ 320a98e9dbfSaguzovsk if (hp->p_halink[!lix].p_lnext != NULL) { 321a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lprev != NULL); 322a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 323a98e9dbfSaguzovsk return; 324a98e9dbfSaguzovsk } 325a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lprev == NULL); 326a98e9dbfSaguzovsk /* 327a98e9dbfSaguzovsk * This bucket is not on any active bucket list yet. 328a98e9dbfSaguzovsk * Add the bucket to the tail of current active list. 329a98e9dbfSaguzovsk */ 330a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext = &seg_pahhead[lix]; 331a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev; 332a98e9dbfSaguzovsk seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix]; 333a98e9dbfSaguzovsk seg_pahhead[lix].p_lprev = &hp->p_halink[lix]; 334a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 335a98e9dbfSaguzovsk } 336a98e9dbfSaguzovsk 337a98e9dbfSaguzovsk static void 338a98e9dbfSaguzovsk seg_premove_abuck(struct seg_phash *hp, int athr) 339a98e9dbfSaguzovsk { 340a98e9dbfSaguzovsk int lix; 341a98e9dbfSaguzovsk 342a98e9dbfSaguzovsk ASSERT(MUTEX_HELD(&hp->p_hmutex)); 343a98e9dbfSaguzovsk ASSERT((struct seg_phash *)hp->p_hnext == hp); 344a98e9dbfSaguzovsk ASSERT((struct seg_phash *)hp->p_hprev == hp); 345a98e9dbfSaguzovsk ASSERT(hp >= seg_phashtab_win && 346a98e9dbfSaguzovsk hp < &seg_phashtab_win[seg_phashsize_win]); 347a98e9dbfSaguzovsk 348a98e9dbfSaguzovsk if (athr) { 349a98e9dbfSaguzovsk ASSERT(seg_pathr_on); 350a98e9dbfSaguzovsk ASSERT(seg_pahcur <= 1); 351a98e9dbfSaguzovsk /* 352a98e9dbfSaguzovsk * We are called by asynchronous thread that found this bucket 353a98e9dbfSaguzovsk * on not currently active (i.e. !seg_pahcur) list. Remove it 354a98e9dbfSaguzovsk * from there. Per bucket lock we are holding makes sure 355a98e9dbfSaguzovsk * seg_pinsert() can't sneak in and add pcp entries to this 356a98e9dbfSaguzovsk * bucket right before we remove the bucket from its list. 
357a98e9dbfSaguzovsk */ 358a98e9dbfSaguzovsk lix = !seg_pahcur; 359a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lnext != NULL); 360a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev != NULL); 361a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lnext == NULL); 362a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lprev == NULL); 363a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 364a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 365a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext = NULL; 366a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev = NULL; 367a98e9dbfSaguzovsk return; 368a98e9dbfSaguzovsk } 369a98e9dbfSaguzovsk 370a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 371a98e9dbfSaguzovsk lix = seg_pahcur; 372a98e9dbfSaguzovsk ASSERT(lix >= 0 && lix <= 1); 373a98e9dbfSaguzovsk 374a98e9dbfSaguzovsk /* 375a98e9dbfSaguzovsk * If the bucket is on currently active list just remove it from 376a98e9dbfSaguzovsk * there. 377a98e9dbfSaguzovsk */ 378a98e9dbfSaguzovsk if (hp->p_halink[lix].p_lnext != NULL) { 379a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev != NULL); 380a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lnext == NULL); 381a98e9dbfSaguzovsk ASSERT(hp->p_halink[!lix].p_lprev == NULL); 382a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 383a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 384a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext = NULL; 385a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev = NULL; 386a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 387a98e9dbfSaguzovsk return; 388a98e9dbfSaguzovsk } 389a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev == NULL); 390a98e9dbfSaguzovsk 391a98e9dbfSaguzovsk /* 392a98e9dbfSaguzovsk * If asynchronous thread is not running we can remove the bucket from 393a98e9dbfSaguzovsk * not currently active list. 
The bucket must be on this list since we 394a98e9dbfSaguzovsk * already checked that it's not on the other list and the bucket from 395a98e9dbfSaguzovsk * which we just deleted the last pcp entry must be still on one of the 396a98e9dbfSaguzovsk * active bucket lists. 397a98e9dbfSaguzovsk */ 398a98e9dbfSaguzovsk lix = !lix; 399a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lnext != NULL); 400a98e9dbfSaguzovsk ASSERT(hp->p_halink[lix].p_lprev != NULL); 401a98e9dbfSaguzovsk 402a98e9dbfSaguzovsk if (!seg_pathr_on) { 403a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev; 404a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext; 405a98e9dbfSaguzovsk hp->p_halink[lix].p_lnext = NULL; 406a98e9dbfSaguzovsk hp->p_halink[lix].p_lprev = NULL; 407a98e9dbfSaguzovsk } 408a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 409a98e9dbfSaguzovsk } 410a98e9dbfSaguzovsk 411a98e9dbfSaguzovsk /* 412a98e9dbfSaguzovsk * Check if bucket pointed by hp already has a pcp entry that matches request 413a98e9dbfSaguzovsk * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise. 414a98e9dbfSaguzovsk * Also delete matching entries that cover smaller address range but start 415a98e9dbfSaguzovsk * at the same address as addr argument. Return the list of deleted entries if 416a98e9dbfSaguzovsk * any. This is an internal helper function called from seg_pinsert() only 417a98e9dbfSaguzovsk * for non wired shadow lists. The caller already holds a per seg/amp list 418a98e9dbfSaguzovsk * lock. 
419a98e9dbfSaguzovsk */ 420a98e9dbfSaguzovsk static struct seg_pcache * 421a98e9dbfSaguzovsk seg_plookup_checkdup(struct seg_phash *hp, void *htag0, 422a98e9dbfSaguzovsk caddr_t addr, size_t len, int *found) 423a98e9dbfSaguzovsk { 424a98e9dbfSaguzovsk struct seg_pcache *pcp; 425a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL; 426a98e9dbfSaguzovsk 427a98e9dbfSaguzovsk ASSERT(MUTEX_HELD(&hp->p_hmutex)); 428a98e9dbfSaguzovsk 429a98e9dbfSaguzovsk *found = 0; 430a98e9dbfSaguzovsk for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 431a98e9dbfSaguzovsk pcp = pcp->p_hnext) { 432a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 433a98e9dbfSaguzovsk if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) { 434a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 435a98e9dbfSaguzovsk if (pcp->p_len < len) { 436a98e9dbfSaguzovsk pcache_link_t *plinkp; 437a98e9dbfSaguzovsk if (pcp->p_active) { 438a98e9dbfSaguzovsk continue; 439a98e9dbfSaguzovsk } 440a98e9dbfSaguzovsk plinkp = &pcp->p_plink; 441a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = plinkp->p_lnext; 442a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = plinkp->p_lprev; 443a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 444a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 445a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 446a98e9dbfSaguzovsk delcallb_list = pcp; 447a98e9dbfSaguzovsk } else { 448a98e9dbfSaguzovsk *found = 1; 449a98e9dbfSaguzovsk break; 450a98e9dbfSaguzovsk } 451a98e9dbfSaguzovsk } 452a98e9dbfSaguzovsk } 453a98e9dbfSaguzovsk return (delcallb_list); 454a98e9dbfSaguzovsk } 455a98e9dbfSaguzovsk 456a98e9dbfSaguzovsk /* 457a98e9dbfSaguzovsk * lookup an address range in pagelock cache. Return shadow list and bump up 458a98e9dbfSaguzovsk * active count. If amp is not NULL use amp as a lookup tag otherwise use seg 459a98e9dbfSaguzovsk * as a lookup tag. 
4607c478bd9Sstevel@tonic-gate */ 4617c478bd9Sstevel@tonic-gate struct page ** 462a98e9dbfSaguzovsk seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 463a98e9dbfSaguzovsk enum seg_rw rw, uint_t flags) 4647c478bd9Sstevel@tonic-gate { 4657c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 4667c478bd9Sstevel@tonic-gate struct seg_phash *hp; 467a98e9dbfSaguzovsk void *htag0; 468a98e9dbfSaguzovsk 469a98e9dbfSaguzovsk ASSERT(seg != NULL); 470a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE); 4717c478bd9Sstevel@tonic-gate 4727c478bd9Sstevel@tonic-gate /* 4737c478bd9Sstevel@tonic-gate * Skip pagelock cache, while DR is in progress or 4747c478bd9Sstevel@tonic-gate * seg_pcache is off. 4757c478bd9Sstevel@tonic-gate */ 476a98e9dbfSaguzovsk if (seg_pdisabled) { 4777c478bd9Sstevel@tonic-gate return (NULL); 4787c478bd9Sstevel@tonic-gate } 479a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 4807c478bd9Sstevel@tonic-gate 481a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp); 482a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, addr, flags); 4837c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex); 4847c478bd9Sstevel@tonic-gate for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 4857c478bd9Sstevel@tonic-gate pcp = pcp->p_hnext) { 486a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 487a98e9dbfSaguzovsk if (P_MATCH(pcp, htag0, addr, len)) { 488a98e9dbfSaguzovsk ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 489a98e9dbfSaguzovsk /* 490a98e9dbfSaguzovsk * If this request wants to write pages 491a98e9dbfSaguzovsk * but write permissions starting from 492a98e9dbfSaguzovsk * addr don't cover the entire length len 493a98e9dbfSaguzovsk * return lookup failure back to the caller. 494a98e9dbfSaguzovsk * It will check protections and fail this 495a98e9dbfSaguzovsk * pagelock operation with EACCESS error. 
496a98e9dbfSaguzovsk */ 497a98e9dbfSaguzovsk if (rw == S_WRITE && pcp->p_wlen < len) { 498a98e9dbfSaguzovsk break; 499a98e9dbfSaguzovsk } 500a98e9dbfSaguzovsk if (pcp->p_active == UINT_MAX) { 501a98e9dbfSaguzovsk break; 502a98e9dbfSaguzovsk } 5037c478bd9Sstevel@tonic-gate pcp->p_active++; 504a98e9dbfSaguzovsk if (rw == S_WRITE && !pcp->p_write) { 505a98e9dbfSaguzovsk pcp->p_write = 1; 506a98e9dbfSaguzovsk } 5077c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 5087c478bd9Sstevel@tonic-gate return (pcp->p_pp); 5097c478bd9Sstevel@tonic-gate } 5107c478bd9Sstevel@tonic-gate } 5117c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 5127c478bd9Sstevel@tonic-gate return (NULL); 5137c478bd9Sstevel@tonic-gate } 5147c478bd9Sstevel@tonic-gate 5157c478bd9Sstevel@tonic-gate /* 516a98e9dbfSaguzovsk * mark address range inactive. If the cache is off or the address range is 517a98e9dbfSaguzovsk * not in the cache or another shadow list that covers bigger range is found 518a98e9dbfSaguzovsk * we call the segment driver to reclaim the pages. Otherwise just decrement 519a98e9dbfSaguzovsk * active count and set ref bit. If amp is not NULL use amp as a lookup tag 520a98e9dbfSaguzovsk * otherwise use seg as a lookup tag. 
5217c478bd9Sstevel@tonic-gate */ 5227c478bd9Sstevel@tonic-gate void 523a98e9dbfSaguzovsk seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, 524a98e9dbfSaguzovsk size_t len, struct page **pp, enum seg_rw rw, uint_t flags, 525a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback) 5267c478bd9Sstevel@tonic-gate { 5277c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 5287c478bd9Sstevel@tonic-gate struct seg_phash *hp; 529a98e9dbfSaguzovsk kmutex_t *pmtx = NULL; 530a98e9dbfSaguzovsk pcache_link_t *pheadp; 531a98e9dbfSaguzovsk void *htag0; 532a98e9dbfSaguzovsk pgcnt_t npages = 0; 533a98e9dbfSaguzovsk int keep = 0; 5347c478bd9Sstevel@tonic-gate 535a98e9dbfSaguzovsk ASSERT(seg != NULL); 536a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE); 537a98e9dbfSaguzovsk 538a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp); 539a98e9dbfSaguzovsk 540a98e9dbfSaguzovsk /* 541a98e9dbfSaguzovsk * Skip lookup if pcache is not configured. 542a98e9dbfSaguzovsk */ 543a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 544a98e9dbfSaguzovsk goto out; 545a98e9dbfSaguzovsk } 546a98e9dbfSaguzovsk 547a98e9dbfSaguzovsk /* 548a98e9dbfSaguzovsk * Grab per seg/amp lock before hash lock if we are going to remove 549a98e9dbfSaguzovsk * inactive entry from pcache. 
550a98e9dbfSaguzovsk */ 551a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) { 552a98e9dbfSaguzovsk if (amp == NULL) { 553a98e9dbfSaguzovsk pheadp = &seg->s_phead; 554a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 555a98e9dbfSaguzovsk } else { 556a98e9dbfSaguzovsk pheadp = &->a_phead; 557a98e9dbfSaguzovsk pmtx = &->a_pmtx; 558a98e9dbfSaguzovsk } 559a98e9dbfSaguzovsk mutex_enter(pmtx); 5607c478bd9Sstevel@tonic-gate } 561a98e9dbfSaguzovsk 562a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, addr, flags); 5637c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex); 564a98e9dbfSaguzovsk again: 5657c478bd9Sstevel@tonic-gate for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 5667c478bd9Sstevel@tonic-gate pcp = pcp->p_hnext) { 567a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 568a98e9dbfSaguzovsk if (P_MATCH_PP(pcp, htag0, addr, len, pp)) { 569a98e9dbfSaguzovsk ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 570a98e9dbfSaguzovsk ASSERT(pcp->p_active); 571a98e9dbfSaguzovsk if (keep) { 572a98e9dbfSaguzovsk /* 573a98e9dbfSaguzovsk * Don't remove this pcp entry 574a98e9dbfSaguzovsk * if we didn't find duplicate 575a98e9dbfSaguzovsk * shadow lists on second search. 576a98e9dbfSaguzovsk * Somebody removed those duplicates 577a98e9dbfSaguzovsk * since we dropped hash lock after first 578a98e9dbfSaguzovsk * search. 579a98e9dbfSaguzovsk */ 580a98e9dbfSaguzovsk ASSERT(pmtx != NULL); 581a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags)); 582a98e9dbfSaguzovsk mutex_exit(pmtx); 583a98e9dbfSaguzovsk pmtx = NULL; 584a98e9dbfSaguzovsk } 5857c478bd9Sstevel@tonic-gate pcp->p_active--; 586a98e9dbfSaguzovsk if (pcp->p_active == 0 && (pmtx != NULL || 587a98e9dbfSaguzovsk (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) { 588a98e9dbfSaguzovsk 589a98e9dbfSaguzovsk /* 590a98e9dbfSaguzovsk * This entry is no longer active. 
Remove it 591a98e9dbfSaguzovsk * now either because pcaching is temporarily 592a98e9dbfSaguzovsk * disabled or there're other pcp entries that 593a98e9dbfSaguzovsk * can match this pagelock request (i.e. this 594a98e9dbfSaguzovsk * entry is a duplicate). 595a98e9dbfSaguzovsk */ 5967c478bd9Sstevel@tonic-gate 5977c478bd9Sstevel@tonic-gate ASSERT(callback == pcp->p_callback); 598a98e9dbfSaguzovsk if (pmtx != NULL) { 599a98e9dbfSaguzovsk pcache_link_t *plinkp = &pcp->p_plink; 600a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 601a98e9dbfSaguzovsk ASSERT(pheadp->p_lnext != pheadp); 602a98e9dbfSaguzovsk ASSERT(pheadp->p_lprev != pheadp); 603a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = 604a98e9dbfSaguzovsk plinkp->p_lnext; 605a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = 606a98e9dbfSaguzovsk plinkp->p_lprev; 607a98e9dbfSaguzovsk } 6087c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 6097c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 610a98e9dbfSaguzovsk if (!IS_PCP_WIRED(pcp) && 611a98e9dbfSaguzovsk hp->p_hnext == (struct seg_pcache *)hp) { 612a98e9dbfSaguzovsk /* 613a98e9dbfSaguzovsk * We removed the last entry from this 614a98e9dbfSaguzovsk * bucket. Now remove the bucket from 615a98e9dbfSaguzovsk * its active list. 
616a98e9dbfSaguzovsk */ 617a98e9dbfSaguzovsk seg_premove_abuck(hp, 0); 618a98e9dbfSaguzovsk } 6197c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 620a98e9dbfSaguzovsk if (pmtx != NULL) { 621a98e9dbfSaguzovsk mutex_exit(pmtx); 6227c478bd9Sstevel@tonic-gate } 623a98e9dbfSaguzovsk len = pcp->p_len; 624a98e9dbfSaguzovsk npages = btop(len); 625a98e9dbfSaguzovsk if (rw != S_WRITE && pcp->p_write) { 626a98e9dbfSaguzovsk rw = S_WRITE; 627a98e9dbfSaguzovsk } 628a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 6297c478bd9Sstevel@tonic-gate goto out; 630a98e9dbfSaguzovsk } else { 631a98e9dbfSaguzovsk /* 632a98e9dbfSaguzovsk * We found a matching pcp entry but will not 633a98e9dbfSaguzovsk * free it right away even if it's no longer 634a98e9dbfSaguzovsk * active. 635a98e9dbfSaguzovsk */ 636a98e9dbfSaguzovsk if (!pcp->p_active && !IS_PCP_WIRED(pcp)) { 637a98e9dbfSaguzovsk /* 638a98e9dbfSaguzovsk * Set the reference bit and mark the 639a98e9dbfSaguzovsk * time of last access to this pcp 640a98e9dbfSaguzovsk * so that asynchronous thread doesn't 641a98e9dbfSaguzovsk * free it immediately since 642a98e9dbfSaguzovsk * it may be reactivated very soon. 643a98e9dbfSaguzovsk */ 644*d3d50737SRafael Vanoni pcp->p_lbolt = ddi_get_lbolt(); 645a98e9dbfSaguzovsk pcp->p_ref = 1; 646a98e9dbfSaguzovsk } 647a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 648a98e9dbfSaguzovsk if (pmtx != NULL) { 649a98e9dbfSaguzovsk mutex_exit(pmtx); 650a98e9dbfSaguzovsk } 651a98e9dbfSaguzovsk return; 652a98e9dbfSaguzovsk } 653a98e9dbfSaguzovsk } else if (!IS_PFLAGS_WIRED(flags) && 654a98e9dbfSaguzovsk P_MATCH(pcp, htag0, addr, len)) { 655a98e9dbfSaguzovsk /* 656a98e9dbfSaguzovsk * This is a duplicate pcp entry. This situation may 657a98e9dbfSaguzovsk * happen if a bigger shadow list that covers our 658a98e9dbfSaguzovsk * range was added while our entry was still active. 659a98e9dbfSaguzovsk * Now we can free our pcp entry if it becomes 660a98e9dbfSaguzovsk * inactive. 
661a98e9dbfSaguzovsk */ 662a98e9dbfSaguzovsk if (!pcp->p_active) { 663a98e9dbfSaguzovsk /* 664a98e9dbfSaguzovsk * Mark this entry as referenced just in case 665a98e9dbfSaguzovsk * we'll free our own pcp entry soon. 666a98e9dbfSaguzovsk */ 667*d3d50737SRafael Vanoni pcp->p_lbolt = ddi_get_lbolt(); 668a98e9dbfSaguzovsk pcp->p_ref = 1; 669a98e9dbfSaguzovsk } 670a98e9dbfSaguzovsk if (pmtx != NULL) { 671a98e9dbfSaguzovsk /* 672a98e9dbfSaguzovsk * we are already holding pmtx and found a 673a98e9dbfSaguzovsk * duplicate. Don't keep our own pcp entry. 674a98e9dbfSaguzovsk */ 675a98e9dbfSaguzovsk keep = 0; 676a98e9dbfSaguzovsk continue; 677a98e9dbfSaguzovsk } 678a98e9dbfSaguzovsk /* 679a98e9dbfSaguzovsk * We have to use mutex_tryenter to attempt to lock 680a98e9dbfSaguzovsk * seg/amp list lock since we already hold hash lock 681a98e9dbfSaguzovsk * and seg/amp list lock is above hash lock in lock 682a98e9dbfSaguzovsk * order. If mutex_tryenter fails drop hash lock and 683a98e9dbfSaguzovsk * retake both locks in correct order and research 684a98e9dbfSaguzovsk * this hash chain. 685a98e9dbfSaguzovsk */ 686a98e9dbfSaguzovsk ASSERT(keep == 0); 687a98e9dbfSaguzovsk if (amp == NULL) { 688a98e9dbfSaguzovsk pheadp = &seg->s_phead; 689a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 690a98e9dbfSaguzovsk } else { 691a98e9dbfSaguzovsk pheadp = &->a_phead; 692a98e9dbfSaguzovsk pmtx = &->a_pmtx; 693a98e9dbfSaguzovsk } 694a98e9dbfSaguzovsk if (!mutex_tryenter(pmtx)) { 695a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 696a98e9dbfSaguzovsk mutex_enter(pmtx); 697a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 698a98e9dbfSaguzovsk /* 699a98e9dbfSaguzovsk * If we don't find bigger shadow list on 700a98e9dbfSaguzovsk * second search (it may happen since we 701a98e9dbfSaguzovsk * dropped bucket lock) keep the entry that 702a98e9dbfSaguzovsk * matches our own shadow list. 
703a98e9dbfSaguzovsk */ 704a98e9dbfSaguzovsk keep = 1; 705a98e9dbfSaguzovsk goto again; 7067c478bd9Sstevel@tonic-gate } 7077c478bd9Sstevel@tonic-gate } 7087c478bd9Sstevel@tonic-gate } 7097c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 710a98e9dbfSaguzovsk if (pmtx != NULL) { 711a98e9dbfSaguzovsk mutex_exit(pmtx); 712a98e9dbfSaguzovsk } 7137c478bd9Sstevel@tonic-gate out: 714a98e9dbfSaguzovsk (*callback)(htag0, addr, len, pp, rw, 0); 715a98e9dbfSaguzovsk if (npages) { 716a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 717a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 718a98e9dbfSaguzovsk seg_plocked -= npages; 719a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 720a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 721a98e9dbfSaguzovsk seg_plocked_window -= npages; 722a98e9dbfSaguzovsk } 723a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 724a98e9dbfSaguzovsk } 725a98e9dbfSaguzovsk 7267c478bd9Sstevel@tonic-gate } 7277c478bd9Sstevel@tonic-gate 728a98e9dbfSaguzovsk #ifdef DEBUG 729a98e9dbfSaguzovsk static uint32_t p_insert_chk_mtbf = 0; 730a98e9dbfSaguzovsk #endif 731a98e9dbfSaguzovsk 7327c478bd9Sstevel@tonic-gate /* 7337c478bd9Sstevel@tonic-gate * The seg_pinsert_check() is used by segment drivers to predict whether 7347c478bd9Sstevel@tonic-gate * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. 
7357c478bd9Sstevel@tonic-gate */ 736a98e9dbfSaguzovsk /*ARGSUSED*/ 7377c478bd9Sstevel@tonic-gate int 738a98e9dbfSaguzovsk seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr, 739a98e9dbfSaguzovsk size_t len, uint_t flags) 7407c478bd9Sstevel@tonic-gate { 741a98e9dbfSaguzovsk ASSERT(seg != NULL); 7427c478bd9Sstevel@tonic-gate 743a98e9dbfSaguzovsk #ifdef DEBUG 744a98e9dbfSaguzovsk if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) { 7457c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7467c478bd9Sstevel@tonic-gate } 747a98e9dbfSaguzovsk #endif 748a98e9dbfSaguzovsk 749a98e9dbfSaguzovsk if (seg_pdisabled) { 7507c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7517c478bd9Sstevel@tonic-gate } 752a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 753a98e9dbfSaguzovsk 754a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) { 755a98e9dbfSaguzovsk return (SEGP_SUCCESS); 756a98e9dbfSaguzovsk } 757a98e9dbfSaguzovsk 758a98e9dbfSaguzovsk if (seg_plocked_window + btop(len) > seg_pmaxwindow) { 7597c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7607c478bd9Sstevel@tonic-gate } 7617c478bd9Sstevel@tonic-gate 762a98e9dbfSaguzovsk if (freemem < desfree) { 763a98e9dbfSaguzovsk return (SEGP_FAIL); 7647c478bd9Sstevel@tonic-gate } 765a98e9dbfSaguzovsk 7667c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS); 7677c478bd9Sstevel@tonic-gate } 7687c478bd9Sstevel@tonic-gate 769a98e9dbfSaguzovsk #ifdef DEBUG 770a98e9dbfSaguzovsk static uint32_t p_insert_mtbf = 0; 771a98e9dbfSaguzovsk #endif 7727c478bd9Sstevel@tonic-gate 7737c478bd9Sstevel@tonic-gate /* 774a98e9dbfSaguzovsk * Insert address range with shadow list into pagelock cache if there's no 775a98e9dbfSaguzovsk * shadow list already cached for this address range. If the cache is off or 776a98e9dbfSaguzovsk * caching is temporarily disabled or the allowed 'window' is exceeded return 777a98e9dbfSaguzovsk * SEGP_FAIL. Otherwise return SEGP_SUCCESS. 
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both the hash bucket lock and the per seg/amp lock need to be held before
 * adding a non wired entry to the hash and per seg/amp lists. The per seg/amp
 * lock should be taken first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover a smaller range for the same start
 * address.
7977c478bd9Sstevel@tonic-gate */ 7987c478bd9Sstevel@tonic-gate int 799a98e9dbfSaguzovsk seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 800a98e9dbfSaguzovsk size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags, 801a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback) 8027c478bd9Sstevel@tonic-gate { 8037c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 8047c478bd9Sstevel@tonic-gate struct seg_phash *hp; 8057c478bd9Sstevel@tonic-gate pgcnt_t npages; 806a98e9dbfSaguzovsk pcache_link_t *pheadp; 807a98e9dbfSaguzovsk kmutex_t *pmtx; 808a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL; 8097c478bd9Sstevel@tonic-gate 810a98e9dbfSaguzovsk ASSERT(seg != NULL); 811a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE); 812a98e9dbfSaguzovsk ASSERT(rw == S_READ || wlen == len); 813a98e9dbfSaguzovsk ASSERT(rw == S_WRITE || wlen <= len); 814a98e9dbfSaguzovsk ASSERT(amp == NULL || wlen == len); 815a98e9dbfSaguzovsk 816a98e9dbfSaguzovsk #ifdef DEBUG 817a98e9dbfSaguzovsk if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) { 8187c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8197c478bd9Sstevel@tonic-gate } 820a98e9dbfSaguzovsk #endif 821a98e9dbfSaguzovsk 822a98e9dbfSaguzovsk if (seg_pdisabled) { 8237c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8247c478bd9Sstevel@tonic-gate } 825a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 826a98e9dbfSaguzovsk 8277c478bd9Sstevel@tonic-gate ASSERT((len & PAGEOFFSET) == 0); 828a98e9dbfSaguzovsk npages = btop(len); 829a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 830a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 831a98e9dbfSaguzovsk if (seg_plocked_window + npages > seg_pmaxwindow) { 832a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 8337c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8347c478bd9Sstevel@tonic-gate } 835a98e9dbfSaguzovsk seg_plocked_window += npages; 8367c478bd9Sstevel@tonic-gate } 8377c478bd9Sstevel@tonic-gate seg_plocked += npages; 838a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 
8397c478bd9Sstevel@tonic-gate 840a98e9dbfSaguzovsk pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP); 841a98e9dbfSaguzovsk /* 842a98e9dbfSaguzovsk * If amp is not NULL set htag0 to amp otherwise set it to seg. 843a98e9dbfSaguzovsk */ 844a98e9dbfSaguzovsk if (amp == NULL) { 845a98e9dbfSaguzovsk pcp->p_htag0 = (void *)seg; 846a98e9dbfSaguzovsk pcp->p_flags = flags & 0xffff; 847a98e9dbfSaguzovsk } else { 848a98e9dbfSaguzovsk pcp->p_htag0 = (void *)amp; 849a98e9dbfSaguzovsk pcp->p_flags = (flags & 0xffff) | SEGP_AMP; 850a98e9dbfSaguzovsk } 8517c478bd9Sstevel@tonic-gate pcp->p_addr = addr; 8527c478bd9Sstevel@tonic-gate pcp->p_len = len; 853a98e9dbfSaguzovsk pcp->p_wlen = wlen; 8547c478bd9Sstevel@tonic-gate pcp->p_pp = pp; 855a98e9dbfSaguzovsk pcp->p_write = (rw == S_WRITE); 8567c478bd9Sstevel@tonic-gate pcp->p_callback = callback; 8577c478bd9Sstevel@tonic-gate pcp->p_active = 1; 8587c478bd9Sstevel@tonic-gate 859a98e9dbfSaguzovsk hp = P_HASHBP(seg, pcp->p_htag0, addr, flags); 860a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 861a98e9dbfSaguzovsk int found; 862a98e9dbfSaguzovsk void *htag0; 863a98e9dbfSaguzovsk if (amp == NULL) { 864a98e9dbfSaguzovsk pheadp = &seg->s_phead; 865a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 866a98e9dbfSaguzovsk htag0 = (void *)seg; 867a98e9dbfSaguzovsk } else { 868a98e9dbfSaguzovsk pheadp = &->a_phead; 869a98e9dbfSaguzovsk pmtx = &->a_pmtx; 870a98e9dbfSaguzovsk htag0 = (void *)amp; 871a98e9dbfSaguzovsk } 872a98e9dbfSaguzovsk mutex_enter(pmtx); 873a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 874a98e9dbfSaguzovsk delcallb_list = seg_plookup_checkdup(hp, htag0, addr, 875a98e9dbfSaguzovsk len, &found); 876a98e9dbfSaguzovsk if (found) { 877a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 878a98e9dbfSaguzovsk mutex_exit(pmtx); 879a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 880a98e9dbfSaguzovsk seg_plocked -= npages; 881a98e9dbfSaguzovsk seg_plocked_window -= npages; 882a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 883a98e9dbfSaguzovsk 
kmem_cache_free(seg_pkmcache, pcp); 884a98e9dbfSaguzovsk goto out; 885a98e9dbfSaguzovsk } 886a98e9dbfSaguzovsk pcp->p_plink.p_lnext = pheadp->p_lnext; 887a98e9dbfSaguzovsk pcp->p_plink.p_lprev = pheadp; 888a98e9dbfSaguzovsk pheadp->p_lnext->p_lprev = &pcp->p_plink; 889a98e9dbfSaguzovsk pheadp->p_lnext = &pcp->p_plink; 890a98e9dbfSaguzovsk } else { 891a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 892a98e9dbfSaguzovsk } 893a98e9dbfSaguzovsk pcp->p_hashp = hp; 8947c478bd9Sstevel@tonic-gate pcp->p_hnext = hp->p_hnext; 8957c478bd9Sstevel@tonic-gate pcp->p_hprev = (struct seg_pcache *)hp; 8967c478bd9Sstevel@tonic-gate hp->p_hnext->p_hprev = pcp; 8977c478bd9Sstevel@tonic-gate hp->p_hnext = pcp; 898a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) && 899a98e9dbfSaguzovsk hp->p_hprev == pcp) { 900a98e9dbfSaguzovsk seg_padd_abuck(hp); 901a98e9dbfSaguzovsk } 9027c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 903a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 904a98e9dbfSaguzovsk mutex_exit(pmtx); 905a98e9dbfSaguzovsk } 906a98e9dbfSaguzovsk 907a98e9dbfSaguzovsk out: 908a98e9dbfSaguzovsk npages = 0; 909a98e9dbfSaguzovsk while (delcallb_list != NULL) { 910a98e9dbfSaguzovsk pcp = delcallb_list; 911a98e9dbfSaguzovsk delcallb_list = pcp->p_hprev; 912a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active); 913a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 914a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 0); 915a98e9dbfSaguzovsk npages += btop(pcp->p_len); 916a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 917a98e9dbfSaguzovsk } 918a98e9dbfSaguzovsk if (npages) { 919a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags)); 920a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 921a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 922a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 923a98e9dbfSaguzovsk seg_plocked -= npages; 924a98e9dbfSaguzovsk seg_plocked_window -= npages; 925a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 926a98e9dbfSaguzovsk } 927a98e9dbfSaguzovsk 9287c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS); 9297c478bd9Sstevel@tonic-gate } 9307c478bd9Sstevel@tonic-gate 9317c478bd9Sstevel@tonic-gate /* 932a98e9dbfSaguzovsk * purge entries from the pagelock cache if not active 933a98e9dbfSaguzovsk * and not recently used. 9347c478bd9Sstevel@tonic-gate */ 9357c478bd9Sstevel@tonic-gate static void 936a98e9dbfSaguzovsk seg_ppurge_async(int force) 9377c478bd9Sstevel@tonic-gate { 9387c478bd9Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL; 9397c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 9407c478bd9Sstevel@tonic-gate struct seg_phash *hp; 9417c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 9427c478bd9Sstevel@tonic-gate pgcnt_t npages_window = 0; 943a98e9dbfSaguzovsk pgcnt_t npgs_to_purge; 944a98e9dbfSaguzovsk pgcnt_t npgs_purged = 0; 945a98e9dbfSaguzovsk int hlinks = 0; 946a98e9dbfSaguzovsk int hlix; 947a98e9dbfSaguzovsk pcache_link_t *hlinkp; 948a98e9dbfSaguzovsk pcache_link_t *hlnextp = NULL; 949a98e9dbfSaguzovsk int lowmem; 950a98e9dbfSaguzovsk int trim; 951a98e9dbfSaguzovsk 952a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 9537c478bd9Sstevel@tonic-gate 9547c478bd9Sstevel@tonic-gate /* 955a98e9dbfSaguzovsk * if the cache is off or empty, return 9567c478bd9Sstevel@tonic-gate */ 957a98e9dbfSaguzovsk if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { 9587c478bd9Sstevel@tonic-gate return; 9597c478bd9Sstevel@tonic-gate } 
9607c478bd9Sstevel@tonic-gate 961a98e9dbfSaguzovsk if (!force) { 962a98e9dbfSaguzovsk lowmem = 0; 963a98e9dbfSaguzovsk trim = 0; 964a98e9dbfSaguzovsk if (freemem < lotsfree + needfree) { 965a98e9dbfSaguzovsk spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); 966a98e9dbfSaguzovsk if (fmem <= 5 * (desfree >> 2)) { 967a98e9dbfSaguzovsk lowmem = 1; 968a98e9dbfSaguzovsk } else if (fmem <= 7 * (lotsfree >> 3)) { 969a98e9dbfSaguzovsk if (seg_plocked_window >= 970a98e9dbfSaguzovsk (availrmem_initial >> 1)) { 971a98e9dbfSaguzovsk lowmem = 1; 972a98e9dbfSaguzovsk } 973a98e9dbfSaguzovsk } else if (fmem < lotsfree) { 974a98e9dbfSaguzovsk if (seg_plocked_window >= 975a98e9dbfSaguzovsk 3 * (availrmem_initial >> 2)) { 976a98e9dbfSaguzovsk lowmem = 1; 977a98e9dbfSaguzovsk } 978a98e9dbfSaguzovsk } 979a98e9dbfSaguzovsk } 980a98e9dbfSaguzovsk if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { 981a98e9dbfSaguzovsk trim = 1; 982a98e9dbfSaguzovsk } 983a98e9dbfSaguzovsk if (!lowmem && !trim) { 984a98e9dbfSaguzovsk return; 985a98e9dbfSaguzovsk } 986a98e9dbfSaguzovsk npgs_to_purge = seg_plocked_window >> 987a98e9dbfSaguzovsk seg_pshrink_shift; 988a98e9dbfSaguzovsk if (lowmem) { 989a98e9dbfSaguzovsk npgs_to_purge = MIN(npgs_to_purge, 990a98e9dbfSaguzovsk MAX(seg_pmaxapurge_npages, desfree)); 991a98e9dbfSaguzovsk } else { 992a98e9dbfSaguzovsk npgs_to_purge = MIN(npgs_to_purge, 993a98e9dbfSaguzovsk seg_pmaxapurge_npages); 994a98e9dbfSaguzovsk } 995a98e9dbfSaguzovsk if (npgs_to_purge == 0) { 996a98e9dbfSaguzovsk return; 997a98e9dbfSaguzovsk } 998a98e9dbfSaguzovsk } else { 999a98e9dbfSaguzovsk struct seg_phash_wired *hpw; 10007c478bd9Sstevel@tonic-gate 1001a98e9dbfSaguzovsk ASSERT(seg_phashsize_wired != 0); 10027c478bd9Sstevel@tonic-gate 1003a98e9dbfSaguzovsk for (hpw = seg_phashtab_wired; 1004a98e9dbfSaguzovsk hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { 1005a98e9dbfSaguzovsk 1006a98e9dbfSaguzovsk if (hpw->p_hnext == (struct seg_pcache *)hpw) { 1007a98e9dbfSaguzovsk 
continue; 1008a98e9dbfSaguzovsk } 1009a98e9dbfSaguzovsk 1010a98e9dbfSaguzovsk mutex_enter(&hpw->p_hmutex); 1011a98e9dbfSaguzovsk 1012a98e9dbfSaguzovsk for (pcp = hpw->p_hnext; 1013a98e9dbfSaguzovsk pcp != (struct seg_pcache *)hpw; 1014a98e9dbfSaguzovsk pcp = pcp->p_hnext) { 1015a98e9dbfSaguzovsk 1016a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1017a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == 1018a98e9dbfSaguzovsk (struct seg_phash *)hpw); 1019a98e9dbfSaguzovsk 1020a98e9dbfSaguzovsk if (pcp->p_active) { 1021a98e9dbfSaguzovsk continue; 10227c478bd9Sstevel@tonic-gate } 1023a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1024a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1025a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1026a98e9dbfSaguzovsk delcallb_list = pcp; 1027a98e9dbfSaguzovsk } 1028a98e9dbfSaguzovsk mutex_exit(&hpw->p_hmutex); 1029a98e9dbfSaguzovsk } 1030a98e9dbfSaguzovsk } 1031a98e9dbfSaguzovsk 1032a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1033a98e9dbfSaguzovsk if (seg_pathr_on) { 1034a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1035a98e9dbfSaguzovsk goto runcb; 1036a98e9dbfSaguzovsk } 1037a98e9dbfSaguzovsk seg_pathr_on = 1; 1038a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1039a98e9dbfSaguzovsk ASSERT(seg_pahcur <= 1); 1040a98e9dbfSaguzovsk hlix = !seg_pahcur; 1041a98e9dbfSaguzovsk 1042a98e9dbfSaguzovsk again: 1043a98e9dbfSaguzovsk for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; 1044a98e9dbfSaguzovsk hlinkp = hlnextp) { 1045a98e9dbfSaguzovsk 1046a98e9dbfSaguzovsk hlnextp = hlinkp->p_lnext; 1047a98e9dbfSaguzovsk ASSERT(hlnextp != NULL); 1048a98e9dbfSaguzovsk 1049a98e9dbfSaguzovsk hp = hlink2phash(hlinkp, hlix); 1050a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1051a98e9dbfSaguzovsk seg_pathr_empty_ahb++; 1052a98e9dbfSaguzovsk continue; 1053a98e9dbfSaguzovsk } 1054a98e9dbfSaguzovsk seg_pathr_full_ahb++; 1055a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 1056a98e9dbfSaguzovsk 1057a98e9dbfSaguzovsk for 
(pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 1058a98e9dbfSaguzovsk pcp = pcp->p_hnext) { 1059a98e9dbfSaguzovsk pcache_link_t *pheadp; 1060a98e9dbfSaguzovsk pcache_link_t *plinkp; 1061a98e9dbfSaguzovsk void *htag0; 1062a98e9dbfSaguzovsk kmutex_t *pmtx; 1063a98e9dbfSaguzovsk 1064a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 1065a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 1066a98e9dbfSaguzovsk 1067a98e9dbfSaguzovsk if (pcp->p_active) { 1068a98e9dbfSaguzovsk continue; 1069a98e9dbfSaguzovsk } 1070a98e9dbfSaguzovsk if (!force && pcp->p_ref && 1071a98e9dbfSaguzovsk PCP_AGE(pcp) < seg_pmax_pcpage) { 10727c478bd9Sstevel@tonic-gate pcp->p_ref = 0; 1073a98e9dbfSaguzovsk continue; 10747c478bd9Sstevel@tonic-gate } 1075a98e9dbfSaguzovsk plinkp = &pcp->p_plink; 1076a98e9dbfSaguzovsk htag0 = pcp->p_htag0; 1077a98e9dbfSaguzovsk if (pcp->p_flags & SEGP_AMP) { 1078a98e9dbfSaguzovsk pheadp = &((amp_t *)htag0)->a_phead; 1079a98e9dbfSaguzovsk pmtx = &((amp_t *)htag0)->a_pmtx; 1080a98e9dbfSaguzovsk } else { 1081a98e9dbfSaguzovsk pheadp = &((seg_t *)htag0)->s_phead; 1082a98e9dbfSaguzovsk pmtx = &((seg_t *)htag0)->s_pmtx; 1083a98e9dbfSaguzovsk } 1084a98e9dbfSaguzovsk if (!mutex_tryenter(pmtx)) { 1085a98e9dbfSaguzovsk continue; 1086a98e9dbfSaguzovsk } 1087a98e9dbfSaguzovsk ASSERT(pheadp->p_lnext != pheadp); 1088a98e9dbfSaguzovsk ASSERT(pheadp->p_lprev != pheadp); 1089a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = 1090a98e9dbfSaguzovsk plinkp->p_lnext; 1091a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = 1092a98e9dbfSaguzovsk plinkp->p_lprev; 1093a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1094a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1095a98e9dbfSaguzovsk mutex_exit(pmtx); 1096a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1097a98e9dbfSaguzovsk delcallb_list = pcp; 1098a98e9dbfSaguzovsk npgs_purged += btop(pcp->p_len); 1099a98e9dbfSaguzovsk } 1100a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1101a98e9dbfSaguzovsk seg_premove_abuck(hp, 1); 
11027c478bd9Sstevel@tonic-gate } 11037c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 1104a98e9dbfSaguzovsk if (npgs_purged >= seg_plocked_window) { 11057c478bd9Sstevel@tonic-gate break; 1106a98e9dbfSaguzovsk } 1107a98e9dbfSaguzovsk if (!force) { 1108a98e9dbfSaguzovsk if (npgs_purged >= npgs_to_purge) { 1109a98e9dbfSaguzovsk break; 1110a98e9dbfSaguzovsk } 1111a98e9dbfSaguzovsk if (!trim && !(seg_pathr_full_ahb & 15)) { 1112a98e9dbfSaguzovsk ASSERT(lowmem); 1113a98e9dbfSaguzovsk if (freemem >= lotsfree + needfree) { 1114a98e9dbfSaguzovsk break; 1115a98e9dbfSaguzovsk } 1116a98e9dbfSaguzovsk } 1117a98e9dbfSaguzovsk } 11187c478bd9Sstevel@tonic-gate } 11197c478bd9Sstevel@tonic-gate 1120a98e9dbfSaguzovsk if (hlinkp == &seg_pahhead[hlix]) { 1121a98e9dbfSaguzovsk /* 1122a98e9dbfSaguzovsk * We processed the entire hlix active bucket list 1123a98e9dbfSaguzovsk * but didn't find enough pages to reclaim. 1124a98e9dbfSaguzovsk * Switch the lists and walk the other list 1125a98e9dbfSaguzovsk * if we haven't done it yet. 
1126a98e9dbfSaguzovsk */ 1127a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1128a98e9dbfSaguzovsk ASSERT(seg_pathr_on); 1129a98e9dbfSaguzovsk ASSERT(seg_pahcur == !hlix); 1130a98e9dbfSaguzovsk seg_pahcur = hlix; 1131a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1132a98e9dbfSaguzovsk if (++hlinks < 2) { 1133a98e9dbfSaguzovsk hlix = !hlix; 1134a98e9dbfSaguzovsk goto again; 1135a98e9dbfSaguzovsk } 1136a98e9dbfSaguzovsk } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && 1137a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext != hlinkp) { 1138a98e9dbfSaguzovsk ASSERT(hlinkp != NULL); 1139a98e9dbfSaguzovsk ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); 1140a98e9dbfSaguzovsk ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); 1141a98e9dbfSaguzovsk ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); 1142a98e9dbfSaguzovsk 1143a98e9dbfSaguzovsk /* 1144a98e9dbfSaguzovsk * Reinsert the header to point to hlinkp 1145a98e9dbfSaguzovsk * so that we start from hlinkp bucket next time around. 1146a98e9dbfSaguzovsk */ 1147a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; 1148a98e9dbfSaguzovsk seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; 1149a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext = hlinkp; 1150a98e9dbfSaguzovsk seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; 1151a98e9dbfSaguzovsk hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; 1152a98e9dbfSaguzovsk hlinkp->p_lprev = &seg_pahhead[hlix]; 1153a98e9dbfSaguzovsk } 1154a98e9dbfSaguzovsk 1155a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1156a98e9dbfSaguzovsk ASSERT(seg_pathr_on); 1157a98e9dbfSaguzovsk seg_pathr_on = 0; 1158a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1159a98e9dbfSaguzovsk 1160a98e9dbfSaguzovsk runcb: 11617c478bd9Sstevel@tonic-gate /* 1162a98e9dbfSaguzovsk * Run the delayed callback list. segments/amps can't go away until 1163a98e9dbfSaguzovsk * callback is executed since they must have non 0 softlockcnt. 
That's 1164a98e9dbfSaguzovsk * why we don't need to hold as/seg/amp locks to execute the callback. 11657c478bd9Sstevel@tonic-gate */ 11667c478bd9Sstevel@tonic-gate while (delcallb_list != NULL) { 11677c478bd9Sstevel@tonic-gate pcp = delcallb_list; 11687c478bd9Sstevel@tonic-gate delcallb_list = pcp->p_hprev; 1169a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1170a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1171a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1); 1172a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1173a98e9dbfSaguzovsk if (!IS_PCP_WIRED(pcp)) { 1174a98e9dbfSaguzovsk npages_window += btop(pcp->p_len); 11757c478bd9Sstevel@tonic-gate } 1176a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 1177a98e9dbfSaguzovsk } 1178a98e9dbfSaguzovsk if (npages) { 1179a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1180a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 1181a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages_window); 1182a98e9dbfSaguzovsk seg_plocked -= npages; 1183a98e9dbfSaguzovsk seg_plocked_window -= npages_window; 1184a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 11857c478bd9Sstevel@tonic-gate } 11867c478bd9Sstevel@tonic-gate } 11877c478bd9Sstevel@tonic-gate 11887c478bd9Sstevel@tonic-gate /* 1189a98e9dbfSaguzovsk * Remove cached pages for segment(s) entries from hashtable. The segments 1190a98e9dbfSaguzovsk * are identified by pp array. This is useful for multiple seg's cached on 1191a98e9dbfSaguzovsk * behalf of dummy segment (ISM/DISM) with common pp array. 
11927c478bd9Sstevel@tonic-gate */ 11937c478bd9Sstevel@tonic-gate void 1194a98e9dbfSaguzovsk seg_ppurge_wiredpp(struct page **pp) 11957c478bd9Sstevel@tonic-gate { 1196a98e9dbfSaguzovsk struct seg_pcache *pcp; 1197a98e9dbfSaguzovsk struct seg_phash_wired *hp; 11987c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 1199a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL; 12007c478bd9Sstevel@tonic-gate 12017c478bd9Sstevel@tonic-gate /* 1202a98e9dbfSaguzovsk * if the cache is empty, return 12037c478bd9Sstevel@tonic-gate */ 1204a98e9dbfSaguzovsk if (seg_plocked == 0) { 12057c478bd9Sstevel@tonic-gate return; 12067c478bd9Sstevel@tonic-gate } 1207a98e9dbfSaguzovsk ASSERT(seg_phashsize_wired != 0); 12087c478bd9Sstevel@tonic-gate 1209a98e9dbfSaguzovsk for (hp = seg_phashtab_wired; 1210a98e9dbfSaguzovsk hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { 1211a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1212a98e9dbfSaguzovsk continue; 1213a98e9dbfSaguzovsk } 12147c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex); 12157c478bd9Sstevel@tonic-gate pcp = hp->p_hnext; 12167c478bd9Sstevel@tonic-gate while (pcp != (struct seg_pcache *)hp) { 1217a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == (struct seg_phash *)hp); 1218a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 12197c478bd9Sstevel@tonic-gate /* 12207c478bd9Sstevel@tonic-gate * purge entries which are not active 12217c478bd9Sstevel@tonic-gate */ 1222a98e9dbfSaguzovsk if (!pcp->p_active && pcp->p_pp == pp) { 1223a98e9dbfSaguzovsk ASSERT(pcp->p_htag0 != NULL); 12247c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 12257c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 1226a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1227a98e9dbfSaguzovsk delcallb_list = pcp; 12287c478bd9Sstevel@tonic-gate } 1229a98e9dbfSaguzovsk pcp = pcp->p_hnext; 12307c478bd9Sstevel@tonic-gate } 12317c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 1232a98e9dbfSaguzovsk /* 1233a98e9dbfSaguzovsk * segments can't go 
away until callback is executed since 1234a98e9dbfSaguzovsk * they must have non 0 softlockcnt. That's why we don't 1235a98e9dbfSaguzovsk * need to hold as/seg locks to execute the callback. 1236a98e9dbfSaguzovsk */ 1237a98e9dbfSaguzovsk while (delcallb_list != NULL) { 1238a98e9dbfSaguzovsk int done; 1239a98e9dbfSaguzovsk pcp = delcallb_list; 1240a98e9dbfSaguzovsk delcallb_list = pcp->p_hprev; 1241a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1242a98e9dbfSaguzovsk done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1243a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, 1244a98e9dbfSaguzovsk pcp->p_write ? S_WRITE : S_READ, 1); 1245a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1246a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1247a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 1248a98e9dbfSaguzovsk if (done) { 1249a98e9dbfSaguzovsk ASSERT(delcallb_list == NULL); 1250a98e9dbfSaguzovsk goto out; 1251a98e9dbfSaguzovsk } 1252a98e9dbfSaguzovsk } 12537c478bd9Sstevel@tonic-gate } 12547c478bd9Sstevel@tonic-gate 1255a98e9dbfSaguzovsk out: 1256a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1257a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 12587c478bd9Sstevel@tonic-gate seg_plocked -= npages; 1259a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 12607c478bd9Sstevel@tonic-gate } 12617c478bd9Sstevel@tonic-gate 12627c478bd9Sstevel@tonic-gate /* 12637c478bd9Sstevel@tonic-gate * purge all entries for a given segment. Since we 12647c478bd9Sstevel@tonic-gate * callback into the segment driver directly for page 12657c478bd9Sstevel@tonic-gate * reclaim the caller needs to hold the right locks. 
12667c478bd9Sstevel@tonic-gate */ 12677c478bd9Sstevel@tonic-gate void 1268a98e9dbfSaguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) 12697c478bd9Sstevel@tonic-gate { 12707c478bd9Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL; 12717c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 12727c478bd9Sstevel@tonic-gate struct seg_phash *hp; 12737c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 1274a98e9dbfSaguzovsk void *htag0; 12757c478bd9Sstevel@tonic-gate 1276a98e9dbfSaguzovsk if (seg_plocked == 0) { 12777c478bd9Sstevel@tonic-gate return; 12787c478bd9Sstevel@tonic-gate } 1279a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 1280a98e9dbfSaguzovsk 1281a98e9dbfSaguzovsk /* 1282a98e9dbfSaguzovsk * If amp is not NULL use amp as a lookup tag otherwise use seg 1283a98e9dbfSaguzovsk * as a lookup tag. 1284a98e9dbfSaguzovsk */ 1285a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp); 1286a98e9dbfSaguzovsk ASSERT(htag0 != NULL); 1287a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) { 1288a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, 0, flags); 1289a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 1290a98e9dbfSaguzovsk pcp = hp->p_hnext; 1291a98e9dbfSaguzovsk while (pcp != (struct seg_pcache *)hp) { 1292a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 1293a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1294a98e9dbfSaguzovsk if (pcp->p_htag0 == htag0) { 1295a98e9dbfSaguzovsk if (pcp->p_active) { 1296a98e9dbfSaguzovsk break; 1297a98e9dbfSaguzovsk } 1298a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1299a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1300a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1301a98e9dbfSaguzovsk delcallb_list = pcp; 1302a98e9dbfSaguzovsk } 1303a98e9dbfSaguzovsk pcp = pcp->p_hnext; 1304a98e9dbfSaguzovsk } 1305a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 1306a98e9dbfSaguzovsk } else { 1307a98e9dbfSaguzovsk pcache_link_t *plinkp; 1308a98e9dbfSaguzovsk pcache_link_t *pheadp; 1309a98e9dbfSaguzovsk kmutex_t *pmtx; 
1310a98e9dbfSaguzovsk 1311a98e9dbfSaguzovsk if (amp == NULL) { 1312a98e9dbfSaguzovsk ASSERT(seg != NULL); 1313a98e9dbfSaguzovsk pheadp = &seg->s_phead; 1314a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 1315a98e9dbfSaguzovsk } else { 1316a98e9dbfSaguzovsk pheadp = &->a_phead; 1317a98e9dbfSaguzovsk pmtx = &->a_pmtx; 1318a98e9dbfSaguzovsk } 1319a98e9dbfSaguzovsk mutex_enter(pmtx); 1320a98e9dbfSaguzovsk while ((plinkp = pheadp->p_lnext) != pheadp) { 1321a98e9dbfSaguzovsk pcp = plink2pcache(plinkp); 1322a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 1323a98e9dbfSaguzovsk ASSERT(pcp->p_htag0 == htag0); 1324a98e9dbfSaguzovsk hp = pcp->p_hashp; 1325a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 13267c478bd9Sstevel@tonic-gate if (pcp->p_active) { 1327a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 13287c478bd9Sstevel@tonic-gate break; 13297c478bd9Sstevel@tonic-gate } 1330a98e9dbfSaguzovsk ASSERT(plinkp->p_lprev == pheadp); 1331a98e9dbfSaguzovsk pheadp->p_lnext = plinkp->p_lnext; 1332a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = pheadp; 13337c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 13347c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 13357c478bd9Sstevel@tonic-gate pcp->p_hprev = delcallb_list; 13367c478bd9Sstevel@tonic-gate delcallb_list = pcp; 1337a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1338a98e9dbfSaguzovsk seg_premove_abuck(hp, 0); 1339a98e9dbfSaguzovsk } 1340a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 13417c478bd9Sstevel@tonic-gate } 1342a98e9dbfSaguzovsk mutex_exit(pmtx); 13437c478bd9Sstevel@tonic-gate } 13447c478bd9Sstevel@tonic-gate while (delcallb_list != NULL) { 13457c478bd9Sstevel@tonic-gate pcp = delcallb_list; 13467c478bd9Sstevel@tonic-gate delcallb_list = pcp->p_hprev; 1347a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1348a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, 1349a98e9dbfSaguzovsk pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 0); 1350a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1351a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 13527c478bd9Sstevel@tonic-gate } 1353a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1354a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 13557c478bd9Sstevel@tonic-gate seg_plocked -= npages; 1356a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 1357a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 1358a98e9dbfSaguzovsk seg_plocked_window -= npages; 1359a98e9dbfSaguzovsk } 1360a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 13617c478bd9Sstevel@tonic-gate } 13627c478bd9Sstevel@tonic-gate 13637c478bd9Sstevel@tonic-gate static void seg_pinit_mem_config(void); 13647c478bd9Sstevel@tonic-gate 13657c478bd9Sstevel@tonic-gate /* 13667c478bd9Sstevel@tonic-gate * setup the pagelock cache 13677c478bd9Sstevel@tonic-gate */ 13687c478bd9Sstevel@tonic-gate static void 13697c478bd9Sstevel@tonic-gate seg_pinit(void) 13707c478bd9Sstevel@tonic-gate { 13717c478bd9Sstevel@tonic-gate struct seg_phash *hp; 1372a98e9dbfSaguzovsk ulong_t i; 1373a98e9dbfSaguzovsk pgcnt_t physmegs; 13747c478bd9Sstevel@tonic-gate 1375a98e9dbfSaguzovsk seg_plocked = 0; 1376a98e9dbfSaguzovsk seg_plocked_window = 0; 13777c478bd9Sstevel@tonic-gate 1378a98e9dbfSaguzovsk if (segpcache_enabled == 0) { 1379a98e9dbfSaguzovsk seg_phashsize_win = 0; 1380a98e9dbfSaguzovsk seg_phashsize_wired = 0; 1381a98e9dbfSaguzovsk seg_pdisabled = 1; 1382a98e9dbfSaguzovsk return; 1383a98e9dbfSaguzovsk } 13847c478bd9Sstevel@tonic-gate 1385a98e9dbfSaguzovsk seg_pdisabled = 0; 1386a98e9dbfSaguzovsk seg_pkmcache = kmem_cache_create("seg_pcache", 1387a98e9dbfSaguzovsk sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); 1388a98e9dbfSaguzovsk if (segpcache_pcp_maxage_ticks <= 0) { 1389a98e9dbfSaguzovsk segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; 1390a98e9dbfSaguzovsk } 1391a98e9dbfSaguzovsk seg_pmax_pcpage = segpcache_pcp_maxage_ticks; 1392a98e9dbfSaguzovsk seg_pathr_empty_ahb = 0; 
1393a98e9dbfSaguzovsk seg_pathr_full_ahb = 0; 1394a98e9dbfSaguzovsk seg_pshrink_shift = segpcache_shrink_shift; 1395a98e9dbfSaguzovsk seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); 13967c478bd9Sstevel@tonic-gate 1397a98e9dbfSaguzovsk mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); 1398a98e9dbfSaguzovsk mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); 1399a98e9dbfSaguzovsk mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); 1400a98e9dbfSaguzovsk cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); 1401a98e9dbfSaguzovsk 1402a98e9dbfSaguzovsk physmegs = physmem >> (20 - PAGESHIFT); 1403a98e9dbfSaguzovsk 1404a98e9dbfSaguzovsk /* 1405a98e9dbfSaguzovsk * If segpcache_hashsize_win was not set in /etc/system or it has 1406a98e9dbfSaguzovsk * absurd value set it to a default. 1407a98e9dbfSaguzovsk */ 1408a98e9dbfSaguzovsk if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { 1409a98e9dbfSaguzovsk /* 1410a98e9dbfSaguzovsk * Create one bucket per 32K (or at least per 8 pages) of 1411a98e9dbfSaguzovsk * available memory. 
1412a98e9dbfSaguzovsk */ 1413a98e9dbfSaguzovsk pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); 1414a98e9dbfSaguzovsk segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); 1415a98e9dbfSaguzovsk } 1416a98e9dbfSaguzovsk if (!ISP2(segpcache_hashsize_win)) { 1417a98e9dbfSaguzovsk ulong_t rndfac = ~(1UL << 1418a98e9dbfSaguzovsk (highbit(segpcache_hashsize_win) - 1)); 1419a98e9dbfSaguzovsk rndfac &= segpcache_hashsize_win; 1420a98e9dbfSaguzovsk segpcache_hashsize_win += rndfac; 1421a98e9dbfSaguzovsk segpcache_hashsize_win = 1 << 1422a98e9dbfSaguzovsk (highbit(segpcache_hashsize_win) - 1); 1423a98e9dbfSaguzovsk } 1424a98e9dbfSaguzovsk seg_phashsize_win = segpcache_hashsize_win; 1425a98e9dbfSaguzovsk seg_phashtab_win = kmem_zalloc( 1426a98e9dbfSaguzovsk seg_phashsize_win * sizeof (struct seg_phash), 1427a98e9dbfSaguzovsk KM_SLEEP); 1428a98e9dbfSaguzovsk for (i = 0; i < seg_phashsize_win; i++) { 1429a98e9dbfSaguzovsk hp = &seg_phashtab_win[i]; 1430a98e9dbfSaguzovsk hp->p_hnext = (struct seg_pcache *)hp; 1431a98e9dbfSaguzovsk hp->p_hprev = (struct seg_pcache *)hp; 1432a98e9dbfSaguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1433a98e9dbfSaguzovsk } 1434a98e9dbfSaguzovsk 1435a98e9dbfSaguzovsk seg_pahcur = 0; 1436a98e9dbfSaguzovsk seg_pathr_on = 0; 1437a98e9dbfSaguzovsk seg_pahhead[0].p_lnext = &seg_pahhead[0]; 1438a98e9dbfSaguzovsk seg_pahhead[0].p_lprev = &seg_pahhead[0]; 1439a98e9dbfSaguzovsk seg_pahhead[1].p_lnext = &seg_pahhead[1]; 1440a98e9dbfSaguzovsk seg_pahhead[1].p_lprev = &seg_pahhead[1]; 1441a98e9dbfSaguzovsk 1442a98e9dbfSaguzovsk /* 1443a98e9dbfSaguzovsk * If segpcache_hashsize_wired was not set in /etc/system or it has 1444a98e9dbfSaguzovsk * absurd value set it to a default. 1445a98e9dbfSaguzovsk */ 1446a98e9dbfSaguzovsk if (segpcache_hashsize_wired == 0 || 1447a98e9dbfSaguzovsk segpcache_hashsize_wired > physmem / 4) { 1448a98e9dbfSaguzovsk /* 1449a98e9dbfSaguzovsk * Choose segpcache_hashsize_wired based on physmem. 
1450a98e9dbfSaguzovsk * Create a bucket per 128K bytes upto 256K buckets. 1451a98e9dbfSaguzovsk */ 1452a98e9dbfSaguzovsk if (physmegs < 20 * 1024) { 1453a98e9dbfSaguzovsk segpcache_hashsize_wired = MAX(1024, physmegs << 3); 1454a98e9dbfSaguzovsk } else { 1455a98e9dbfSaguzovsk segpcache_hashsize_wired = 256 * 1024; 14567c478bd9Sstevel@tonic-gate } 14577c478bd9Sstevel@tonic-gate } 1458a98e9dbfSaguzovsk if (!ISP2(segpcache_hashsize_wired)) { 1459a98e9dbfSaguzovsk segpcache_hashsize_wired = 1 << 1460a98e9dbfSaguzovsk highbit(segpcache_hashsize_wired); 1461a98e9dbfSaguzovsk } 1462a98e9dbfSaguzovsk seg_phashsize_wired = segpcache_hashsize_wired; 1463a98e9dbfSaguzovsk seg_phashtab_wired = kmem_zalloc( 1464a98e9dbfSaguzovsk seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 1465a98e9dbfSaguzovsk for (i = 0; i < seg_phashsize_wired; i++) { 1466a98e9dbfSaguzovsk hp = (struct seg_phash *)&seg_phashtab_wired[i]; 1467a98e9dbfSaguzovsk hp->p_hnext = (struct seg_pcache *)hp; 1468a98e9dbfSaguzovsk hp->p_hprev = (struct seg_pcache *)hp; 1469a98e9dbfSaguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1470a98e9dbfSaguzovsk } 14717c478bd9Sstevel@tonic-gate 1472a98e9dbfSaguzovsk if (segpcache_maxwindow == 0) { 1473a98e9dbfSaguzovsk if (physmegs < 64) { 1474a98e9dbfSaguzovsk /* 3% of memory */ 1475a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 5; 1476a98e9dbfSaguzovsk } else if (physmegs < 512) { 1477a98e9dbfSaguzovsk /* 12% of memory */ 1478a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 3; 1479a98e9dbfSaguzovsk } else if (physmegs < 1024) { 1480a98e9dbfSaguzovsk /* 25% of memory */ 1481a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 2; 1482a98e9dbfSaguzovsk } else if (physmegs < 2048) { 1483a98e9dbfSaguzovsk /* 50% of memory */ 1484a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 1; 1485a98e9dbfSaguzovsk } else { 1486a98e9dbfSaguzovsk /* no limit */ 1487a98e9dbfSaguzovsk segpcache_maxwindow = (pgcnt_t)-1; 1488a98e9dbfSaguzovsk } 
1489a98e9dbfSaguzovsk } 1490a98e9dbfSaguzovsk seg_pmaxwindow = segpcache_maxwindow; 14917c478bd9Sstevel@tonic-gate seg_pinit_mem_config(); 14927c478bd9Sstevel@tonic-gate } 14937c478bd9Sstevel@tonic-gate 14947c478bd9Sstevel@tonic-gate /* 14957c478bd9Sstevel@tonic-gate * called by pageout if memory is low 14967c478bd9Sstevel@tonic-gate */ 14977c478bd9Sstevel@tonic-gate void 14987c478bd9Sstevel@tonic-gate seg_preap(void) 14997c478bd9Sstevel@tonic-gate { 15007c478bd9Sstevel@tonic-gate /* 1501a98e9dbfSaguzovsk * if the cache is off or empty, return 15027c478bd9Sstevel@tonic-gate */ 1503a98e9dbfSaguzovsk if (seg_plocked_window == 0) { 15047c478bd9Sstevel@tonic-gate return; 15057c478bd9Sstevel@tonic-gate } 1506a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 15077c478bd9Sstevel@tonic-gate 1508a98e9dbfSaguzovsk /* 1509a98e9dbfSaguzovsk * If somebody is already purging pcache 1510a98e9dbfSaguzovsk * just return. 1511a98e9dbfSaguzovsk */ 1512a98e9dbfSaguzovsk if (seg_pdisabled) { 1513a98e9dbfSaguzovsk return; 1514a98e9dbfSaguzovsk } 1515a98e9dbfSaguzovsk 1516a98e9dbfSaguzovsk cv_signal(&seg_pasync_cv); 1517a98e9dbfSaguzovsk } 15187c478bd9Sstevel@tonic-gate 15197c478bd9Sstevel@tonic-gate /* 15207c478bd9Sstevel@tonic-gate * run as a backgroud thread and reclaim pagelock 15217c478bd9Sstevel@tonic-gate * pages which have not been used recently 15227c478bd9Sstevel@tonic-gate */ 15237c478bd9Sstevel@tonic-gate void 15247c478bd9Sstevel@tonic-gate seg_pasync_thread(void) 15257c478bd9Sstevel@tonic-gate { 15267c478bd9Sstevel@tonic-gate callb_cpr_t cpr_info; 15277c478bd9Sstevel@tonic-gate 1528a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 1529a98e9dbfSaguzovsk thread_exit(); 1530a98e9dbfSaguzovsk /*NOTREACHED*/ 1531a98e9dbfSaguzovsk } 1532a98e9dbfSaguzovsk 1533a98e9dbfSaguzovsk seg_pasync_thr = curthread; 15347c478bd9Sstevel@tonic-gate 1535a98e9dbfSaguzovsk CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, 1536a98e9dbfSaguzovsk callb_generic_cpr, "seg_pasync"); 15377c478bd9Sstevel@tonic-gate 
1538a98e9dbfSaguzovsk if (segpcache_reap_ticks <= 0) { 1539a98e9dbfSaguzovsk segpcache_reap_ticks = segpcache_reap_sec * hz; 15407c478bd9Sstevel@tonic-gate } 15417c478bd9Sstevel@tonic-gate 1542a98e9dbfSaguzovsk mutex_enter(&seg_pasync_mtx); 15437c478bd9Sstevel@tonic-gate for (;;) { 15447c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_BEGIN(&cpr_info); 1545*d3d50737SRafael Vanoni (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx, 1546*d3d50737SRafael Vanoni segpcache_reap_ticks, TR_CLOCK_TICK); 1547a98e9dbfSaguzovsk CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); 1548a98e9dbfSaguzovsk if (seg_pdisabled == 0) { 1549a98e9dbfSaguzovsk seg_ppurge_async(0); 1550a98e9dbfSaguzovsk } 15517c478bd9Sstevel@tonic-gate } 15527c478bd9Sstevel@tonic-gate } 15537c478bd9Sstevel@tonic-gate 15547c478bd9Sstevel@tonic-gate static struct kmem_cache *seg_cache; 15557c478bd9Sstevel@tonic-gate 15567c478bd9Sstevel@tonic-gate /* 15577c478bd9Sstevel@tonic-gate * Initialize segment management data structures. 15587c478bd9Sstevel@tonic-gate */ 15597c478bd9Sstevel@tonic-gate void 15607c478bd9Sstevel@tonic-gate seg_init(void) 15617c478bd9Sstevel@tonic-gate { 15627c478bd9Sstevel@tonic-gate kstat_t *ksp; 15637c478bd9Sstevel@tonic-gate 1564a98e9dbfSaguzovsk seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 1565a98e9dbfSaguzovsk 0, NULL, NULL, NULL, NULL, NULL, 0); 15667c478bd9Sstevel@tonic-gate 15677c478bd9Sstevel@tonic-gate ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, 1568c6f08383Sjj segadvstat_ndata, KSTAT_FLAG_VIRTUAL); 15697c478bd9Sstevel@tonic-gate if (ksp) { 15707c478bd9Sstevel@tonic-gate ksp->ks_data = (void *)segadvstat_ptr; 15717c478bd9Sstevel@tonic-gate kstat_install(ksp); 15727c478bd9Sstevel@tonic-gate } 15737c478bd9Sstevel@tonic-gate 15747c478bd9Sstevel@tonic-gate seg_pinit(); 15757c478bd9Sstevel@tonic-gate } 15767c478bd9Sstevel@tonic-gate 15777c478bd9Sstevel@tonic-gate /* 15787c478bd9Sstevel@tonic-gate * Allocate a segment to cover [base, base+size] 
15797c478bd9Sstevel@tonic-gate * and attach it to the specified address space. 15807c478bd9Sstevel@tonic-gate */ 15817c478bd9Sstevel@tonic-gate struct seg * 15827c478bd9Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size) 15837c478bd9Sstevel@tonic-gate { 15847c478bd9Sstevel@tonic-gate struct seg *new; 15857c478bd9Sstevel@tonic-gate caddr_t segbase; 15867c478bd9Sstevel@tonic-gate size_t segsize; 15877c478bd9Sstevel@tonic-gate 15887c478bd9Sstevel@tonic-gate segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); 15897c478bd9Sstevel@tonic-gate segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - 15907c478bd9Sstevel@tonic-gate (uintptr_t)segbase; 15917c478bd9Sstevel@tonic-gate 15927c478bd9Sstevel@tonic-gate if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) 15937c478bd9Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */ 15947c478bd9Sstevel@tonic-gate 15957c478bd9Sstevel@tonic-gate if (as != &kas && 15967c478bd9Sstevel@tonic-gate valid_usr_range(segbase, segsize, 0, as, 15977c478bd9Sstevel@tonic-gate as->a_userlimit) != RANGE_OKAY) 15987c478bd9Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */ 15997c478bd9Sstevel@tonic-gate 16007c478bd9Sstevel@tonic-gate new = kmem_cache_alloc(seg_cache, KM_SLEEP); 16017c478bd9Sstevel@tonic-gate new->s_ops = NULL; 16027c478bd9Sstevel@tonic-gate new->s_data = NULL; 16037c478bd9Sstevel@tonic-gate new->s_szc = 0; 16047c478bd9Sstevel@tonic-gate new->s_flags = 0; 1605a98e9dbfSaguzovsk mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); 1606a98e9dbfSaguzovsk new->s_phead.p_lnext = &new->s_phead; 1607a98e9dbfSaguzovsk new->s_phead.p_lprev = &new->s_phead; 16087c478bd9Sstevel@tonic-gate if (seg_attach(as, segbase, segsize, new) < 0) { 16097c478bd9Sstevel@tonic-gate kmem_cache_free(seg_cache, new); 16107c478bd9Sstevel@tonic-gate return ((struct seg *)NULL); 16117c478bd9Sstevel@tonic-gate } 16127c478bd9Sstevel@tonic-gate /* caller must fill in ops, data */ 
16137c478bd9Sstevel@tonic-gate return (new); 16147c478bd9Sstevel@tonic-gate } 16157c478bd9Sstevel@tonic-gate 16167c478bd9Sstevel@tonic-gate /* 16177c478bd9Sstevel@tonic-gate * Attach a segment to the address space. Used by seg_alloc() 16187c478bd9Sstevel@tonic-gate * and for kernel startup to attach to static segments. 16197c478bd9Sstevel@tonic-gate */ 16207c478bd9Sstevel@tonic-gate int 16217c478bd9Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) 16227c478bd9Sstevel@tonic-gate { 16237c478bd9Sstevel@tonic-gate seg->s_as = as; 16247c478bd9Sstevel@tonic-gate seg->s_base = base; 16257c478bd9Sstevel@tonic-gate seg->s_size = size; 16267c478bd9Sstevel@tonic-gate 16277c478bd9Sstevel@tonic-gate /* 16287c478bd9Sstevel@tonic-gate * as_addseg() will add the segment at the appropraite point 16297c478bd9Sstevel@tonic-gate * in the list. It will return -1 if there is overlap with 16307c478bd9Sstevel@tonic-gate * an already existing segment. 16317c478bd9Sstevel@tonic-gate */ 16327c478bd9Sstevel@tonic-gate return (as_addseg(as, seg)); 16337c478bd9Sstevel@tonic-gate } 16347c478bd9Sstevel@tonic-gate 16357c478bd9Sstevel@tonic-gate /* 16367c478bd9Sstevel@tonic-gate * Unmap a segment and free it from its associated address space. 16377c478bd9Sstevel@tonic-gate * This should be called by anybody who's finished with a whole segment's 16387c478bd9Sstevel@tonic-gate * mapping. Just calls SEGOP_UNMAP() on the whole mapping . It is the 16397c478bd9Sstevel@tonic-gate * responsibility of the segment driver to unlink the the segment 16407c478bd9Sstevel@tonic-gate * from the address space, and to free public and private data structures 16417c478bd9Sstevel@tonic-gate * associated with the segment. (This is typically done by a call to 16427c478bd9Sstevel@tonic-gate * seg_free()). 
16437c478bd9Sstevel@tonic-gate */ 16447c478bd9Sstevel@tonic-gate void 16457c478bd9Sstevel@tonic-gate seg_unmap(struct seg *seg) 16467c478bd9Sstevel@tonic-gate { 16477c478bd9Sstevel@tonic-gate #ifdef DEBUG 16487c478bd9Sstevel@tonic-gate int ret; 16497c478bd9Sstevel@tonic-gate #endif /* DEBUG */ 16507c478bd9Sstevel@tonic-gate 16517c478bd9Sstevel@tonic-gate ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 16527c478bd9Sstevel@tonic-gate 16537c478bd9Sstevel@tonic-gate /* Shouldn't have called seg_unmap if mapping isn't yet established */ 16547c478bd9Sstevel@tonic-gate ASSERT(seg->s_data != NULL); 16557c478bd9Sstevel@tonic-gate 16567c478bd9Sstevel@tonic-gate /* Unmap the whole mapping */ 16577c478bd9Sstevel@tonic-gate #ifdef DEBUG 16587c478bd9Sstevel@tonic-gate ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16597c478bd9Sstevel@tonic-gate ASSERT(ret == 0); 16607c478bd9Sstevel@tonic-gate #else 16617c478bd9Sstevel@tonic-gate SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16627c478bd9Sstevel@tonic-gate #endif /* DEBUG */ 16637c478bd9Sstevel@tonic-gate } 16647c478bd9Sstevel@tonic-gate 16657c478bd9Sstevel@tonic-gate /* 16667c478bd9Sstevel@tonic-gate * Free the segment from its associated as. This should only be called 16677c478bd9Sstevel@tonic-gate * if a mapping to the segment has not yet been established (e.g., if 16687c478bd9Sstevel@tonic-gate * an error occurs in the middle of doing an as_map when the segment 16697c478bd9Sstevel@tonic-gate * has already been partially set up) or if it has already been deleted 16707c478bd9Sstevel@tonic-gate * (e.g., from a segment driver unmap routine if the unmap applies to the 16717c478bd9Sstevel@tonic-gate * entire segment). If the mapping is currently set up then seg_unmap() should 16727c478bd9Sstevel@tonic-gate * be called instead. 
16737c478bd9Sstevel@tonic-gate */ 16747c478bd9Sstevel@tonic-gate void 16757c478bd9Sstevel@tonic-gate seg_free(struct seg *seg) 16767c478bd9Sstevel@tonic-gate { 16777c478bd9Sstevel@tonic-gate register struct as *as = seg->s_as; 16787c478bd9Sstevel@tonic-gate struct seg *tseg = as_removeseg(as, seg); 16797c478bd9Sstevel@tonic-gate 16807c478bd9Sstevel@tonic-gate ASSERT(tseg == seg); 16817c478bd9Sstevel@tonic-gate 16827c478bd9Sstevel@tonic-gate /* 16837c478bd9Sstevel@tonic-gate * If the segment private data field is NULL, 16847c478bd9Sstevel@tonic-gate * then segment driver is not attached yet. 16857c478bd9Sstevel@tonic-gate */ 16867c478bd9Sstevel@tonic-gate if (seg->s_data != NULL) 16877c478bd9Sstevel@tonic-gate SEGOP_FREE(seg); 16887c478bd9Sstevel@tonic-gate 1689a98e9dbfSaguzovsk mutex_destroy(&seg->s_pmtx); 1690a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 1691a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 16927c478bd9Sstevel@tonic-gate kmem_cache_free(seg_cache, seg); 16937c478bd9Sstevel@tonic-gate } 16947c478bd9Sstevel@tonic-gate 16957c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 16967c478bd9Sstevel@tonic-gate static void 16977c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add( 16987c478bd9Sstevel@tonic-gate void *arg, 16997c478bd9Sstevel@tonic-gate pgcnt_t delta_pages) 17007c478bd9Sstevel@tonic-gate { 17017c478bd9Sstevel@tonic-gate /* Nothing to do. */ 17027c478bd9Sstevel@tonic-gate } 17037c478bd9Sstevel@tonic-gate 1704cee1d74bSjfrank void 1705cee1d74bSjfrank seg_p_enable(void) 1706cee1d74bSjfrank { 1707a98e9dbfSaguzovsk mutex_enter(&seg_pcache_mtx); 1708a98e9dbfSaguzovsk ASSERT(seg_pdisabled != 0); 1709a98e9dbfSaguzovsk seg_pdisabled--; 1710a98e9dbfSaguzovsk mutex_exit(&seg_pcache_mtx); 1711cee1d74bSjfrank } 1712cee1d74bSjfrank 17137c478bd9Sstevel@tonic-gate /* 1714cee1d74bSjfrank * seg_p_disable - disables seg_pcache, and then attempts to empty the 1715cee1d74bSjfrank * cache. 
1716cee1d74bSjfrank * Returns SEGP_SUCCESS if the cache was successfully emptied, or 1717cee1d74bSjfrank * SEGP_FAIL if the cache could not be emptied. 17187c478bd9Sstevel@tonic-gate */ 1719cee1d74bSjfrank int 1720cee1d74bSjfrank seg_p_disable(void) 17217c478bd9Sstevel@tonic-gate { 17227c478bd9Sstevel@tonic-gate pgcnt_t old_plocked; 17237c478bd9Sstevel@tonic-gate int stall_count = 0; 17247c478bd9Sstevel@tonic-gate 1725a98e9dbfSaguzovsk mutex_enter(&seg_pcache_mtx); 1726a98e9dbfSaguzovsk seg_pdisabled++; 1727a98e9dbfSaguzovsk ASSERT(seg_pdisabled != 0); 1728a98e9dbfSaguzovsk mutex_exit(&seg_pcache_mtx); 17297c478bd9Sstevel@tonic-gate 17307c478bd9Sstevel@tonic-gate /* 17317c478bd9Sstevel@tonic-gate * Attempt to empty the cache. Terminate if seg_plocked does not 17327c478bd9Sstevel@tonic-gate * diminish with SEGP_STALL_THRESHOLD consecutive attempts. 17337c478bd9Sstevel@tonic-gate */ 17347c478bd9Sstevel@tonic-gate while (seg_plocked != 0) { 1735a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 17367c478bd9Sstevel@tonic-gate old_plocked = seg_plocked; 1737a98e9dbfSaguzovsk seg_ppurge_async(1); 17387c478bd9Sstevel@tonic-gate if (seg_plocked == old_plocked) { 17397c478bd9Sstevel@tonic-gate if (stall_count++ > SEGP_STALL_THRESHOLD) { 1740cee1d74bSjfrank return (SEGP_FAIL); 17417c478bd9Sstevel@tonic-gate } 17427c478bd9Sstevel@tonic-gate } else 17437c478bd9Sstevel@tonic-gate stall_count = 0; 17447c478bd9Sstevel@tonic-gate if (seg_plocked != 0) 17457c478bd9Sstevel@tonic-gate delay(hz/SEGP_PREDEL_DELAY_FACTOR); 17467c478bd9Sstevel@tonic-gate } 1747cee1d74bSjfrank return (SEGP_SUCCESS); 1748cee1d74bSjfrank } 1749cee1d74bSjfrank 1750cee1d74bSjfrank /* 1751cee1d74bSjfrank * Attempt to purge seg_pcache. May need to return before this has 1752cee1d74bSjfrank * completed to allow other pre_del callbacks to unlock pages. 
This is 1753cee1d74bSjfrank * ok because: 1754a98e9dbfSaguzovsk * 1) The seg_pdisabled flag has been set so at least we won't 1755cee1d74bSjfrank * cache anymore locks and the locks we couldn't purge 1756cee1d74bSjfrank * will not be held if they do get released by a subsequent 1757cee1d74bSjfrank * pre-delete callback. 1758cee1d74bSjfrank * 1759cee1d74bSjfrank * 2) The rest of the memory delete thread processing does not 1760cee1d74bSjfrank * depend on the changes made in this pre-delete callback. No 1761cee1d74bSjfrank * panics will result, the worst that will happen is that the 1762cee1d74bSjfrank * DR code will timeout and cancel the delete. 1763cee1d74bSjfrank */ 1764cee1d74bSjfrank /*ARGSUSED*/ 1765cee1d74bSjfrank static int 1766cee1d74bSjfrank seg_p_mem_config_pre_del( 1767cee1d74bSjfrank void *arg, 1768cee1d74bSjfrank pgcnt_t delta_pages) 1769cee1d74bSjfrank { 1770a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 1771a98e9dbfSaguzovsk return (0); 1772a98e9dbfSaguzovsk } 1773cee1d74bSjfrank if (seg_p_disable() != SEGP_SUCCESS) 1774cee1d74bSjfrank cmn_err(CE_NOTE, 1775cee1d74bSjfrank "!Pre-delete couldn't purge"" pagelock cache - continuing"); 17767c478bd9Sstevel@tonic-gate return (0); 17777c478bd9Sstevel@tonic-gate } 17787c478bd9Sstevel@tonic-gate 17797c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 17807c478bd9Sstevel@tonic-gate static void 17817c478bd9Sstevel@tonic-gate seg_p_mem_config_post_del( 17827c478bd9Sstevel@tonic-gate void *arg, 17837c478bd9Sstevel@tonic-gate pgcnt_t delta_pages, 17847c478bd9Sstevel@tonic-gate int cancelled) 17857c478bd9Sstevel@tonic-gate { 1786a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 1787a98e9dbfSaguzovsk return; 1788a98e9dbfSaguzovsk } 1789cee1d74bSjfrank seg_p_enable(); 17907c478bd9Sstevel@tonic-gate } 17917c478bd9Sstevel@tonic-gate 17927c478bd9Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = { 17937c478bd9Sstevel@tonic-gate KPHYSM_SETUP_VECTOR_VERSION, 17947c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add, 
17957c478bd9Sstevel@tonic-gate seg_p_mem_config_pre_del, 17967c478bd9Sstevel@tonic-gate seg_p_mem_config_post_del, 17977c478bd9Sstevel@tonic-gate }; 17987c478bd9Sstevel@tonic-gate 17997c478bd9Sstevel@tonic-gate static void 18007c478bd9Sstevel@tonic-gate seg_pinit_mem_config(void) 18017c478bd9Sstevel@tonic-gate { 18027c478bd9Sstevel@tonic-gate int ret; 18037c478bd9Sstevel@tonic-gate 18047c478bd9Sstevel@tonic-gate ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); 18057c478bd9Sstevel@tonic-gate /* 18067c478bd9Sstevel@tonic-gate * Want to catch this in the debug kernel. At run time, if the 18077c478bd9Sstevel@tonic-gate * callbacks don't get run all will be OK as the disable just makes 18087c478bd9Sstevel@tonic-gate * it more likely that the pages can be collected. 18097c478bd9Sstevel@tonic-gate */ 18107c478bd9Sstevel@tonic-gate ASSERT(ret == 0); 18117c478bd9Sstevel@tonic-gate } 18120209230bSgjelinek 18130209230bSgjelinek /* 18140209230bSgjelinek * Verify that segment is not a shared anonymous segment which reserves 18150209230bSgjelinek * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered 18160209230bSgjelinek * from one zone to another if any segments are shared. This is because the 18170209230bSgjelinek * last process to exit will credit the swap reservation. This could lead 18180209230bSgjelinek * to the swap being reserved by one zone, and credited to another. 
18190209230bSgjelinek */ 18200209230bSgjelinek boolean_t 18210209230bSgjelinek seg_can_change_zones(struct seg *seg) 18220209230bSgjelinek { 18230209230bSgjelinek struct segvn_data *svd; 18240209230bSgjelinek 18250209230bSgjelinek if (seg->s_ops == &segspt_shmops) 18260209230bSgjelinek return (B_FALSE); 18270209230bSgjelinek 18280209230bSgjelinek if (seg->s_ops == &segvn_ops) { 18290209230bSgjelinek svd = (struct segvn_data *)seg->s_data; 18300209230bSgjelinek if (svd->type == MAP_SHARED && 18310209230bSgjelinek svd->amp != NULL && 18320209230bSgjelinek svd->amp->swresv > 0) 18330209230bSgjelinek return (B_FALSE); 18340209230bSgjelinek } 18350209230bSgjelinek return (B_TRUE); 18360209230bSgjelinek } 18370209230bSgjelinek 18380209230bSgjelinek /* 18390209230bSgjelinek * Return swap reserved by a segment backing a private mapping. 18400209230bSgjelinek */ 18410209230bSgjelinek size_t 18420209230bSgjelinek seg_swresv(struct seg *seg) 18430209230bSgjelinek { 18440209230bSgjelinek struct segvn_data *svd; 18450209230bSgjelinek size_t swap = 0; 18460209230bSgjelinek 18470209230bSgjelinek if (seg->s_ops == &segvn_ops) { 18480209230bSgjelinek svd = (struct segvn_data *)seg->s_data; 18490209230bSgjelinek if (svd->type == MAP_PRIVATE && svd->swresv > 0) 18500209230bSgjelinek swap = svd->swresv; 18510209230bSgjelinek } 18520209230bSgjelinek return (swap); 18530209230bSgjelinek } 1854