17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 50209230bSgjelinek * Common Development and Distribution License (the "License"). 60209230bSgjelinek * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 22c6f08383Sjj * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 237c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
247c478bd9Sstevel@tonic-gate */ 257c478bd9Sstevel@tonic-gate 267c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 277c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate /* 307c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 317c478bd9Sstevel@tonic-gate * The Regents of the University of California 327c478bd9Sstevel@tonic-gate * All Rights Reserved 337c478bd9Sstevel@tonic-gate * 347c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 357c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 367c478bd9Sstevel@tonic-gate * contributors. 377c478bd9Sstevel@tonic-gate */ 387c478bd9Sstevel@tonic-gate 397c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 407c478bd9Sstevel@tonic-gate 417c478bd9Sstevel@tonic-gate /* 427c478bd9Sstevel@tonic-gate * VM - segment management. 437c478bd9Sstevel@tonic-gate */ 447c478bd9Sstevel@tonic-gate 457c478bd9Sstevel@tonic-gate #include <sys/types.h> 467c478bd9Sstevel@tonic-gate #include <sys/inttypes.h> 477c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 487c478bd9Sstevel@tonic-gate #include <sys/param.h> 497c478bd9Sstevel@tonic-gate #include <sys/systm.h> 507c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 51*a98e9dbfSaguzovsk #include <sys/sysmacros.h> 527c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 53*a98e9dbfSaguzovsk #include <sys/tuneable.h> 547c478bd9Sstevel@tonic-gate #include <sys/debug.h> 55*a98e9dbfSaguzovsk #include <sys/fs/swapnode.h> 567c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 577c478bd9Sstevel@tonic-gate #include <sys/callb.h> 587c478bd9Sstevel@tonic-gate #include <sys/mem_config.h> 590209230bSgjelinek #include <sys/mman.h> 607c478bd9Sstevel@tonic-gate 617c478bd9Sstevel@tonic-gate #include <vm/hat.h> 627c478bd9Sstevel@tonic-gate #include <vm/as.h> 637c478bd9Sstevel@tonic-gate #include <vm/seg.h> 
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 *
 * Each seg_pcache caches one shadow page list (p_pp) for a previously
 * pagelocked address range. Entries are doubly linked into a hash bucket
 * via p_hnext/p_hprev and, in addition, onto a per segment/amp list via
 * p_plink. An entry is looked up by the (p_htag0, p_addr, p_len) tuple;
 * p_htag0 is either a seg or an anon_map pointer depending on how the
 * range was pagelocked.
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};

/*
 * Hash bucket for non wired shadow list entries. In addition to the entry
 * chain each bucket carries p_halink[2] so that non-empty buckets can be
 * threaded onto the two "active bucket" lists used by the asynchronous
 * purge thread (see the seg_padd_abuck()/seg_premove_abuck() comment below).
 */
struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};

/*
 * Hash bucket for wired (SEGP_FORCE_WIRED) entries. Wired buckets are never
 * placed on the active bucket lists, so no p_halink[] is needed; the layout
 * must otherwise match the head of struct seg_phash since P_HASHBP() casts
 * between the two.
 */
struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};

/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */

static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 * pctrl1 holds the mostly read-only configuration state, pctrl2 the
 * frequently written counters/list heads (protected by p_mem_mtx), and
 * pctrl3 the async purge thread parameters and stats. The pads keep each
 * struct at (presumably) a 64-byte multiple on LP64 — the #pragma align
 * directives above place each on its own cache line.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t p_locked_win;	/* # pages from window */
	pgcnt_t p_locked;	/* # of pages cached by pagelock */
	uchar_t	p_ahcur;	/* current active links for insert/delete */
	uchar_t p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

/*
 * Convenience aliases so the rest of the file can refer to the packed
 * control-struct members by their traditional names.
 */
#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

/* hash masks; both table sizes are powers of two */
#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define	IS_PFLAGS_WIRED(flags)	((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp)	IS_PFLAGS_WIRED((pcp)->p_flags)

/* age helpers: ticks elapsed since time t / since pcp was last used */
#define	LBOLT_DELTA(t)	((ulong_t)(lbolt - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * Select the hash bucket for a cache entry. Wired entries hash only on
 * htag0; non wired entries mix in the address, shifted either by the page
 * shift encoded in the upper 16 bits of flags (when SEGP_PSHIFT is set) or
 * by the segment's large-page shift.
 *
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))

/*
 * Entry matches if it starts at addr and covers at least len bytes.
 *
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

/* as P_MATCH but additionally requires the same shadow list pointer */
#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

/* convert an embedded pcache_link_t back to its containing structure */
#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) - \
    offsetof(struct seg_phash, p_halink[l])))

/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
 * buckets. The other list is used by asynchronous purge thread. This allows
 * the purge thread to walk its active list without holding seg_pmem_mtx for a
 * long time. When asynchronous thread is done with its list it switches to
 * current active list and makes the list it just finished processing as
 * current active list.
 *
 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
 * yet on any list. seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on current list it will be always removed. Otherwise
 * the bucket is only removed if asynchronous purge thread is not currently
 * running or seg_premove_abuck() is called by asynchronous purge thread
 * itself. A given bucket can only be on one of active lists at a time. These
 * routines should be called with per bucket lock held. The routines use
 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
 * the first entry is added to the bucket chain and seg_premove_abuck() must
 * be called after the last pcp entry is deleted from its chain. Per bucket
 * lock should be held by the callers. This avoids a potential race condition
 * when seg_premove_abuck() removes a bucket after pcp entries are added to
 * its list after the caller checked that the bucket has no entries. (this
 * race would cause a loss of an active bucket from the active lists).
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
 */
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	/*
	 * Caller holds the bucket lock and has just added the first pcp
	 * entry, so the chain must contain exactly one non wired entry.
	 */
	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket can already be on one of active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * before.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		/* already on the current active list; nothing to do */
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix async thread can't yet remove
	 * it since we hold here per bucket lock. In this case just return
	 * since async thread will eventually find and process this bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of current active list.
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}

/*
 * Remove a now-empty bucket from whichever active list it is on. athr is
 * non-zero when called from the asynchronous purge thread itself, in which
 * case the bucket is known to be on the non-current list and seg_pmem_mtx
 * is not taken (NOTE(review): presumably the async thread owns that list
 * exclusively while seg_pathr_on is set — confirm against the thread code).
 */
static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	/* caller must hold the bucket lock and the chain must be empty */
	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by asynchronous thread that found this bucket
		 * on not currently active (i.e. !seg_pahcur) list. Remove it
		 * from there. Per bucket lock we are holding makes sure
		 * seg_pinsert() can't sneak in and add pcp entries to this
		 * bucket right before we remove the bucket from its list.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If asynchronous thread is not running we can remove the bucket from
	 * not currently active list. The bucket must be on this list since we
	 * already checked that it's not on the other list and the bucket from
	 * which we just deleted the last pcp entry must be still on one of the
	 * active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}

/*
 * Check if bucket pointed by hp already has a pcp entry that matches request
 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
 * Also delete matching entries that cover smaller address range but start
 * at the same address as addr argument. Return the list of deleted entries if
 * any. This is an internal helper function called from seg_pinsert() only
 * for non wired shadow lists. The caller already holds a per seg/amp list
 * lock.
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
	struct seg_pcache *pcp;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));

	*found = 0;
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
			ASSERT(!IS_PCP_WIRED(pcp));
			if (pcp->p_len < len) {
				pcache_link_t *plinkp;
				/* active entries can't be unlinked; skip */
				if (pcp->p_active) {
					continue;
				}
				/*
				 * Unlink the smaller, inactive duplicate from
				 * both the per seg/amp list and the hash
				 * chain, then thread it onto the return list
				 * through its (now unused) p_hprev pointer.
				 */
				plinkp = &pcp->p_plink;
				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			} else {
				*found = 1;
				break;
			}
		}
	}
	return (delcallb_list);
}

/*
 * lookup an address range in pagelock cache. Return shadow list and bump up
 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
 * as a lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	void *htag0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	/*
	 * Skip pagelock cache, while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisabled) {
		return (NULL);
	}
	ASSERT(seg_phashsize_win != 0);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH(pcp, htag0, addr, len)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			/*
			 * If this request wants to write pages
			 * but write permissions starting from
			 * addr don't cover the entire length len
			 * return lookup failure back to the caller.
			 * It will check protections and fail this
			 * pagelock operation with EACCES error.
			 */
			if (rw == S_WRITE && pcp->p_wlen < len) {
				break;
			}
			/* don't let the active count wrap around */
			if (pcp->p_active == UINT_MAX) {
				break;
			}
			pcp->p_active++;
			/* remember that this entry was handed out for write */
			if (rw == S_WRITE && !pcp->p_write) {
				pcp->p_write = 1;
			}
			mutex_exit(&hp->p_hmutex);
			return (pcp->p_pp);
		}
	}
	mutex_exit(&hp->p_hmutex);
	return (NULL);
}

/*
 * mark address range inactive. If the cache is off or the address range is
 * not in the cache or another shadow list that covers bigger range is found
 * we call the segment driver to reclaim the pages. Otherwise just decrement
 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
 * otherwise use seg as a lookup tag.
5237c478bd9Sstevel@tonic-gate */ 5247c478bd9Sstevel@tonic-gate void 525*a98e9dbfSaguzovsk seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr, 526*a98e9dbfSaguzovsk size_t len, struct page **pp, enum seg_rw rw, uint_t flags, 527*a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback) 5287c478bd9Sstevel@tonic-gate { 5297c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 5307c478bd9Sstevel@tonic-gate struct seg_phash *hp; 531*a98e9dbfSaguzovsk kmutex_t *pmtx = NULL; 532*a98e9dbfSaguzovsk pcache_link_t *pheadp; 533*a98e9dbfSaguzovsk void *htag0; 534*a98e9dbfSaguzovsk pgcnt_t npages = 0; 535*a98e9dbfSaguzovsk int keep = 0; 5367c478bd9Sstevel@tonic-gate 537*a98e9dbfSaguzovsk ASSERT(seg != NULL); 538*a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE); 539*a98e9dbfSaguzovsk 540*a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp); 541*a98e9dbfSaguzovsk 542*a98e9dbfSaguzovsk /* 543*a98e9dbfSaguzovsk * Skip lookup if pcache is not configured. 544*a98e9dbfSaguzovsk */ 545*a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 546*a98e9dbfSaguzovsk goto out; 547*a98e9dbfSaguzovsk } 548*a98e9dbfSaguzovsk 549*a98e9dbfSaguzovsk /* 550*a98e9dbfSaguzovsk * Grab per seg/amp lock before hash lock if we are going to remove 551*a98e9dbfSaguzovsk * inactive entry from pcache. 
552*a98e9dbfSaguzovsk */ 553*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) { 554*a98e9dbfSaguzovsk if (amp == NULL) { 555*a98e9dbfSaguzovsk pheadp = &seg->s_phead; 556*a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 557*a98e9dbfSaguzovsk } else { 558*a98e9dbfSaguzovsk pheadp = &->a_phead; 559*a98e9dbfSaguzovsk pmtx = &->a_pmtx; 560*a98e9dbfSaguzovsk } 561*a98e9dbfSaguzovsk mutex_enter(pmtx); 5627c478bd9Sstevel@tonic-gate } 563*a98e9dbfSaguzovsk 564*a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, addr, flags); 5657c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex); 566*a98e9dbfSaguzovsk again: 5677c478bd9Sstevel@tonic-gate for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 5687c478bd9Sstevel@tonic-gate pcp = pcp->p_hnext) { 569*a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 570*a98e9dbfSaguzovsk if (P_MATCH_PP(pcp, htag0, addr, len, pp)) { 571*a98e9dbfSaguzovsk ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp)); 572*a98e9dbfSaguzovsk ASSERT(pcp->p_active); 573*a98e9dbfSaguzovsk if (keep) { 574*a98e9dbfSaguzovsk /* 575*a98e9dbfSaguzovsk * Don't remove this pcp entry 576*a98e9dbfSaguzovsk * if we didn't find duplicate 577*a98e9dbfSaguzovsk * shadow lists on second search. 578*a98e9dbfSaguzovsk * Somebody removed those duplicates 579*a98e9dbfSaguzovsk * since we dropped hash lock after first 580*a98e9dbfSaguzovsk * search. 581*a98e9dbfSaguzovsk */ 582*a98e9dbfSaguzovsk ASSERT(pmtx != NULL); 583*a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags)); 584*a98e9dbfSaguzovsk mutex_exit(pmtx); 585*a98e9dbfSaguzovsk pmtx = NULL; 586*a98e9dbfSaguzovsk } 5877c478bd9Sstevel@tonic-gate pcp->p_active--; 588*a98e9dbfSaguzovsk if (pcp->p_active == 0 && (pmtx != NULL || 589*a98e9dbfSaguzovsk (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) { 590*a98e9dbfSaguzovsk 591*a98e9dbfSaguzovsk /* 592*a98e9dbfSaguzovsk * This entry is no longer active. 
Remove it 593*a98e9dbfSaguzovsk * now either because pcaching is temporarily 594*a98e9dbfSaguzovsk * disabled or there're other pcp entries that 595*a98e9dbfSaguzovsk * can match this pagelock request (i.e. this 596*a98e9dbfSaguzovsk * entry is a duplicate). 597*a98e9dbfSaguzovsk */ 5987c478bd9Sstevel@tonic-gate 5997c478bd9Sstevel@tonic-gate ASSERT(callback == pcp->p_callback); 600*a98e9dbfSaguzovsk if (pmtx != NULL) { 601*a98e9dbfSaguzovsk pcache_link_t *plinkp = &pcp->p_plink; 602*a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 603*a98e9dbfSaguzovsk ASSERT(pheadp->p_lnext != pheadp); 604*a98e9dbfSaguzovsk ASSERT(pheadp->p_lprev != pheadp); 605*a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = 606*a98e9dbfSaguzovsk plinkp->p_lnext; 607*a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = 608*a98e9dbfSaguzovsk plinkp->p_lprev; 609*a98e9dbfSaguzovsk } 6107c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 6117c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 612*a98e9dbfSaguzovsk if (!IS_PCP_WIRED(pcp) && 613*a98e9dbfSaguzovsk hp->p_hnext == (struct seg_pcache *)hp) { 614*a98e9dbfSaguzovsk /* 615*a98e9dbfSaguzovsk * We removed the last entry from this 616*a98e9dbfSaguzovsk * bucket. Now remove the bucket from 617*a98e9dbfSaguzovsk * its active list. 
618*a98e9dbfSaguzovsk */ 619*a98e9dbfSaguzovsk seg_premove_abuck(hp, 0); 620*a98e9dbfSaguzovsk } 6217c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 622*a98e9dbfSaguzovsk if (pmtx != NULL) { 623*a98e9dbfSaguzovsk mutex_exit(pmtx); 6247c478bd9Sstevel@tonic-gate } 625*a98e9dbfSaguzovsk len = pcp->p_len; 626*a98e9dbfSaguzovsk npages = btop(len); 627*a98e9dbfSaguzovsk if (rw != S_WRITE && pcp->p_write) { 628*a98e9dbfSaguzovsk rw = S_WRITE; 629*a98e9dbfSaguzovsk } 630*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 6317c478bd9Sstevel@tonic-gate goto out; 632*a98e9dbfSaguzovsk } else { 633*a98e9dbfSaguzovsk /* 634*a98e9dbfSaguzovsk * We found a matching pcp entry but will not 635*a98e9dbfSaguzovsk * free it right away even if it's no longer 636*a98e9dbfSaguzovsk * active. 637*a98e9dbfSaguzovsk */ 638*a98e9dbfSaguzovsk if (!pcp->p_active && !IS_PCP_WIRED(pcp)) { 639*a98e9dbfSaguzovsk /* 640*a98e9dbfSaguzovsk * Set the reference bit and mark the 641*a98e9dbfSaguzovsk * time of last access to this pcp 642*a98e9dbfSaguzovsk * so that asynchronous thread doesn't 643*a98e9dbfSaguzovsk * free it immediately since 644*a98e9dbfSaguzovsk * it may be reactivated very soon. 645*a98e9dbfSaguzovsk */ 646*a98e9dbfSaguzovsk pcp->p_lbolt = lbolt; 647*a98e9dbfSaguzovsk pcp->p_ref = 1; 648*a98e9dbfSaguzovsk } 649*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 650*a98e9dbfSaguzovsk if (pmtx != NULL) { 651*a98e9dbfSaguzovsk mutex_exit(pmtx); 652*a98e9dbfSaguzovsk } 653*a98e9dbfSaguzovsk return; 654*a98e9dbfSaguzovsk } 655*a98e9dbfSaguzovsk } else if (!IS_PFLAGS_WIRED(flags) && 656*a98e9dbfSaguzovsk P_MATCH(pcp, htag0, addr, len)) { 657*a98e9dbfSaguzovsk /* 658*a98e9dbfSaguzovsk * This is a duplicate pcp entry. This situation may 659*a98e9dbfSaguzovsk * happen if a bigger shadow list that covers our 660*a98e9dbfSaguzovsk * range was added while our entry was still active. 661*a98e9dbfSaguzovsk * Now we can free our pcp entry if it becomes 662*a98e9dbfSaguzovsk * inactive. 
663*a98e9dbfSaguzovsk */ 664*a98e9dbfSaguzovsk if (!pcp->p_active) { 665*a98e9dbfSaguzovsk /* 666*a98e9dbfSaguzovsk * Mark this entry as referenced just in case 667*a98e9dbfSaguzovsk * we'll free our own pcp entry soon. 668*a98e9dbfSaguzovsk */ 669*a98e9dbfSaguzovsk pcp->p_lbolt = lbolt; 670*a98e9dbfSaguzovsk pcp->p_ref = 1; 671*a98e9dbfSaguzovsk } 672*a98e9dbfSaguzovsk if (pmtx != NULL) { 673*a98e9dbfSaguzovsk /* 674*a98e9dbfSaguzovsk * we are already holding pmtx and found a 675*a98e9dbfSaguzovsk * duplicate. Don't keep our own pcp entry. 676*a98e9dbfSaguzovsk */ 677*a98e9dbfSaguzovsk keep = 0; 678*a98e9dbfSaguzovsk continue; 679*a98e9dbfSaguzovsk } 680*a98e9dbfSaguzovsk /* 681*a98e9dbfSaguzovsk * We have to use mutex_tryenter to attempt to lock 682*a98e9dbfSaguzovsk * seg/amp list lock since we already hold hash lock 683*a98e9dbfSaguzovsk * and seg/amp list lock is above hash lock in lock 684*a98e9dbfSaguzovsk * order. If mutex_tryenter fails drop hash lock and 685*a98e9dbfSaguzovsk * retake both locks in correct order and research 686*a98e9dbfSaguzovsk * this hash chain. 687*a98e9dbfSaguzovsk */ 688*a98e9dbfSaguzovsk ASSERT(keep == 0); 689*a98e9dbfSaguzovsk if (amp == NULL) { 690*a98e9dbfSaguzovsk pheadp = &seg->s_phead; 691*a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 692*a98e9dbfSaguzovsk } else { 693*a98e9dbfSaguzovsk pheadp = &->a_phead; 694*a98e9dbfSaguzovsk pmtx = &->a_pmtx; 695*a98e9dbfSaguzovsk } 696*a98e9dbfSaguzovsk if (!mutex_tryenter(pmtx)) { 697*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 698*a98e9dbfSaguzovsk mutex_enter(pmtx); 699*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 700*a98e9dbfSaguzovsk /* 701*a98e9dbfSaguzovsk * If we don't find bigger shadow list on 702*a98e9dbfSaguzovsk * second search (it may happen since we 703*a98e9dbfSaguzovsk * dropped bucket lock) keep the entry that 704*a98e9dbfSaguzovsk * matches our own shadow list. 
705*a98e9dbfSaguzovsk */ 706*a98e9dbfSaguzovsk keep = 1; 707*a98e9dbfSaguzovsk goto again; 7087c478bd9Sstevel@tonic-gate } 7097c478bd9Sstevel@tonic-gate } 7107c478bd9Sstevel@tonic-gate } 7117c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 712*a98e9dbfSaguzovsk if (pmtx != NULL) { 713*a98e9dbfSaguzovsk mutex_exit(pmtx); 714*a98e9dbfSaguzovsk } 7157c478bd9Sstevel@tonic-gate out: 716*a98e9dbfSaguzovsk (*callback)(htag0, addr, len, pp, rw, 0); 717*a98e9dbfSaguzovsk if (npages) { 718*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 719*a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 720*a98e9dbfSaguzovsk seg_plocked -= npages; 721*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 722*a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 723*a98e9dbfSaguzovsk seg_plocked_window -= npages; 724*a98e9dbfSaguzovsk } 725*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 726*a98e9dbfSaguzovsk } 727*a98e9dbfSaguzovsk 7287c478bd9Sstevel@tonic-gate } 7297c478bd9Sstevel@tonic-gate 730*a98e9dbfSaguzovsk #ifdef DEBUG 731*a98e9dbfSaguzovsk static uint32_t p_insert_chk_mtbf = 0; 732*a98e9dbfSaguzovsk #endif 733*a98e9dbfSaguzovsk 7347c478bd9Sstevel@tonic-gate /* 7357c478bd9Sstevel@tonic-gate * The seg_pinsert_check() is used by segment drivers to predict whether 7367c478bd9Sstevel@tonic-gate * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. 
7377c478bd9Sstevel@tonic-gate */ 738*a98e9dbfSaguzovsk /*ARGSUSED*/ 7397c478bd9Sstevel@tonic-gate int 740*a98e9dbfSaguzovsk seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr, 741*a98e9dbfSaguzovsk size_t len, uint_t flags) 7427c478bd9Sstevel@tonic-gate { 743*a98e9dbfSaguzovsk ASSERT(seg != NULL); 7447c478bd9Sstevel@tonic-gate 745*a98e9dbfSaguzovsk #ifdef DEBUG 746*a98e9dbfSaguzovsk if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) { 7477c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7487c478bd9Sstevel@tonic-gate } 749*a98e9dbfSaguzovsk #endif 750*a98e9dbfSaguzovsk 751*a98e9dbfSaguzovsk if (seg_pdisabled) { 7527c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7537c478bd9Sstevel@tonic-gate } 754*a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 755*a98e9dbfSaguzovsk 756*a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) { 757*a98e9dbfSaguzovsk return (SEGP_SUCCESS); 758*a98e9dbfSaguzovsk } 759*a98e9dbfSaguzovsk 760*a98e9dbfSaguzovsk if (seg_plocked_window + btop(len) > seg_pmaxwindow) { 7617c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 7627c478bd9Sstevel@tonic-gate } 7637c478bd9Sstevel@tonic-gate 764*a98e9dbfSaguzovsk if (freemem < desfree) { 765*a98e9dbfSaguzovsk return (SEGP_FAIL); 7667c478bd9Sstevel@tonic-gate } 767*a98e9dbfSaguzovsk 7687c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS); 7697c478bd9Sstevel@tonic-gate } 7707c478bd9Sstevel@tonic-gate 771*a98e9dbfSaguzovsk #ifdef DEBUG 772*a98e9dbfSaguzovsk static uint32_t p_insert_mtbf = 0; 773*a98e9dbfSaguzovsk #endif 7747c478bd9Sstevel@tonic-gate 7757c478bd9Sstevel@tonic-gate /* 776*a98e9dbfSaguzovsk * Insert address range with shadow list into pagelock cache if there's no 777*a98e9dbfSaguzovsk * shadow list already cached for this address range. If the cache is off or 778*a98e9dbfSaguzovsk * caching is temporarily disabled or the allowed 'window' is exceeded return 779*a98e9dbfSaguzovsk * SEGP_FAIL. Otherwise return SEGP_SUCCESS. 
780*a98e9dbfSaguzovsk * 781*a98e9dbfSaguzovsk * For non wired shadow lists (segvn case) include address in the hashing 782*a98e9dbfSaguzovsk * function to avoid linking all the entries from the same segment or amp on 783*a98e9dbfSaguzovsk * the same bucket. amp is used instead of seg if amp is not NULL. Non wired 784*a98e9dbfSaguzovsk * pcache entries are also linked on a per segment/amp list so that all 785*a98e9dbfSaguzovsk * entries can be found quickly during seg/amp purge without walking the 786*a98e9dbfSaguzovsk * entire pcache hash table. For wired shadow lists (segspt case) we 787*a98e9dbfSaguzovsk * don't use address hashing and per segment linking because the caller 788*a98e9dbfSaguzovsk * currently inserts only one entry per segment that covers the entire 789*a98e9dbfSaguzovsk * segment. If we used per segment linking even for segspt it would complicate 790*a98e9dbfSaguzovsk * seg_ppurge_wiredpp() locking. 791*a98e9dbfSaguzovsk * 792*a98e9dbfSaguzovsk * Both hash bucket and per seg/amp locks need to be held before adding a non 793*a98e9dbfSaguzovsk * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken 794*a98e9dbfSaguzovsk * first. 795*a98e9dbfSaguzovsk * 796*a98e9dbfSaguzovsk * This function will also remove from pcache old inactive shadow lists that 797*a98e9dbfSaguzovsk * overlap with this request but cover smaller range for the same start 798*a98e9dbfSaguzovsk * address. 
7997c478bd9Sstevel@tonic-gate */ 8007c478bd9Sstevel@tonic-gate int 801*a98e9dbfSaguzovsk seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len, 802*a98e9dbfSaguzovsk size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags, 803*a98e9dbfSaguzovsk seg_preclaim_cbfunc_t callback) 8047c478bd9Sstevel@tonic-gate { 8057c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 8067c478bd9Sstevel@tonic-gate struct seg_phash *hp; 8077c478bd9Sstevel@tonic-gate pgcnt_t npages; 808*a98e9dbfSaguzovsk pcache_link_t *pheadp; 809*a98e9dbfSaguzovsk kmutex_t *pmtx; 810*a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL; 8117c478bd9Sstevel@tonic-gate 812*a98e9dbfSaguzovsk ASSERT(seg != NULL); 813*a98e9dbfSaguzovsk ASSERT(rw == S_READ || rw == S_WRITE); 814*a98e9dbfSaguzovsk ASSERT(rw == S_READ || wlen == len); 815*a98e9dbfSaguzovsk ASSERT(rw == S_WRITE || wlen <= len); 816*a98e9dbfSaguzovsk ASSERT(amp == NULL || wlen == len); 817*a98e9dbfSaguzovsk 818*a98e9dbfSaguzovsk #ifdef DEBUG 819*a98e9dbfSaguzovsk if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) { 8207c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8217c478bd9Sstevel@tonic-gate } 822*a98e9dbfSaguzovsk #endif 823*a98e9dbfSaguzovsk 824*a98e9dbfSaguzovsk if (seg_pdisabled) { 8257c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8267c478bd9Sstevel@tonic-gate } 827*a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 828*a98e9dbfSaguzovsk 8297c478bd9Sstevel@tonic-gate ASSERT((len & PAGEOFFSET) == 0); 830*a98e9dbfSaguzovsk npages = btop(len); 831*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 832*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 833*a98e9dbfSaguzovsk if (seg_plocked_window + npages > seg_pmaxwindow) { 834*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 8357c478bd9Sstevel@tonic-gate return (SEGP_FAIL); 8367c478bd9Sstevel@tonic-gate } 837*a98e9dbfSaguzovsk seg_plocked_window += npages; 8387c478bd9Sstevel@tonic-gate } 8397c478bd9Sstevel@tonic-gate seg_plocked += npages; 840*a98e9dbfSaguzovsk 
mutex_exit(&seg_pmem_mtx); 8417c478bd9Sstevel@tonic-gate 842*a98e9dbfSaguzovsk pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP); 843*a98e9dbfSaguzovsk /* 844*a98e9dbfSaguzovsk * If amp is not NULL set htag0 to amp otherwise set it to seg. 845*a98e9dbfSaguzovsk */ 846*a98e9dbfSaguzovsk if (amp == NULL) { 847*a98e9dbfSaguzovsk pcp->p_htag0 = (void *)seg; 848*a98e9dbfSaguzovsk pcp->p_flags = flags & 0xffff; 849*a98e9dbfSaguzovsk } else { 850*a98e9dbfSaguzovsk pcp->p_htag0 = (void *)amp; 851*a98e9dbfSaguzovsk pcp->p_flags = (flags & 0xffff) | SEGP_AMP; 852*a98e9dbfSaguzovsk } 8537c478bd9Sstevel@tonic-gate pcp->p_addr = addr; 8547c478bd9Sstevel@tonic-gate pcp->p_len = len; 855*a98e9dbfSaguzovsk pcp->p_wlen = wlen; 8567c478bd9Sstevel@tonic-gate pcp->p_pp = pp; 857*a98e9dbfSaguzovsk pcp->p_write = (rw == S_WRITE); 8587c478bd9Sstevel@tonic-gate pcp->p_callback = callback; 8597c478bd9Sstevel@tonic-gate pcp->p_active = 1; 8607c478bd9Sstevel@tonic-gate 861*a98e9dbfSaguzovsk hp = P_HASHBP(seg, pcp->p_htag0, addr, flags); 862*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 863*a98e9dbfSaguzovsk int found; 864*a98e9dbfSaguzovsk void *htag0; 865*a98e9dbfSaguzovsk if (amp == NULL) { 866*a98e9dbfSaguzovsk pheadp = &seg->s_phead; 867*a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 868*a98e9dbfSaguzovsk htag0 = (void *)seg; 869*a98e9dbfSaguzovsk } else { 870*a98e9dbfSaguzovsk pheadp = &->a_phead; 871*a98e9dbfSaguzovsk pmtx = &->a_pmtx; 872*a98e9dbfSaguzovsk htag0 = (void *)amp; 873*a98e9dbfSaguzovsk } 874*a98e9dbfSaguzovsk mutex_enter(pmtx); 875*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 876*a98e9dbfSaguzovsk delcallb_list = seg_plookup_checkdup(hp, htag0, addr, 877*a98e9dbfSaguzovsk len, &found); 878*a98e9dbfSaguzovsk if (found) { 879*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 880*a98e9dbfSaguzovsk mutex_exit(pmtx); 881*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 882*a98e9dbfSaguzovsk seg_plocked -= npages; 883*a98e9dbfSaguzovsk seg_plocked_window -= npages; 884*a98e9dbfSaguzovsk 
mutex_exit(&seg_pmem_mtx); 885*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 886*a98e9dbfSaguzovsk goto out; 887*a98e9dbfSaguzovsk } 888*a98e9dbfSaguzovsk pcp->p_plink.p_lnext = pheadp->p_lnext; 889*a98e9dbfSaguzovsk pcp->p_plink.p_lprev = pheadp; 890*a98e9dbfSaguzovsk pheadp->p_lnext->p_lprev = &pcp->p_plink; 891*a98e9dbfSaguzovsk pheadp->p_lnext = &pcp->p_plink; 892*a98e9dbfSaguzovsk } else { 893*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 894*a98e9dbfSaguzovsk } 895*a98e9dbfSaguzovsk pcp->p_hashp = hp; 8967c478bd9Sstevel@tonic-gate pcp->p_hnext = hp->p_hnext; 8977c478bd9Sstevel@tonic-gate pcp->p_hprev = (struct seg_pcache *)hp; 8987c478bd9Sstevel@tonic-gate hp->p_hnext->p_hprev = pcp; 8997c478bd9Sstevel@tonic-gate hp->p_hnext = pcp; 900*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags) && 901*a98e9dbfSaguzovsk hp->p_hprev == pcp) { 902*a98e9dbfSaguzovsk seg_padd_abuck(hp); 903*a98e9dbfSaguzovsk } 9047c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 905*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 906*a98e9dbfSaguzovsk mutex_exit(pmtx); 907*a98e9dbfSaguzovsk } 908*a98e9dbfSaguzovsk 909*a98e9dbfSaguzovsk out: 910*a98e9dbfSaguzovsk npages = 0; 911*a98e9dbfSaguzovsk while (delcallb_list != NULL) { 912*a98e9dbfSaguzovsk pcp = delcallb_list; 913*a98e9dbfSaguzovsk delcallb_list = pcp->p_hprev; 914*a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active); 915*a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 916*a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 0); 917*a98e9dbfSaguzovsk npages += btop(pcp->p_len); 918*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 919*a98e9dbfSaguzovsk } 920*a98e9dbfSaguzovsk if (npages) { 921*a98e9dbfSaguzovsk ASSERT(!IS_PFLAGS_WIRED(flags)); 922*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 923*a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 924*a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 925*a98e9dbfSaguzovsk seg_plocked -= npages; 926*a98e9dbfSaguzovsk seg_plocked_window -= npages; 927*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 928*a98e9dbfSaguzovsk } 929*a98e9dbfSaguzovsk 9307c478bd9Sstevel@tonic-gate return (SEGP_SUCCESS); 9317c478bd9Sstevel@tonic-gate } 9327c478bd9Sstevel@tonic-gate 9337c478bd9Sstevel@tonic-gate /* 934*a98e9dbfSaguzovsk * purge entries from the pagelock cache if not active 935*a98e9dbfSaguzovsk * and not recently used. 9367c478bd9Sstevel@tonic-gate */ 9377c478bd9Sstevel@tonic-gate static void 938*a98e9dbfSaguzovsk seg_ppurge_async(int force) 9397c478bd9Sstevel@tonic-gate { 9407c478bd9Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL; 9417c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 9427c478bd9Sstevel@tonic-gate struct seg_phash *hp; 9437c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 9447c478bd9Sstevel@tonic-gate pgcnt_t npages_window = 0; 945*a98e9dbfSaguzovsk pgcnt_t npgs_to_purge; 946*a98e9dbfSaguzovsk pgcnt_t npgs_purged = 0; 947*a98e9dbfSaguzovsk int hlinks = 0; 948*a98e9dbfSaguzovsk int hlix; 949*a98e9dbfSaguzovsk pcache_link_t *hlinkp; 950*a98e9dbfSaguzovsk pcache_link_t *hlnextp = NULL; 951*a98e9dbfSaguzovsk int lowmem; 952*a98e9dbfSaguzovsk int trim; 953*a98e9dbfSaguzovsk 954*a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 9557c478bd9Sstevel@tonic-gate 9567c478bd9Sstevel@tonic-gate /* 957*a98e9dbfSaguzovsk * if the cache is off or empty, return 9587c478bd9Sstevel@tonic-gate */ 959*a98e9dbfSaguzovsk if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) { 9607c478bd9Sstevel@tonic-gate return; 
9617c478bd9Sstevel@tonic-gate } 9627c478bd9Sstevel@tonic-gate 963*a98e9dbfSaguzovsk if (!force) { 964*a98e9dbfSaguzovsk lowmem = 0; 965*a98e9dbfSaguzovsk trim = 0; 966*a98e9dbfSaguzovsk if (freemem < lotsfree + needfree) { 967*a98e9dbfSaguzovsk spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0); 968*a98e9dbfSaguzovsk if (fmem <= 5 * (desfree >> 2)) { 969*a98e9dbfSaguzovsk lowmem = 1; 970*a98e9dbfSaguzovsk } else if (fmem <= 7 * (lotsfree >> 3)) { 971*a98e9dbfSaguzovsk if (seg_plocked_window >= 972*a98e9dbfSaguzovsk (availrmem_initial >> 1)) { 973*a98e9dbfSaguzovsk lowmem = 1; 974*a98e9dbfSaguzovsk } 975*a98e9dbfSaguzovsk } else if (fmem < lotsfree) { 976*a98e9dbfSaguzovsk if (seg_plocked_window >= 977*a98e9dbfSaguzovsk 3 * (availrmem_initial >> 2)) { 978*a98e9dbfSaguzovsk lowmem = 1; 979*a98e9dbfSaguzovsk } 980*a98e9dbfSaguzovsk } 981*a98e9dbfSaguzovsk } 982*a98e9dbfSaguzovsk if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) { 983*a98e9dbfSaguzovsk trim = 1; 984*a98e9dbfSaguzovsk } 985*a98e9dbfSaguzovsk if (!lowmem && !trim) { 986*a98e9dbfSaguzovsk return; 987*a98e9dbfSaguzovsk } 988*a98e9dbfSaguzovsk npgs_to_purge = seg_plocked_window >> 989*a98e9dbfSaguzovsk seg_pshrink_shift; 990*a98e9dbfSaguzovsk if (lowmem) { 991*a98e9dbfSaguzovsk npgs_to_purge = MIN(npgs_to_purge, 992*a98e9dbfSaguzovsk MAX(seg_pmaxapurge_npages, desfree)); 993*a98e9dbfSaguzovsk } else { 994*a98e9dbfSaguzovsk npgs_to_purge = MIN(npgs_to_purge, 995*a98e9dbfSaguzovsk seg_pmaxapurge_npages); 996*a98e9dbfSaguzovsk } 997*a98e9dbfSaguzovsk if (npgs_to_purge == 0) { 998*a98e9dbfSaguzovsk return; 999*a98e9dbfSaguzovsk } 1000*a98e9dbfSaguzovsk } else { 1001*a98e9dbfSaguzovsk struct seg_phash_wired *hpw; 10027c478bd9Sstevel@tonic-gate 1003*a98e9dbfSaguzovsk ASSERT(seg_phashsize_wired != 0); 10047c478bd9Sstevel@tonic-gate 1005*a98e9dbfSaguzovsk for (hpw = seg_phashtab_wired; 1006*a98e9dbfSaguzovsk hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) { 1007*a98e9dbfSaguzovsk 
1008*a98e9dbfSaguzovsk if (hpw->p_hnext == (struct seg_pcache *)hpw) { 1009*a98e9dbfSaguzovsk continue; 1010*a98e9dbfSaguzovsk } 1011*a98e9dbfSaguzovsk 1012*a98e9dbfSaguzovsk mutex_enter(&hpw->p_hmutex); 1013*a98e9dbfSaguzovsk 1014*a98e9dbfSaguzovsk for (pcp = hpw->p_hnext; 1015*a98e9dbfSaguzovsk pcp != (struct seg_pcache *)hpw; 1016*a98e9dbfSaguzovsk pcp = pcp->p_hnext) { 1017*a98e9dbfSaguzovsk 1018*a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1019*a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == 1020*a98e9dbfSaguzovsk (struct seg_phash *)hpw); 1021*a98e9dbfSaguzovsk 1022*a98e9dbfSaguzovsk if (pcp->p_active) { 1023*a98e9dbfSaguzovsk continue; 10247c478bd9Sstevel@tonic-gate } 1025*a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1026*a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1027*a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1028*a98e9dbfSaguzovsk delcallb_list = pcp; 1029*a98e9dbfSaguzovsk } 1030*a98e9dbfSaguzovsk mutex_exit(&hpw->p_hmutex); 1031*a98e9dbfSaguzovsk } 1032*a98e9dbfSaguzovsk } 1033*a98e9dbfSaguzovsk 1034*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1035*a98e9dbfSaguzovsk if (seg_pathr_on) { 1036*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1037*a98e9dbfSaguzovsk goto runcb; 1038*a98e9dbfSaguzovsk } 1039*a98e9dbfSaguzovsk seg_pathr_on = 1; 1040*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1041*a98e9dbfSaguzovsk ASSERT(seg_pahcur <= 1); 1042*a98e9dbfSaguzovsk hlix = !seg_pahcur; 1043*a98e9dbfSaguzovsk 1044*a98e9dbfSaguzovsk again: 1045*a98e9dbfSaguzovsk for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix]; 1046*a98e9dbfSaguzovsk hlinkp = hlnextp) { 1047*a98e9dbfSaguzovsk 1048*a98e9dbfSaguzovsk hlnextp = hlinkp->p_lnext; 1049*a98e9dbfSaguzovsk ASSERT(hlnextp != NULL); 1050*a98e9dbfSaguzovsk 1051*a98e9dbfSaguzovsk hp = hlink2phash(hlinkp, hlix); 1052*a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1053*a98e9dbfSaguzovsk seg_pathr_empty_ahb++; 1054*a98e9dbfSaguzovsk continue; 1055*a98e9dbfSaguzovsk } 
1056*a98e9dbfSaguzovsk seg_pathr_full_ahb++; 1057*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 1058*a98e9dbfSaguzovsk 1059*a98e9dbfSaguzovsk for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; 1060*a98e9dbfSaguzovsk pcp = pcp->p_hnext) { 1061*a98e9dbfSaguzovsk pcache_link_t *pheadp; 1062*a98e9dbfSaguzovsk pcache_link_t *plinkp; 1063*a98e9dbfSaguzovsk void *htag0; 1064*a98e9dbfSaguzovsk kmutex_t *pmtx; 1065*a98e9dbfSaguzovsk 1066*a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 1067*a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 1068*a98e9dbfSaguzovsk 1069*a98e9dbfSaguzovsk if (pcp->p_active) { 1070*a98e9dbfSaguzovsk continue; 1071*a98e9dbfSaguzovsk } 1072*a98e9dbfSaguzovsk if (!force && pcp->p_ref && 1073*a98e9dbfSaguzovsk PCP_AGE(pcp) < seg_pmax_pcpage) { 10747c478bd9Sstevel@tonic-gate pcp->p_ref = 0; 1075*a98e9dbfSaguzovsk continue; 10767c478bd9Sstevel@tonic-gate } 1077*a98e9dbfSaguzovsk plinkp = &pcp->p_plink; 1078*a98e9dbfSaguzovsk htag0 = pcp->p_htag0; 1079*a98e9dbfSaguzovsk if (pcp->p_flags & SEGP_AMP) { 1080*a98e9dbfSaguzovsk pheadp = &((amp_t *)htag0)->a_phead; 1081*a98e9dbfSaguzovsk pmtx = &((amp_t *)htag0)->a_pmtx; 1082*a98e9dbfSaguzovsk } else { 1083*a98e9dbfSaguzovsk pheadp = &((seg_t *)htag0)->s_phead; 1084*a98e9dbfSaguzovsk pmtx = &((seg_t *)htag0)->s_pmtx; 1085*a98e9dbfSaguzovsk } 1086*a98e9dbfSaguzovsk if (!mutex_tryenter(pmtx)) { 1087*a98e9dbfSaguzovsk continue; 1088*a98e9dbfSaguzovsk } 1089*a98e9dbfSaguzovsk ASSERT(pheadp->p_lnext != pheadp); 1090*a98e9dbfSaguzovsk ASSERT(pheadp->p_lprev != pheadp); 1091*a98e9dbfSaguzovsk plinkp->p_lprev->p_lnext = 1092*a98e9dbfSaguzovsk plinkp->p_lnext; 1093*a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = 1094*a98e9dbfSaguzovsk plinkp->p_lprev; 1095*a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1096*a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1097*a98e9dbfSaguzovsk mutex_exit(pmtx); 1098*a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1099*a98e9dbfSaguzovsk delcallb_list = pcp; 
1100*a98e9dbfSaguzovsk npgs_purged += btop(pcp->p_len); 1101*a98e9dbfSaguzovsk } 1102*a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1103*a98e9dbfSaguzovsk seg_premove_abuck(hp, 1); 11047c478bd9Sstevel@tonic-gate } 11057c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 1106*a98e9dbfSaguzovsk if (npgs_purged >= seg_plocked_window) { 11077c478bd9Sstevel@tonic-gate break; 1108*a98e9dbfSaguzovsk } 1109*a98e9dbfSaguzovsk if (!force) { 1110*a98e9dbfSaguzovsk if (npgs_purged >= npgs_to_purge) { 1111*a98e9dbfSaguzovsk break; 1112*a98e9dbfSaguzovsk } 1113*a98e9dbfSaguzovsk if (!trim && !(seg_pathr_full_ahb & 15)) { 1114*a98e9dbfSaguzovsk ASSERT(lowmem); 1115*a98e9dbfSaguzovsk if (freemem >= lotsfree + needfree) { 1116*a98e9dbfSaguzovsk break; 1117*a98e9dbfSaguzovsk } 1118*a98e9dbfSaguzovsk } 1119*a98e9dbfSaguzovsk } 11207c478bd9Sstevel@tonic-gate } 11217c478bd9Sstevel@tonic-gate 1122*a98e9dbfSaguzovsk if (hlinkp == &seg_pahhead[hlix]) { 1123*a98e9dbfSaguzovsk /* 1124*a98e9dbfSaguzovsk * We processed the entire hlix active bucket list 1125*a98e9dbfSaguzovsk * but didn't find enough pages to reclaim. 1126*a98e9dbfSaguzovsk * Switch the lists and walk the other list 1127*a98e9dbfSaguzovsk * if we haven't done it yet. 
1128*a98e9dbfSaguzovsk */ 1129*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1130*a98e9dbfSaguzovsk ASSERT(seg_pathr_on); 1131*a98e9dbfSaguzovsk ASSERT(seg_pahcur == !hlix); 1132*a98e9dbfSaguzovsk seg_pahcur = hlix; 1133*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1134*a98e9dbfSaguzovsk if (++hlinks < 2) { 1135*a98e9dbfSaguzovsk hlix = !hlix; 1136*a98e9dbfSaguzovsk goto again; 1137*a98e9dbfSaguzovsk } 1138*a98e9dbfSaguzovsk } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] && 1139*a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext != hlinkp) { 1140*a98e9dbfSaguzovsk ASSERT(hlinkp != NULL); 1141*a98e9dbfSaguzovsk ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]); 1142*a98e9dbfSaguzovsk ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]); 1143*a98e9dbfSaguzovsk ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]); 1144*a98e9dbfSaguzovsk 1145*a98e9dbfSaguzovsk /* 1146*a98e9dbfSaguzovsk * Reinsert the header to point to hlinkp 1147*a98e9dbfSaguzovsk * so that we start from hlinkp bucket next time around. 1148*a98e9dbfSaguzovsk */ 1149*a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev; 1150*a98e9dbfSaguzovsk seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext; 1151*a98e9dbfSaguzovsk seg_pahhead[hlix].p_lnext = hlinkp; 1152*a98e9dbfSaguzovsk seg_pahhead[hlix].p_lprev = hlinkp->p_lprev; 1153*a98e9dbfSaguzovsk hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix]; 1154*a98e9dbfSaguzovsk hlinkp->p_lprev = &seg_pahhead[hlix]; 1155*a98e9dbfSaguzovsk } 1156*a98e9dbfSaguzovsk 1157*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1158*a98e9dbfSaguzovsk ASSERT(seg_pathr_on); 1159*a98e9dbfSaguzovsk seg_pathr_on = 0; 1160*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 1161*a98e9dbfSaguzovsk 1162*a98e9dbfSaguzovsk runcb: 11637c478bd9Sstevel@tonic-gate /* 1164*a98e9dbfSaguzovsk * Run the delayed callback list. segments/amps can't go away until 1165*a98e9dbfSaguzovsk * callback is executed since they must have non 0 softlockcnt. 
That's 1166*a98e9dbfSaguzovsk * why we don't need to hold as/seg/amp locks to execute the callback. 11677c478bd9Sstevel@tonic-gate */ 11687c478bd9Sstevel@tonic-gate while (delcallb_list != NULL) { 11697c478bd9Sstevel@tonic-gate pcp = delcallb_list; 11707c478bd9Sstevel@tonic-gate delcallb_list = pcp->p_hprev; 1171*a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1172*a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1173*a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1); 1174*a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1175*a98e9dbfSaguzovsk if (!IS_PCP_WIRED(pcp)) { 1176*a98e9dbfSaguzovsk npages_window += btop(pcp->p_len); 11777c478bd9Sstevel@tonic-gate } 1178*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 1179*a98e9dbfSaguzovsk } 1180*a98e9dbfSaguzovsk if (npages) { 1181*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1182*a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 1183*a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages_window); 1184*a98e9dbfSaguzovsk seg_plocked -= npages; 1185*a98e9dbfSaguzovsk seg_plocked_window -= npages_window; 1186*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 11877c478bd9Sstevel@tonic-gate } 11887c478bd9Sstevel@tonic-gate } 11897c478bd9Sstevel@tonic-gate 11907c478bd9Sstevel@tonic-gate /* 1191*a98e9dbfSaguzovsk * Remove cached pages for segment(s) entries from hashtable. The segments 1192*a98e9dbfSaguzovsk * are identified by pp array. This is useful for multiple seg's cached on 1193*a98e9dbfSaguzovsk * behalf of dummy segment (ISM/DISM) with common pp array. 
11947c478bd9Sstevel@tonic-gate */ 11957c478bd9Sstevel@tonic-gate void 1196*a98e9dbfSaguzovsk seg_ppurge_wiredpp(struct page **pp) 11977c478bd9Sstevel@tonic-gate { 1198*a98e9dbfSaguzovsk struct seg_pcache *pcp; 1199*a98e9dbfSaguzovsk struct seg_phash_wired *hp; 12007c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 1201*a98e9dbfSaguzovsk struct seg_pcache *delcallb_list = NULL; 12027c478bd9Sstevel@tonic-gate 12037c478bd9Sstevel@tonic-gate /* 1204*a98e9dbfSaguzovsk * if the cache is empty, return 12057c478bd9Sstevel@tonic-gate */ 1206*a98e9dbfSaguzovsk if (seg_plocked == 0) { 12077c478bd9Sstevel@tonic-gate return; 12087c478bd9Sstevel@tonic-gate } 1209*a98e9dbfSaguzovsk ASSERT(seg_phashsize_wired != 0); 12107c478bd9Sstevel@tonic-gate 1211*a98e9dbfSaguzovsk for (hp = seg_phashtab_wired; 1212*a98e9dbfSaguzovsk hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) { 1213*a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1214*a98e9dbfSaguzovsk continue; 1215*a98e9dbfSaguzovsk } 12167c478bd9Sstevel@tonic-gate mutex_enter(&hp->p_hmutex); 12177c478bd9Sstevel@tonic-gate pcp = hp->p_hnext; 12187c478bd9Sstevel@tonic-gate while (pcp != (struct seg_pcache *)hp) { 1219*a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == (struct seg_phash *)hp); 1220*a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 12217c478bd9Sstevel@tonic-gate /* 12227c478bd9Sstevel@tonic-gate * purge entries which are not active 12237c478bd9Sstevel@tonic-gate */ 1224*a98e9dbfSaguzovsk if (!pcp->p_active && pcp->p_pp == pp) { 1225*a98e9dbfSaguzovsk ASSERT(pcp->p_htag0 != NULL); 12267c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 12277c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 1228*a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1229*a98e9dbfSaguzovsk delcallb_list = pcp; 12307c478bd9Sstevel@tonic-gate } 1231*a98e9dbfSaguzovsk pcp = pcp->p_hnext; 12327c478bd9Sstevel@tonic-gate } 12337c478bd9Sstevel@tonic-gate mutex_exit(&hp->p_hmutex); 1234*a98e9dbfSaguzovsk /* 
1235*a98e9dbfSaguzovsk * segments can't go away until callback is executed since 1236*a98e9dbfSaguzovsk * they must have non 0 softlockcnt. That's why we don't 1237*a98e9dbfSaguzovsk * need to hold as/seg locks to execute the callback. 1238*a98e9dbfSaguzovsk */ 1239*a98e9dbfSaguzovsk while (delcallb_list != NULL) { 1240*a98e9dbfSaguzovsk int done; 1241*a98e9dbfSaguzovsk pcp = delcallb_list; 1242*a98e9dbfSaguzovsk delcallb_list = pcp->p_hprev; 1243*a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1244*a98e9dbfSaguzovsk done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, 1245*a98e9dbfSaguzovsk pcp->p_len, pcp->p_pp, 1246*a98e9dbfSaguzovsk pcp->p_write ? S_WRITE : S_READ, 1); 1247*a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1248*a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1249*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 1250*a98e9dbfSaguzovsk if (done) { 1251*a98e9dbfSaguzovsk ASSERT(delcallb_list == NULL); 1252*a98e9dbfSaguzovsk goto out; 1253*a98e9dbfSaguzovsk } 1254*a98e9dbfSaguzovsk } 12557c478bd9Sstevel@tonic-gate } 12567c478bd9Sstevel@tonic-gate 1257*a98e9dbfSaguzovsk out: 1258*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1259*a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 12607c478bd9Sstevel@tonic-gate seg_plocked -= npages; 1261*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 12627c478bd9Sstevel@tonic-gate } 12637c478bd9Sstevel@tonic-gate 12647c478bd9Sstevel@tonic-gate /* 12657c478bd9Sstevel@tonic-gate * purge all entries for a given segment. Since we 12667c478bd9Sstevel@tonic-gate * callback into the segment driver directly for page 12677c478bd9Sstevel@tonic-gate * reclaim the caller needs to hold the right locks. 
12687c478bd9Sstevel@tonic-gate */ 12697c478bd9Sstevel@tonic-gate void 1270*a98e9dbfSaguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags) 12717c478bd9Sstevel@tonic-gate { 12727c478bd9Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL; 12737c478bd9Sstevel@tonic-gate struct seg_pcache *pcp; 12747c478bd9Sstevel@tonic-gate struct seg_phash *hp; 12757c478bd9Sstevel@tonic-gate pgcnt_t npages = 0; 1276*a98e9dbfSaguzovsk void *htag0; 12777c478bd9Sstevel@tonic-gate 1278*a98e9dbfSaguzovsk if (seg_plocked == 0) { 12797c478bd9Sstevel@tonic-gate return; 12807c478bd9Sstevel@tonic-gate } 1281*a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 1282*a98e9dbfSaguzovsk 1283*a98e9dbfSaguzovsk /* 1284*a98e9dbfSaguzovsk * If amp is not NULL use amp as a lookup tag otherwise use seg 1285*a98e9dbfSaguzovsk * as a lookup tag. 1286*a98e9dbfSaguzovsk */ 1287*a98e9dbfSaguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp); 1288*a98e9dbfSaguzovsk ASSERT(htag0 != NULL); 1289*a98e9dbfSaguzovsk if (IS_PFLAGS_WIRED(flags)) { 1290*a98e9dbfSaguzovsk hp = P_HASHBP(seg, htag0, 0, flags); 1291*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 1292*a98e9dbfSaguzovsk pcp = hp->p_hnext; 1293*a98e9dbfSaguzovsk while (pcp != (struct seg_pcache *)hp) { 1294*a98e9dbfSaguzovsk ASSERT(pcp->p_hashp == hp); 1295*a98e9dbfSaguzovsk ASSERT(IS_PCP_WIRED(pcp)); 1296*a98e9dbfSaguzovsk if (pcp->p_htag0 == htag0) { 1297*a98e9dbfSaguzovsk if (pcp->p_active) { 1298*a98e9dbfSaguzovsk break; 1299*a98e9dbfSaguzovsk } 1300*a98e9dbfSaguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext; 1301*a98e9dbfSaguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev; 1302*a98e9dbfSaguzovsk pcp->p_hprev = delcallb_list; 1303*a98e9dbfSaguzovsk delcallb_list = pcp; 1304*a98e9dbfSaguzovsk } 1305*a98e9dbfSaguzovsk pcp = pcp->p_hnext; 1306*a98e9dbfSaguzovsk } 1307*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 1308*a98e9dbfSaguzovsk } else { 1309*a98e9dbfSaguzovsk pcache_link_t *plinkp; 1310*a98e9dbfSaguzovsk pcache_link_t *pheadp; 
1311*a98e9dbfSaguzovsk kmutex_t *pmtx; 1312*a98e9dbfSaguzovsk 1313*a98e9dbfSaguzovsk if (amp == NULL) { 1314*a98e9dbfSaguzovsk ASSERT(seg != NULL); 1315*a98e9dbfSaguzovsk pheadp = &seg->s_phead; 1316*a98e9dbfSaguzovsk pmtx = &seg->s_pmtx; 1317*a98e9dbfSaguzovsk } else { 1318*a98e9dbfSaguzovsk pheadp = &->a_phead; 1319*a98e9dbfSaguzovsk pmtx = &->a_pmtx; 1320*a98e9dbfSaguzovsk } 1321*a98e9dbfSaguzovsk mutex_enter(pmtx); 1322*a98e9dbfSaguzovsk while ((plinkp = pheadp->p_lnext) != pheadp) { 1323*a98e9dbfSaguzovsk pcp = plink2pcache(plinkp); 1324*a98e9dbfSaguzovsk ASSERT(!IS_PCP_WIRED(pcp)); 1325*a98e9dbfSaguzovsk ASSERT(pcp->p_htag0 == htag0); 1326*a98e9dbfSaguzovsk hp = pcp->p_hashp; 1327*a98e9dbfSaguzovsk mutex_enter(&hp->p_hmutex); 13287c478bd9Sstevel@tonic-gate if (pcp->p_active) { 1329*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 13307c478bd9Sstevel@tonic-gate break; 13317c478bd9Sstevel@tonic-gate } 1332*a98e9dbfSaguzovsk ASSERT(plinkp->p_lprev == pheadp); 1333*a98e9dbfSaguzovsk pheadp->p_lnext = plinkp->p_lnext; 1334*a98e9dbfSaguzovsk plinkp->p_lnext->p_lprev = pheadp; 13357c478bd9Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 13367c478bd9Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 13377c478bd9Sstevel@tonic-gate pcp->p_hprev = delcallb_list; 13387c478bd9Sstevel@tonic-gate delcallb_list = pcp; 1339*a98e9dbfSaguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 1340*a98e9dbfSaguzovsk seg_premove_abuck(hp, 0); 1341*a98e9dbfSaguzovsk } 1342*a98e9dbfSaguzovsk mutex_exit(&hp->p_hmutex); 13437c478bd9Sstevel@tonic-gate } 1344*a98e9dbfSaguzovsk mutex_exit(pmtx); 13457c478bd9Sstevel@tonic-gate } 13467c478bd9Sstevel@tonic-gate while (delcallb_list != NULL) { 13477c478bd9Sstevel@tonic-gate pcp = delcallb_list; 13487c478bd9Sstevel@tonic-gate delcallb_list = pcp->p_hprev; 1349*a98e9dbfSaguzovsk ASSERT(!pcp->p_active); 1350*a98e9dbfSaguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, 1351*a98e9dbfSaguzovsk pcp->p_pp, pcp->p_write ? 
S_WRITE : S_READ, 0); 1352*a98e9dbfSaguzovsk npages += btop(pcp->p_len); 1353*a98e9dbfSaguzovsk kmem_cache_free(seg_pkmcache, pcp); 13547c478bd9Sstevel@tonic-gate } 1355*a98e9dbfSaguzovsk mutex_enter(&seg_pmem_mtx); 1356*a98e9dbfSaguzovsk ASSERT(seg_plocked >= npages); 13577c478bd9Sstevel@tonic-gate seg_plocked -= npages; 1358*a98e9dbfSaguzovsk if (!IS_PFLAGS_WIRED(flags)) { 1359*a98e9dbfSaguzovsk ASSERT(seg_plocked_window >= npages); 1360*a98e9dbfSaguzovsk seg_plocked_window -= npages; 1361*a98e9dbfSaguzovsk } 1362*a98e9dbfSaguzovsk mutex_exit(&seg_pmem_mtx); 13637c478bd9Sstevel@tonic-gate } 13647c478bd9Sstevel@tonic-gate 13657c478bd9Sstevel@tonic-gate static void seg_pinit_mem_config(void); 13667c478bd9Sstevel@tonic-gate 13677c478bd9Sstevel@tonic-gate /* 13687c478bd9Sstevel@tonic-gate * setup the pagelock cache 13697c478bd9Sstevel@tonic-gate */ 13707c478bd9Sstevel@tonic-gate static void 13717c478bd9Sstevel@tonic-gate seg_pinit(void) 13727c478bd9Sstevel@tonic-gate { 13737c478bd9Sstevel@tonic-gate struct seg_phash *hp; 1374*a98e9dbfSaguzovsk ulong_t i; 1375*a98e9dbfSaguzovsk pgcnt_t physmegs; 13767c478bd9Sstevel@tonic-gate 1377*a98e9dbfSaguzovsk seg_plocked = 0; 1378*a98e9dbfSaguzovsk seg_plocked_window = 0; 13797c478bd9Sstevel@tonic-gate 1380*a98e9dbfSaguzovsk if (segpcache_enabled == 0) { 1381*a98e9dbfSaguzovsk seg_phashsize_win = 0; 1382*a98e9dbfSaguzovsk seg_phashsize_wired = 0; 1383*a98e9dbfSaguzovsk seg_pdisabled = 1; 1384*a98e9dbfSaguzovsk return; 1385*a98e9dbfSaguzovsk } 13867c478bd9Sstevel@tonic-gate 1387*a98e9dbfSaguzovsk seg_pdisabled = 0; 1388*a98e9dbfSaguzovsk seg_pkmcache = kmem_cache_create("seg_pcache", 1389*a98e9dbfSaguzovsk sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); 1390*a98e9dbfSaguzovsk if (segpcache_pcp_maxage_ticks <= 0) { 1391*a98e9dbfSaguzovsk segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; 1392*a98e9dbfSaguzovsk } 1393*a98e9dbfSaguzovsk seg_pmax_pcpage = segpcache_pcp_maxage_ticks; 1394*a98e9dbfSaguzovsk 
seg_pathr_empty_ahb = 0; 1395*a98e9dbfSaguzovsk seg_pathr_full_ahb = 0; 1396*a98e9dbfSaguzovsk seg_pshrink_shift = segpcache_shrink_shift; 1397*a98e9dbfSaguzovsk seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); 13987c478bd9Sstevel@tonic-gate 1399*a98e9dbfSaguzovsk mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); 1400*a98e9dbfSaguzovsk mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); 1401*a98e9dbfSaguzovsk mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); 1402*a98e9dbfSaguzovsk cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); 1403*a98e9dbfSaguzovsk 1404*a98e9dbfSaguzovsk physmegs = physmem >> (20 - PAGESHIFT); 1405*a98e9dbfSaguzovsk 1406*a98e9dbfSaguzovsk /* 1407*a98e9dbfSaguzovsk * If segpcache_hashsize_win was not set in /etc/system or it has 1408*a98e9dbfSaguzovsk * absurd value set it to a default. 1409*a98e9dbfSaguzovsk */ 1410*a98e9dbfSaguzovsk if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { 1411*a98e9dbfSaguzovsk /* 1412*a98e9dbfSaguzovsk * Create one bucket per 32K (or at least per 8 pages) of 1413*a98e9dbfSaguzovsk * available memory. 
1414*a98e9dbfSaguzovsk */ 1415*a98e9dbfSaguzovsk pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); 1416*a98e9dbfSaguzovsk segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); 1417*a98e9dbfSaguzovsk } 1418*a98e9dbfSaguzovsk if (!ISP2(segpcache_hashsize_win)) { 1419*a98e9dbfSaguzovsk ulong_t rndfac = ~(1UL << 1420*a98e9dbfSaguzovsk (highbit(segpcache_hashsize_win) - 1)); 1421*a98e9dbfSaguzovsk rndfac &= segpcache_hashsize_win; 1422*a98e9dbfSaguzovsk segpcache_hashsize_win += rndfac; 1423*a98e9dbfSaguzovsk segpcache_hashsize_win = 1 << 1424*a98e9dbfSaguzovsk (highbit(segpcache_hashsize_win) - 1); 1425*a98e9dbfSaguzovsk } 1426*a98e9dbfSaguzovsk seg_phashsize_win = segpcache_hashsize_win; 1427*a98e9dbfSaguzovsk seg_phashtab_win = kmem_zalloc( 1428*a98e9dbfSaguzovsk seg_phashsize_win * sizeof (struct seg_phash), 1429*a98e9dbfSaguzovsk KM_SLEEP); 1430*a98e9dbfSaguzovsk for (i = 0; i < seg_phashsize_win; i++) { 1431*a98e9dbfSaguzovsk hp = &seg_phashtab_win[i]; 1432*a98e9dbfSaguzovsk hp->p_hnext = (struct seg_pcache *)hp; 1433*a98e9dbfSaguzovsk hp->p_hprev = (struct seg_pcache *)hp; 1434*a98e9dbfSaguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1435*a98e9dbfSaguzovsk } 1436*a98e9dbfSaguzovsk 1437*a98e9dbfSaguzovsk seg_pahcur = 0; 1438*a98e9dbfSaguzovsk seg_pathr_on = 0; 1439*a98e9dbfSaguzovsk seg_pahhead[0].p_lnext = &seg_pahhead[0]; 1440*a98e9dbfSaguzovsk seg_pahhead[0].p_lprev = &seg_pahhead[0]; 1441*a98e9dbfSaguzovsk seg_pahhead[1].p_lnext = &seg_pahhead[1]; 1442*a98e9dbfSaguzovsk seg_pahhead[1].p_lprev = &seg_pahhead[1]; 1443*a98e9dbfSaguzovsk 1444*a98e9dbfSaguzovsk /* 1445*a98e9dbfSaguzovsk * If segpcache_hashsize_wired was not set in /etc/system or it has 1446*a98e9dbfSaguzovsk * absurd value set it to a default. 
1447*a98e9dbfSaguzovsk */ 1448*a98e9dbfSaguzovsk if (segpcache_hashsize_wired == 0 || 1449*a98e9dbfSaguzovsk segpcache_hashsize_wired > physmem / 4) { 1450*a98e9dbfSaguzovsk /* 1451*a98e9dbfSaguzovsk * Choose segpcache_hashsize_wired based on physmem. 1452*a98e9dbfSaguzovsk * Create a bucket per 128K bytes upto 256K buckets. 1453*a98e9dbfSaguzovsk */ 1454*a98e9dbfSaguzovsk if (physmegs < 20 * 1024) { 1455*a98e9dbfSaguzovsk segpcache_hashsize_wired = MAX(1024, physmegs << 3); 1456*a98e9dbfSaguzovsk } else { 1457*a98e9dbfSaguzovsk segpcache_hashsize_wired = 256 * 1024; 14587c478bd9Sstevel@tonic-gate } 14597c478bd9Sstevel@tonic-gate } 1460*a98e9dbfSaguzovsk if (!ISP2(segpcache_hashsize_wired)) { 1461*a98e9dbfSaguzovsk segpcache_hashsize_wired = 1 << 1462*a98e9dbfSaguzovsk highbit(segpcache_hashsize_wired); 1463*a98e9dbfSaguzovsk } 1464*a98e9dbfSaguzovsk seg_phashsize_wired = segpcache_hashsize_wired; 1465*a98e9dbfSaguzovsk seg_phashtab_wired = kmem_zalloc( 1466*a98e9dbfSaguzovsk seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 1467*a98e9dbfSaguzovsk for (i = 0; i < seg_phashsize_wired; i++) { 1468*a98e9dbfSaguzovsk hp = (struct seg_phash *)&seg_phashtab_wired[i]; 1469*a98e9dbfSaguzovsk hp->p_hnext = (struct seg_pcache *)hp; 1470*a98e9dbfSaguzovsk hp->p_hprev = (struct seg_pcache *)hp; 1471*a98e9dbfSaguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1472*a98e9dbfSaguzovsk } 14737c478bd9Sstevel@tonic-gate 1474*a98e9dbfSaguzovsk if (segpcache_maxwindow == 0) { 1475*a98e9dbfSaguzovsk if (physmegs < 64) { 1476*a98e9dbfSaguzovsk /* 3% of memory */ 1477*a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 5; 1478*a98e9dbfSaguzovsk } else if (physmegs < 512) { 1479*a98e9dbfSaguzovsk /* 12% of memory */ 1480*a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 3; 1481*a98e9dbfSaguzovsk } else if (physmegs < 1024) { 1482*a98e9dbfSaguzovsk /* 25% of memory */ 1483*a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 2; 1484*a98e9dbfSaguzovsk } else 
if (physmegs < 2048) { 1485*a98e9dbfSaguzovsk /* 50% of memory */ 1486*a98e9dbfSaguzovsk segpcache_maxwindow = availrmem >> 1; 1487*a98e9dbfSaguzovsk } else { 1488*a98e9dbfSaguzovsk /* no limit */ 1489*a98e9dbfSaguzovsk segpcache_maxwindow = (pgcnt_t)-1; 1490*a98e9dbfSaguzovsk } 1491*a98e9dbfSaguzovsk } 1492*a98e9dbfSaguzovsk seg_pmaxwindow = segpcache_maxwindow; 14937c478bd9Sstevel@tonic-gate seg_pinit_mem_config(); 14947c478bd9Sstevel@tonic-gate } 14957c478bd9Sstevel@tonic-gate 14967c478bd9Sstevel@tonic-gate /* 14977c478bd9Sstevel@tonic-gate * called by pageout if memory is low 14987c478bd9Sstevel@tonic-gate */ 14997c478bd9Sstevel@tonic-gate void 15007c478bd9Sstevel@tonic-gate seg_preap(void) 15017c478bd9Sstevel@tonic-gate { 15027c478bd9Sstevel@tonic-gate /* 1503*a98e9dbfSaguzovsk * if the cache is off or empty, return 15047c478bd9Sstevel@tonic-gate */ 1505*a98e9dbfSaguzovsk if (seg_plocked_window == 0) { 15067c478bd9Sstevel@tonic-gate return; 15077c478bd9Sstevel@tonic-gate } 1508*a98e9dbfSaguzovsk ASSERT(seg_phashsize_win != 0); 15097c478bd9Sstevel@tonic-gate 1510*a98e9dbfSaguzovsk /* 1511*a98e9dbfSaguzovsk * If somebody is already purging pcache 1512*a98e9dbfSaguzovsk * just return. 
1513*a98e9dbfSaguzovsk */ 1514*a98e9dbfSaguzovsk if (seg_pdisabled) { 1515*a98e9dbfSaguzovsk return; 1516*a98e9dbfSaguzovsk } 1517*a98e9dbfSaguzovsk 1518*a98e9dbfSaguzovsk cv_signal(&seg_pasync_cv); 1519*a98e9dbfSaguzovsk } 15207c478bd9Sstevel@tonic-gate 15217c478bd9Sstevel@tonic-gate /* 15227c478bd9Sstevel@tonic-gate * run as a backgroud thread and reclaim pagelock 15237c478bd9Sstevel@tonic-gate * pages which have not been used recently 15247c478bd9Sstevel@tonic-gate */ 15257c478bd9Sstevel@tonic-gate void 15267c478bd9Sstevel@tonic-gate seg_pasync_thread(void) 15277c478bd9Sstevel@tonic-gate { 15287c478bd9Sstevel@tonic-gate callb_cpr_t cpr_info; 15297c478bd9Sstevel@tonic-gate 1530*a98e9dbfSaguzovsk if (seg_phashsize_win == 0) { 1531*a98e9dbfSaguzovsk thread_exit(); 1532*a98e9dbfSaguzovsk /*NOTREACHED*/ 1533*a98e9dbfSaguzovsk } 1534*a98e9dbfSaguzovsk 1535*a98e9dbfSaguzovsk seg_pasync_thr = curthread; 15367c478bd9Sstevel@tonic-gate 1537*a98e9dbfSaguzovsk CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, 1538*a98e9dbfSaguzovsk callb_generic_cpr, "seg_pasync"); 15397c478bd9Sstevel@tonic-gate 1540*a98e9dbfSaguzovsk if (segpcache_reap_ticks <= 0) { 1541*a98e9dbfSaguzovsk segpcache_reap_ticks = segpcache_reap_sec * hz; 15427c478bd9Sstevel@tonic-gate } 15437c478bd9Sstevel@tonic-gate 1544*a98e9dbfSaguzovsk mutex_enter(&seg_pasync_mtx); 15457c478bd9Sstevel@tonic-gate for (;;) { 15467c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_BEGIN(&cpr_info); 1547*a98e9dbfSaguzovsk (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx, 1548*a98e9dbfSaguzovsk lbolt + segpcache_reap_ticks); 1549*a98e9dbfSaguzovsk CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); 1550*a98e9dbfSaguzovsk if (seg_pdisabled == 0) { 1551*a98e9dbfSaguzovsk seg_ppurge_async(0); 1552*a98e9dbfSaguzovsk } 15537c478bd9Sstevel@tonic-gate } 15547c478bd9Sstevel@tonic-gate } 15557c478bd9Sstevel@tonic-gate 15567c478bd9Sstevel@tonic-gate static struct kmem_cache *seg_cache; 15577c478bd9Sstevel@tonic-gate 15587c478bd9Sstevel@tonic-gate /* 
15597c478bd9Sstevel@tonic-gate * Initialize segment management data structures. 15607c478bd9Sstevel@tonic-gate */ 15617c478bd9Sstevel@tonic-gate void 15627c478bd9Sstevel@tonic-gate seg_init(void) 15637c478bd9Sstevel@tonic-gate { 15647c478bd9Sstevel@tonic-gate kstat_t *ksp; 15657c478bd9Sstevel@tonic-gate 1566*a98e9dbfSaguzovsk seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 1567*a98e9dbfSaguzovsk 0, NULL, NULL, NULL, NULL, NULL, 0); 15687c478bd9Sstevel@tonic-gate 15697c478bd9Sstevel@tonic-gate ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, 1570c6f08383Sjj segadvstat_ndata, KSTAT_FLAG_VIRTUAL); 15717c478bd9Sstevel@tonic-gate if (ksp) { 15727c478bd9Sstevel@tonic-gate ksp->ks_data = (void *)segadvstat_ptr; 15737c478bd9Sstevel@tonic-gate kstat_install(ksp); 15747c478bd9Sstevel@tonic-gate } 15757c478bd9Sstevel@tonic-gate 15767c478bd9Sstevel@tonic-gate seg_pinit(); 15777c478bd9Sstevel@tonic-gate } 15787c478bd9Sstevel@tonic-gate 15797c478bd9Sstevel@tonic-gate /* 15807c478bd9Sstevel@tonic-gate * Allocate a segment to cover [base, base+size] 15817c478bd9Sstevel@tonic-gate * and attach it to the specified address space. 
 */
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
	struct seg *new;
	caddr_t segbase;
	size_t segsize;

	/* page-align: round base down and the end of the range up */
	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
	    (uintptr_t)segbase;

	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
		return ((struct seg *)NULL);	/* bad virtual addr range */

	/* user address spaces are further constrained by a_userlimit */
	if (as != &kas &&
	    valid_usr_range(segbase, segsize, 0, as,
	    as->a_userlimit) != RANGE_OKAY)
		return ((struct seg *)NULL);	/* bad virtual addr range */

	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
	new->s_ops = NULL;
	new->s_data = NULL;
	new->s_szc = 0;
	new->s_flags = 0;
	/* initialize the per-segment pagelock-cache list as empty */
	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
	new->s_phead.p_lnext = &new->s_phead;
	new->s_phead.p_lprev = &new->s_phead;
	if (seg_attach(as, segbase, segsize, new) < 0) {
		/* overlap with an existing segment: undo the allocation */
		kmem_cache_free(seg_cache, new);
		return ((struct seg *)NULL);
	}
	/* caller must fill in ops, data */
	return (new);
}

/*
 * Attach a segment to the address space.  Used by seg_alloc()
 * and for kernel startup to attach to static segments.
 */
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
	seg->s_as = as;
	seg->s_base = base;
	seg->s_size = size;

	/*
	 * as_addseg() will add the segment at the appropriate point
	 * in the list.  It will return -1 if there is overlap with
	 * an already existing segment.
	 */
	return (as_addseg(as, seg));
}

/*
 * Unmap a segment and free it from its associated address space.
 * This should be called by anybody who's finished with a whole segment's
 * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
 * responsibility of the segment driver to unlink the segment
 * from the address space, and to free public and private data structures
 * associated with the segment.  (This is typically done by a call to
 * seg_free()).
 */
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
	int ret;
#endif /* DEBUG */

	/* caller must hold the address space lock as writer */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/* Shouldn't have called seg_unmap if mapping isn't yet established */
	ASSERT(seg->s_data != NULL);

	/* Unmap the whole mapping */
#ifdef DEBUG
	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
	ASSERT(ret == 0);
#else
	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}

/*
 * Free the segment from its associated as.  This should only be called
 * if a mapping to the segment has not yet been established (e.g., if
 * an error occurs in the middle of doing an as_map when the segment
 * has already been partially set up) or if it has already been deleted
 * (e.g., from a segment driver unmap routine if the unmap applies to the
 * entire segment).  If the mapping is currently set up then seg_unmap()
 * should be called instead.
16757c478bd9Sstevel@tonic-gate */ 16767c478bd9Sstevel@tonic-gate void 16777c478bd9Sstevel@tonic-gate seg_free(struct seg *seg) 16787c478bd9Sstevel@tonic-gate { 16797c478bd9Sstevel@tonic-gate register struct as *as = seg->s_as; 16807c478bd9Sstevel@tonic-gate struct seg *tseg = as_removeseg(as, seg); 16817c478bd9Sstevel@tonic-gate 16827c478bd9Sstevel@tonic-gate ASSERT(tseg == seg); 16837c478bd9Sstevel@tonic-gate 16847c478bd9Sstevel@tonic-gate /* 16857c478bd9Sstevel@tonic-gate * If the segment private data field is NULL, 16867c478bd9Sstevel@tonic-gate * then segment driver is not attached yet. 16877c478bd9Sstevel@tonic-gate */ 16887c478bd9Sstevel@tonic-gate if (seg->s_data != NULL) 16897c478bd9Sstevel@tonic-gate SEGOP_FREE(seg); 16907c478bd9Sstevel@tonic-gate 1691*a98e9dbfSaguzovsk mutex_destroy(&seg->s_pmtx); 1692*a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 1693*a98e9dbfSaguzovsk ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 16947c478bd9Sstevel@tonic-gate kmem_cache_free(seg_cache, seg); 16957c478bd9Sstevel@tonic-gate } 16967c478bd9Sstevel@tonic-gate 16977c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 16987c478bd9Sstevel@tonic-gate static void 16997c478bd9Sstevel@tonic-gate seg_p_mem_config_post_add( 17007c478bd9Sstevel@tonic-gate void *arg, 17017c478bd9Sstevel@tonic-gate pgcnt_t delta_pages) 17027c478bd9Sstevel@tonic-gate { 17037c478bd9Sstevel@tonic-gate /* Nothing to do. */ 17047c478bd9Sstevel@tonic-gate } 17057c478bd9Sstevel@tonic-gate 1706cee1d74bSjfrank void 1707cee1d74bSjfrank seg_p_enable(void) 1708cee1d74bSjfrank { 1709*a98e9dbfSaguzovsk mutex_enter(&seg_pcache_mtx); 1710*a98e9dbfSaguzovsk ASSERT(seg_pdisabled != 0); 1711*a98e9dbfSaguzovsk seg_pdisabled--; 1712*a98e9dbfSaguzovsk mutex_exit(&seg_pcache_mtx); 1713cee1d74bSjfrank } 1714cee1d74bSjfrank 17157c478bd9Sstevel@tonic-gate /* 1716cee1d74bSjfrank * seg_p_disable - disables seg_pcache, and then attempts to empty the 1717cee1d74bSjfrank * cache. 
 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
 * SEGP_FAIL if the cache could not be emptied.
 */
int
seg_p_disable(void)
{
	pgcnt_t old_plocked;
	int stall_count = 0;

	/*
	 * Bump seg_pdisabled so no new entries are cached while we try
	 * to drain the existing ones.  Balanced by seg_p_enable().
	 */
	mutex_enter(&seg_pcache_mtx);
	seg_pdisabled++;
	ASSERT(seg_pdisabled != 0);
	mutex_exit(&seg_pcache_mtx);

	/*
	 * Attempt to empty the cache. Terminate if seg_plocked does not
	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
	 */
	while (seg_plocked != 0) {
		ASSERT(seg_phashsize_win != 0);
		old_plocked = seg_plocked;
		seg_ppurge_async(1);
		if (seg_plocked == old_plocked) {
			if (stall_count++ > SEGP_STALL_THRESHOLD) {
				return (SEGP_FAIL);
			}
		} else
			stall_count = 0;
		/* back off briefly so active entries can be released */
		if (seg_plocked != 0)
			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
	}
	return (SEGP_SUCCESS);
}

/*
 * Attempt to purge seg_pcache.  May need to return before this has
 * completed to allow other pre_del callbacks to unlock pages.
 * This is ok because:
 * 1) The seg_pdisabled flag has been set so at least we won't
 *	cache anymore locks and the locks we couldn't purge
 *	will not be held if they do get released by a subsequent
 *	pre-delete callback.
 *
 * 2) The rest of the memory delete thread processing does not
 *	depend on the changes made in this pre-delete callback. No
 *	panics will result, the worst that will happen is that the
 *	DR code will timeout and cancel the delete.
 */
/*ARGSUSED*/
static int
seg_p_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	/* pcache disabled at boot: nothing to drain */
	if (seg_phashsize_win == 0) {
		return (0);
	}
	if (seg_p_disable() != SEGP_SUCCESS)
		cmn_err(CE_NOTE,
		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
	return (0);
}

/*
 * Memory-delete DR callback: re-enable the pagelock cache whether or
 * not the delete was cancelled (undoes the pre_del disable).
 */
/*ARGSUSED*/
static void
seg_p_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	if (seg_phashsize_win == 0) {
		return;
	}
	seg_p_enable();
}

/* DR (memory add/delete) callback vector for the pagelock cache */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};

static void
seg_pinit_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
	/*
	 * Want to catch this in the debug kernel. At run time, if the
	 * callbacks don't get run all will be OK as the disable just makes
	 * it more likely that the pages can be collected.
	 */
	ASSERT(ret == 0);
}

/*
 * Verify that segment is not a shared anonymous segment which reserves
 * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be
 * transferred from one zone to another if any segments are shared.  This
 * is because the last process to exit will credit the swap reservation.
 * This could lead to the swap being reserved by one zone, and credited
 * to another.
18210209230bSgjelinek */ 18220209230bSgjelinek boolean_t 18230209230bSgjelinek seg_can_change_zones(struct seg *seg) 18240209230bSgjelinek { 18250209230bSgjelinek struct segvn_data *svd; 18260209230bSgjelinek 18270209230bSgjelinek if (seg->s_ops == &segspt_shmops) 18280209230bSgjelinek return (B_FALSE); 18290209230bSgjelinek 18300209230bSgjelinek if (seg->s_ops == &segvn_ops) { 18310209230bSgjelinek svd = (struct segvn_data *)seg->s_data; 18320209230bSgjelinek if (svd->type == MAP_SHARED && 18330209230bSgjelinek svd->amp != NULL && 18340209230bSgjelinek svd->amp->swresv > 0) 18350209230bSgjelinek return (B_FALSE); 18360209230bSgjelinek } 18370209230bSgjelinek return (B_TRUE); 18380209230bSgjelinek } 18390209230bSgjelinek 18400209230bSgjelinek /* 18410209230bSgjelinek * Return swap reserved by a segment backing a private mapping. 18420209230bSgjelinek */ 18430209230bSgjelinek size_t 18440209230bSgjelinek seg_swresv(struct seg *seg) 18450209230bSgjelinek { 18460209230bSgjelinek struct segvn_data *svd; 18470209230bSgjelinek size_t swap = 0; 18480209230bSgjelinek 18490209230bSgjelinek if (seg->s_ops == &segvn_ops) { 18500209230bSgjelinek svd = (struct segvn_data *)seg->s_data; 18510209230bSgjelinek if (svd->type == MAP_PRIVATE && svd->swresv > 0) 18520209230bSgjelinek swap = svd->swresv; 18530209230bSgjelinek } 18540209230bSgjelinek return (swap); 18550209230bSgjelinek } 1856