17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 50b5aa17bSmec * Common Development and Distribution License (the "License"). 60b5aa17bSmec * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 22*cb15d5d9SPeter Rival * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 237c478bd9Sstevel@tonic-gate */ 247c478bd9Sstevel@tonic-gate 257c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 267c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 277c478bd9Sstevel@tonic-gate 287c478bd9Sstevel@tonic-gate /* 297c478bd9Sstevel@tonic-gate * Portions of this source code were derived from Berkeley 4.3 BSD 307c478bd9Sstevel@tonic-gate * under license from the Regents of the University of California. 
317c478bd9Sstevel@tonic-gate */ 327c478bd9Sstevel@tonic-gate 337c478bd9Sstevel@tonic-gate 347c478bd9Sstevel@tonic-gate /* 357c478bd9Sstevel@tonic-gate * This file contains common functions to access and manage the page lists. 367c478bd9Sstevel@tonic-gate * Many of these routines originated from platform dependent modules 377c478bd9Sstevel@tonic-gate * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 387c478bd9Sstevel@tonic-gate * a platform independent manner. 397c478bd9Sstevel@tonic-gate * 407c478bd9Sstevel@tonic-gate * vm/vm_dep.h provides for platform specific support. 417c478bd9Sstevel@tonic-gate */ 427c478bd9Sstevel@tonic-gate 437c478bd9Sstevel@tonic-gate #include <sys/types.h> 447c478bd9Sstevel@tonic-gate #include <sys/debug.h> 457c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 467c478bd9Sstevel@tonic-gate #include <sys/systm.h> 477c478bd9Sstevel@tonic-gate #include <sys/atomic.h> 487c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 497c478bd9Sstevel@tonic-gate #include <vm/as.h> 507c478bd9Sstevel@tonic-gate #include <vm/page.h> 517c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 527c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h> 5378b03d3aSkchow #include <sys/vmsystm.h> 547c478bd9Sstevel@tonic-gate #include <sys/memnode.h> 557c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h> 567c478bd9Sstevel@tonic-gate #include <sys/lgrp.h> 577c478bd9Sstevel@tonic-gate #include <sys/mem_config.h> 587c478bd9Sstevel@tonic-gate #include <sys/callb.h> 597c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h> 607c478bd9Sstevel@tonic-gate #include <sys/sdt.h> 61ca3e8d88SDave Plauger #include <sys/dumphdr.h> 62*cb15d5d9SPeter Rival #include <sys/swap.h> 637c478bd9Sstevel@tonic-gate 647c478bd9Sstevel@tonic-gate extern uint_t vac_colors; 657c478bd9Sstevel@tonic-gate 666061ce8aSkchow #define MAX_PRAGMA_ALIGN 128 676061ce8aSkchow 686061ce8aSkchow /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 696061ce8aSkchow 706061ce8aSkchow #if 
L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71affbd3ccSkchow #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 726061ce8aSkchow #else 736061ce8aSkchow #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 746061ce8aSkchow #endif 75affbd3ccSkchow char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76affbd3ccSkchow 777c478bd9Sstevel@tonic-gate /* 787c478bd9Sstevel@tonic-gate * number of page colors equivalent to reqested color in page_get routines. 797c478bd9Sstevel@tonic-gate * If set, keeps large pages intact longer and keeps MPO allocation 807c478bd9Sstevel@tonic-gate * from the local mnode in favor of acquiring the 'correct' page color from 817c478bd9Sstevel@tonic-gate * a demoted large page or from a remote mnode. 827c478bd9Sstevel@tonic-gate */ 835d07b933Sdp uint_t colorequiv; 845d07b933Sdp 855d07b933Sdp /* 865d07b933Sdp * color equivalency mask for each page size. 875d07b933Sdp * Mask is computed based on cpu L2$ way sizes and colorequiv global. 885d07b933Sdp * High 4 bits determine the number of high order bits of the color to ignore. 895d07b933Sdp * Low 4 bits determines number of low order bits of color to ignore (it's only 905d07b933Sdp * relevant for hashed index based page coloring). 915d07b933Sdp */ 925d07b933Sdp uchar_t colorequivszc[MMU_PAGE_SIZES]; 937c478bd9Sstevel@tonic-gate 947c478bd9Sstevel@tonic-gate /* 957c478bd9Sstevel@tonic-gate * if set, specifies the percentage of large pages that are free from within 967c478bd9Sstevel@tonic-gate * a large page region before attempting to lock those pages for 977c478bd9Sstevel@tonic-gate * page_get_contig_pages processing. 987c478bd9Sstevel@tonic-gate * 997c478bd9Sstevel@tonic-gate * Should be turned on when kpr is available when page_trylock_contig_pages 1007c478bd9Sstevel@tonic-gate * can be more selective. 
1017c478bd9Sstevel@tonic-gate */ 1027c478bd9Sstevel@tonic-gate 1037c478bd9Sstevel@tonic-gate int ptcpthreshold; 1047c478bd9Sstevel@tonic-gate 1057c478bd9Sstevel@tonic-gate /* 1067c478bd9Sstevel@tonic-gate * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 10783f9b804Skchow * Enabled by default via pgcplimitsearch. 10883f9b804Skchow * 10983f9b804Skchow * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 11083f9b804Skchow * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 11183f9b804Skchow * bound. This upper bound range guarantees: 11283f9b804Skchow * - all large page 'slots' will be searched over time 11383f9b804Skchow * - the minimum (1) large page candidates considered on each pgcp call 11483f9b804Skchow * - count doesn't wrap around to 0 1157c478bd9Sstevel@tonic-gate */ 11683f9b804Skchow pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 1177c478bd9Sstevel@tonic-gate int pgcplimitsearch = 1; 1187c478bd9Sstevel@tonic-gate 11983f9b804Skchow #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 12083f9b804Skchow #define SETPGCPFAILCNT(szc) \ 12183f9b804Skchow if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 12283f9b804Skchow pgcpfailcnt[szc] = PGCPFAILMAX / 2; 12383f9b804Skchow 1247c478bd9Sstevel@tonic-gate #ifdef VM_STATS 1257c478bd9Sstevel@tonic-gate struct vmm_vmstats_str vmm_vmstats; 1267c478bd9Sstevel@tonic-gate 1277c478bd9Sstevel@tonic-gate #endif /* VM_STATS */ 1287c478bd9Sstevel@tonic-gate 1297c478bd9Sstevel@tonic-gate #if defined(__sparc) 1307c478bd9Sstevel@tonic-gate #define LPGCREATE 0 1317c478bd9Sstevel@tonic-gate #else 1327c478bd9Sstevel@tonic-gate /* enable page_get_contig_pages */ 1337c478bd9Sstevel@tonic-gate #define LPGCREATE 1 1347c478bd9Sstevel@tonic-gate #endif 1357c478bd9Sstevel@tonic-gate 1367c478bd9Sstevel@tonic-gate int pg_contig_disable; 1377c478bd9Sstevel@tonic-gate int pg_lpgcreate_nocage = LPGCREATE; 1387c478bd9Sstevel@tonic-gate 1397c478bd9Sstevel@tonic-gate /* 14019397407SSherry Moore * 
page_freelist_split pfn flag to signify no lo or hi pfn requirement. 1417c478bd9Sstevel@tonic-gate */ 1427c478bd9Sstevel@tonic-gate #define PFNNULL 0 1437c478bd9Sstevel@tonic-gate 1447c478bd9Sstevel@tonic-gate /* Flags involved in promotion and demotion routines */ 1457c478bd9Sstevel@tonic-gate #define PC_FREE 0x1 /* put page on freelist */ 1467c478bd9Sstevel@tonic-gate #define PC_ALLOC 0x2 /* return page for allocation */ 1477c478bd9Sstevel@tonic-gate 1487c478bd9Sstevel@tonic-gate /* 1497c478bd9Sstevel@tonic-gate * Flag for page_demote to be used with PC_FREE to denote that we don't care 1507c478bd9Sstevel@tonic-gate * what the color is as the color parameter to the function is ignored. 1517c478bd9Sstevel@tonic-gate */ 1527c478bd9Sstevel@tonic-gate #define PC_NO_COLOR (-1) 1537c478bd9Sstevel@tonic-gate 1545d07b933Sdp /* mtype value for page_promote to use when mtype does not matter */ 1555d07b933Sdp #define PC_MTYPE_ANY (-1) 1565d07b933Sdp 1577c478bd9Sstevel@tonic-gate /* 1587c478bd9Sstevel@tonic-gate * page counters candidates info 1597c478bd9Sstevel@tonic-gate * See page_ctrs_cands comment below for more details. 1607c478bd9Sstevel@tonic-gate * fields are as follows: 1617c478bd9Sstevel@tonic-gate * pcc_pages_free: # pages which freelist coalesce can create 1627c478bd9Sstevel@tonic-gate * pcc_color_free: pointer to page free counts per color 1637c478bd9Sstevel@tonic-gate */ 1647c478bd9Sstevel@tonic-gate typedef struct pcc_info { 1657c478bd9Sstevel@tonic-gate pgcnt_t pcc_pages_free; 1667c478bd9Sstevel@tonic-gate pgcnt_t *pcc_color_free; 16706fb6a36Sdv uint_t pad[12]; 1687c478bd9Sstevel@tonic-gate } pcc_info_t; 1697c478bd9Sstevel@tonic-gate 1707c478bd9Sstevel@tonic-gate /* 1717c478bd9Sstevel@tonic-gate * On big machines it can take a long time to check page_counters 1727c478bd9Sstevel@tonic-gate * arrays. 
page_ctrs_cands is a summary array whose elements are a dynamically 1737c478bd9Sstevel@tonic-gate * updated sum of all elements of the corresponding page_counters arrays. 1747c478bd9Sstevel@tonic-gate * page_freelist_coalesce() searches page_counters only if an appropriate 1757c478bd9Sstevel@tonic-gate * element of page_ctrs_cands array is greater than 0. 1767c478bd9Sstevel@tonic-gate * 1775d07b933Sdp * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 1787c478bd9Sstevel@tonic-gate */ 1795d07b933Sdp pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 1807c478bd9Sstevel@tonic-gate 1817c478bd9Sstevel@tonic-gate /* 1827c478bd9Sstevel@tonic-gate * Return in val the total number of free pages which can be created 1835d07b933Sdp * for the given mnode (m), mrange (g), and region size (r) 1847c478bd9Sstevel@tonic-gate */ 1855d07b933Sdp #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 1867c478bd9Sstevel@tonic-gate int i; \ 1877c478bd9Sstevel@tonic-gate val = 0; \ 1887c478bd9Sstevel@tonic-gate for (i = 0; i < NPC_MUTEX; i++) { \ 1895d07b933Sdp val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 1907c478bd9Sstevel@tonic-gate } \ 1917c478bd9Sstevel@tonic-gate } 1927c478bd9Sstevel@tonic-gate 1937c478bd9Sstevel@tonic-gate /* 1947c478bd9Sstevel@tonic-gate * Return in val the total number of free pages which can be created 1955d07b933Sdp * for the given mnode (m), mrange (g), region size (r), and color (c) 1967c478bd9Sstevel@tonic-gate */ 1975d07b933Sdp #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 1987c478bd9Sstevel@tonic-gate int i; \ 1997c478bd9Sstevel@tonic-gate val = 0; \ 2005d07b933Sdp ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 2017c478bd9Sstevel@tonic-gate for (i = 0; i < NPC_MUTEX; i++) { \ 2025d07b933Sdp val += \ 2035d07b933Sdp page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 2047c478bd9Sstevel@tonic-gate } \ 2057c478bd9Sstevel@tonic-gate } 2067c478bd9Sstevel@tonic-gate 2077c478bd9Sstevel@tonic-gate /* 
2087c478bd9Sstevel@tonic-gate * We can only allow a single thread to update a counter within the physical 2097c478bd9Sstevel@tonic-gate * range of the largest supported page size. That is the finest granularity 2107c478bd9Sstevel@tonic-gate * possible since the counter values are dependent on each other 2117c478bd9Sstevel@tonic-gate * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 2127c478bd9Sstevel@tonic-gate * ctr_mutex lock index for a particular physical range. 2137c478bd9Sstevel@tonic-gate */ 2147c478bd9Sstevel@tonic-gate static kmutex_t *ctr_mutex[NPC_MUTEX]; 2157c478bd9Sstevel@tonic-gate 2167c478bd9Sstevel@tonic-gate #define PP_CTR_LOCK_INDX(pp) \ 2175d07b933Sdp (((pp)->p_pagenum >> \ 2187c478bd9Sstevel@tonic-gate (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 2197c478bd9Sstevel@tonic-gate 2205d07b933Sdp #define INVALID_COLOR 0xffffffff 2215d07b933Sdp #define INVALID_MASK 0xffffffff 2225d07b933Sdp 2237c478bd9Sstevel@tonic-gate /* 2247c478bd9Sstevel@tonic-gate * Local functions prototypes. 
2257c478bd9Sstevel@tonic-gate */ 2267c478bd9Sstevel@tonic-gate 227affbd3ccSkchow void page_ctr_add(int, int, page_t *, int); 228affbd3ccSkchow void page_ctr_add_internal(int, int, page_t *, int); 229affbd3ccSkchow void page_ctr_sub(int, int, page_t *, int); 2305d07b933Sdp void page_ctr_sub_internal(int, int, page_t *, int); 2317c478bd9Sstevel@tonic-gate void page_freelist_lock(int); 2327c478bd9Sstevel@tonic-gate void page_freelist_unlock(int); 2335d07b933Sdp page_t *page_promote(int, pfn_t, uchar_t, int, int); 23419397407SSherry Moore page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int); 2355d07b933Sdp page_t *page_freelist_split(uchar_t, 23619397407SSherry Moore uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); 2377c478bd9Sstevel@tonic-gate page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 2387c478bd9Sstevel@tonic-gate static int page_trylock_cons(page_t *pp, se_t se); 2397c478bd9Sstevel@tonic-gate 2407c478bd9Sstevel@tonic-gate /* 2417c478bd9Sstevel@tonic-gate * The page_counters array below is used to keep track of free contiguous 2427c478bd9Sstevel@tonic-gate * physical memory. A hw_page_map_t will be allocated per mnode per szc. 2437c478bd9Sstevel@tonic-gate * This contains an array of counters, the size of the array, a shift value 2447c478bd9Sstevel@tonic-gate * used to convert a pagenum into a counter array index or vice versa, as 2457c478bd9Sstevel@tonic-gate * well as a cache of the last successful index to be promoted to a larger 2467c478bd9Sstevel@tonic-gate * page size. As an optimization, we keep track of the last successful index 2477c478bd9Sstevel@tonic-gate * to be promoted per page color for the given size region, and this is 2487c478bd9Sstevel@tonic-gate * allocated dynamically based upon the number of colors for a given 2497c478bd9Sstevel@tonic-gate * region size. 
2507c478bd9Sstevel@tonic-gate * 2517c478bd9Sstevel@tonic-gate * Conceptually, the page counters are represented as: 2527c478bd9Sstevel@tonic-gate * 2537c478bd9Sstevel@tonic-gate * page_counters[region_size][mnode] 2547c478bd9Sstevel@tonic-gate * 2557c478bd9Sstevel@tonic-gate * region_size: size code of a candidate larger page made up 2567c478bd9Sstevel@tonic-gate * of contiguous free smaller pages. 2577c478bd9Sstevel@tonic-gate * 2587c478bd9Sstevel@tonic-gate * page_counters[region_size][mnode].hpm_counters[index]: 2597c478bd9Sstevel@tonic-gate * represents how many (region_size - 1) pages either 2607c478bd9Sstevel@tonic-gate * exist or can be created within the given index range. 2617c478bd9Sstevel@tonic-gate * 2627c478bd9Sstevel@tonic-gate * Let's look at a sparc example: 2637c478bd9Sstevel@tonic-gate * If we want to create a free 512k page, we look at region_size 2 2647c478bd9Sstevel@tonic-gate * for the mnode we want. We calculate the index and look at a specific 2657c478bd9Sstevel@tonic-gate * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 2667c478bd9Sstevel@tonic-gate * this location, it means that 8 64k pages either exist or can be created 2677c478bd9Sstevel@tonic-gate * from 8K pages in order to make a single free 512k page at the given 2687c478bd9Sstevel@tonic-gate * index. Note that when a region is full, it will contribute to the 2697c478bd9Sstevel@tonic-gate * counts in the region above it. Thus we will not know what page 2707c478bd9Sstevel@tonic-gate * size the free pages will be which can be promoted to this new free 2717c478bd9Sstevel@tonic-gate * page unless we look at all regions below the current region. 2727c478bd9Sstevel@tonic-gate */ 2737c478bd9Sstevel@tonic-gate 2747c478bd9Sstevel@tonic-gate /* 2757c478bd9Sstevel@tonic-gate * Note: hpmctr_t is defined in platform vm_dep.h 2767c478bd9Sstevel@tonic-gate * hw_page_map_t contains all the information needed for the page_counters 2777c478bd9Sstevel@tonic-gate * logic. 
The fields are as follows: 2787c478bd9Sstevel@tonic-gate * 2797c478bd9Sstevel@tonic-gate * hpm_counters: dynamically allocated array to hold counter data 2807c478bd9Sstevel@tonic-gate * hpm_entries: entries in hpm_counters 2817c478bd9Sstevel@tonic-gate * hpm_shift: shift for pnum/array index conv 2827c478bd9Sstevel@tonic-gate * hpm_base: PFN mapped to counter index 0 2837c478bd9Sstevel@tonic-gate * hpm_color_current: last index in counter array for this color at 2847c478bd9Sstevel@tonic-gate * which we successfully created a large page 2857c478bd9Sstevel@tonic-gate */ 2867c478bd9Sstevel@tonic-gate typedef struct hw_page_map { 2877c478bd9Sstevel@tonic-gate hpmctr_t *hpm_counters; 2887c478bd9Sstevel@tonic-gate size_t hpm_entries; 2897c478bd9Sstevel@tonic-gate int hpm_shift; 2907c478bd9Sstevel@tonic-gate pfn_t hpm_base; 2915d07b933Sdp size_t *hpm_color_current[MAX_MNODE_MRANGES]; 29206fb6a36Sdv #if defined(__sparc) 29306fb6a36Sdv uint_t pad[4]; 29406fb6a36Sdv #endif 2957c478bd9Sstevel@tonic-gate } hw_page_map_t; 2967c478bd9Sstevel@tonic-gate 2977c478bd9Sstevel@tonic-gate /* 2987c478bd9Sstevel@tonic-gate * Element zero is not used, but is allocated for convenience. 2997c478bd9Sstevel@tonic-gate */ 3007c478bd9Sstevel@tonic-gate static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 3017c478bd9Sstevel@tonic-gate 3025d07b933Sdp /* 3035d07b933Sdp * Cached value of MNODE_RANGE_CNT(mnode). 3045d07b933Sdp * This is a function call in x86. 3055d07b933Sdp */ 3065d07b933Sdp static int mnode_nranges[MAX_MEM_NODES]; 3075d07b933Sdp static int mnode_maxmrange[MAX_MEM_NODES]; 3085d07b933Sdp 3097c478bd9Sstevel@tonic-gate /* 3107c478bd9Sstevel@tonic-gate * The following macros are convenient ways to get access to the individual 3117c478bd9Sstevel@tonic-gate * elements of the page_counters arrays. They can be used on both 3127c478bd9Sstevel@tonic-gate * the left side and right side of equations. 
3137c478bd9Sstevel@tonic-gate */ 3147c478bd9Sstevel@tonic-gate #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 3157c478bd9Sstevel@tonic-gate (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 3167c478bd9Sstevel@tonic-gate 3177c478bd9Sstevel@tonic-gate #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 3187c478bd9Sstevel@tonic-gate (page_counters[(rg_szc)][(mnode)].hpm_counters) 3197c478bd9Sstevel@tonic-gate 3207c478bd9Sstevel@tonic-gate #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 3217c478bd9Sstevel@tonic-gate (page_counters[(rg_szc)][(mnode)].hpm_shift) 3227c478bd9Sstevel@tonic-gate 3237c478bd9Sstevel@tonic-gate #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 3247c478bd9Sstevel@tonic-gate (page_counters[(rg_szc)][(mnode)].hpm_entries) 3257c478bd9Sstevel@tonic-gate 3267c478bd9Sstevel@tonic-gate #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 3277c478bd9Sstevel@tonic-gate (page_counters[(rg_szc)][(mnode)].hpm_base) 3287c478bd9Sstevel@tonic-gate 3295d07b933Sdp #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 3305d07b933Sdp (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 3317c478bd9Sstevel@tonic-gate 3325d07b933Sdp #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 3335d07b933Sdp (page_counters[(rg_szc)][(mnode)]. 
\ 3345d07b933Sdp hpm_color_current[(mrange)][(color)]) 3357c478bd9Sstevel@tonic-gate 3367c478bd9Sstevel@tonic-gate #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 3377c478bd9Sstevel@tonic-gate (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 3387c478bd9Sstevel@tonic-gate PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 3397c478bd9Sstevel@tonic-gate 3407c478bd9Sstevel@tonic-gate #define IDX_TO_PNUM(mnode, rg_szc, index) \ 3417c478bd9Sstevel@tonic-gate (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 3427c478bd9Sstevel@tonic-gate ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 3437c478bd9Sstevel@tonic-gate 3447c478bd9Sstevel@tonic-gate /* 3457c478bd9Sstevel@tonic-gate * Protects the hpm_counters and hpm_color_current memory from changing while 3467c478bd9Sstevel@tonic-gate * looking at page counters information. 3477c478bd9Sstevel@tonic-gate * Grab the write lock to modify what these fields point at. 3487c478bd9Sstevel@tonic-gate * Grab the read lock to prevent any pointers from changing. 3497c478bd9Sstevel@tonic-gate * The write lock can not be held during memory allocation due to a possible 3507c478bd9Sstevel@tonic-gate * recursion deadlock with trying to grab the read lock while the 3517c478bd9Sstevel@tonic-gate * write lock is already held. 3527c478bd9Sstevel@tonic-gate */ 3537c478bd9Sstevel@tonic-gate krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 3547c478bd9Sstevel@tonic-gate 355affbd3ccSkchow 356affbd3ccSkchow /* 357affbd3ccSkchow * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 358affbd3ccSkchow */ 359affbd3ccSkchow void 360affbd3ccSkchow cpu_vm_data_init(struct cpu *cp) 361affbd3ccSkchow { 362affbd3ccSkchow if (cp == CPU0) { 363affbd3ccSkchow cp->cpu_vm_data = (void *)&vm_cpu_data0; 364affbd3ccSkchow } else { 365affbd3ccSkchow void *kmptr; 3666061ce8aSkchow int align; 3676061ce8aSkchow size_t sz; 368affbd3ccSkchow 3696061ce8aSkchow align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 3706061ce8aSkchow sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 3716061ce8aSkchow kmptr = kmem_zalloc(sz, KM_SLEEP); 372affbd3ccSkchow cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 373affbd3ccSkchow ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 3746061ce8aSkchow ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 375affbd3ccSkchow } 376affbd3ccSkchow } 377affbd3ccSkchow 378affbd3ccSkchow /* 379affbd3ccSkchow * free cpu_vm_data 380affbd3ccSkchow */ 381affbd3ccSkchow void 382affbd3ccSkchow cpu_vm_data_destroy(struct cpu *cp) 383affbd3ccSkchow { 384affbd3ccSkchow if (cp->cpu_seqid && cp->cpu_vm_data) { 385affbd3ccSkchow ASSERT(cp != CPU0); 386affbd3ccSkchow kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 3876061ce8aSkchow ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 388affbd3ccSkchow } 389affbd3ccSkchow cp->cpu_vm_data = NULL; 390affbd3ccSkchow } 391affbd3ccSkchow 392affbd3ccSkchow 3937c478bd9Sstevel@tonic-gate /* 3947c478bd9Sstevel@tonic-gate * page size to page size code 3957c478bd9Sstevel@tonic-gate */ 3967c478bd9Sstevel@tonic-gate int 3977c478bd9Sstevel@tonic-gate page_szc(size_t pagesize) 3987c478bd9Sstevel@tonic-gate { 3997c478bd9Sstevel@tonic-gate int i = 0; 4007c478bd9Sstevel@tonic-gate 4017c478bd9Sstevel@tonic-gate while (hw_page_array[i].hp_size) { 4027c478bd9Sstevel@tonic-gate if (pagesize == hw_page_array[i].hp_size) 4037c478bd9Sstevel@tonic-gate return (i); 4047c478bd9Sstevel@tonic-gate i++; 4057c478bd9Sstevel@tonic-gate } 4067c478bd9Sstevel@tonic-gate return (-1); 4077c478bd9Sstevel@tonic-gate } 4087c478bd9Sstevel@tonic-gate 4097c478bd9Sstevel@tonic-gate /* 4104abce959Smec * page size to page size code with the restriction that it be a supported 4114abce959Smec * user page size. If it's not a supported user page size, -1 will be returned. 
4127c478bd9Sstevel@tonic-gate */ 4137c478bd9Sstevel@tonic-gate int 4144abce959Smec page_szc_user_filtered(size_t pagesize) 4157c478bd9Sstevel@tonic-gate { 4167c478bd9Sstevel@tonic-gate int szc = page_szc(pagesize); 4174abce959Smec if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 4184abce959Smec return (szc); 4194abce959Smec } 4207c478bd9Sstevel@tonic-gate return (-1); 4217c478bd9Sstevel@tonic-gate } 4227c478bd9Sstevel@tonic-gate 4237c478bd9Sstevel@tonic-gate /* 4247c478bd9Sstevel@tonic-gate * Return how many page sizes are available for the user to use. This is 4257c478bd9Sstevel@tonic-gate * what the hardware supports and not based upon how the OS implements the 4267c478bd9Sstevel@tonic-gate * support of different page sizes. 42702bc52beSkchow * 42802bc52beSkchow * If legacy is non-zero, return the number of pagesizes available to legacy 42902bc52beSkchow * applications. The number of legacy page sizes might be less than the 43002bc52beSkchow * exported user page sizes. This is to prevent legacy applications that 43102bc52beSkchow * use the largest page size returned from getpagesizes(3c) from inadvertantly 43202bc52beSkchow * using the 'new' large pagesizes. 
4337c478bd9Sstevel@tonic-gate */ 4347c478bd9Sstevel@tonic-gate uint_t 43502bc52beSkchow page_num_user_pagesizes(int legacy) 4367c478bd9Sstevel@tonic-gate { 43702bc52beSkchow if (legacy) 43802bc52beSkchow return (mmu_legacy_page_sizes); 4397c478bd9Sstevel@tonic-gate return (mmu_exported_page_sizes); 4407c478bd9Sstevel@tonic-gate } 4417c478bd9Sstevel@tonic-gate 4427c478bd9Sstevel@tonic-gate uint_t 4437c478bd9Sstevel@tonic-gate page_num_pagesizes(void) 4447c478bd9Sstevel@tonic-gate { 4457c478bd9Sstevel@tonic-gate return (mmu_page_sizes); 4467c478bd9Sstevel@tonic-gate } 4477c478bd9Sstevel@tonic-gate 4487c478bd9Sstevel@tonic-gate /* 4497c478bd9Sstevel@tonic-gate * returns the count of the number of base pagesize pages associated with szc 4507c478bd9Sstevel@tonic-gate */ 4517c478bd9Sstevel@tonic-gate pgcnt_t 4527c478bd9Sstevel@tonic-gate page_get_pagecnt(uint_t szc) 4537c478bd9Sstevel@tonic-gate { 4547c478bd9Sstevel@tonic-gate if (szc >= mmu_page_sizes) 4557c478bd9Sstevel@tonic-gate panic("page_get_pagecnt: out of range %d", szc); 4567c478bd9Sstevel@tonic-gate return (hw_page_array[szc].hp_pgcnt); 4577c478bd9Sstevel@tonic-gate } 4587c478bd9Sstevel@tonic-gate 4597c478bd9Sstevel@tonic-gate size_t 4607c478bd9Sstevel@tonic-gate page_get_pagesize(uint_t szc) 4617c478bd9Sstevel@tonic-gate { 4627c478bd9Sstevel@tonic-gate if (szc >= mmu_page_sizes) 4637c478bd9Sstevel@tonic-gate panic("page_get_pagesize: out of range %d", szc); 4647c478bd9Sstevel@tonic-gate return (hw_page_array[szc].hp_size); 4657c478bd9Sstevel@tonic-gate } 4667c478bd9Sstevel@tonic-gate 4677c478bd9Sstevel@tonic-gate /* 4687c478bd9Sstevel@tonic-gate * Return the size of a page based upon the index passed in. An index of 4697c478bd9Sstevel@tonic-gate * zero refers to the smallest page size in the system, and as index increases 4707c478bd9Sstevel@tonic-gate * it refers to the next larger supported page size in the system. 
4717c478bd9Sstevel@tonic-gate * Note that szc and userszc may not be the same due to unsupported szc's on 4727c478bd9Sstevel@tonic-gate * some systems. 4737c478bd9Sstevel@tonic-gate */ 4747c478bd9Sstevel@tonic-gate size_t 4757c478bd9Sstevel@tonic-gate page_get_user_pagesize(uint_t userszc) 4767c478bd9Sstevel@tonic-gate { 4777c478bd9Sstevel@tonic-gate uint_t szc = USERSZC_2_SZC(userszc); 4787c478bd9Sstevel@tonic-gate 4797c478bd9Sstevel@tonic-gate if (szc >= mmu_page_sizes) 4807c478bd9Sstevel@tonic-gate panic("page_get_user_pagesize: out of range %d", szc); 4817c478bd9Sstevel@tonic-gate return (hw_page_array[szc].hp_size); 4827c478bd9Sstevel@tonic-gate } 4837c478bd9Sstevel@tonic-gate 4847c478bd9Sstevel@tonic-gate uint_t 4857c478bd9Sstevel@tonic-gate page_get_shift(uint_t szc) 4867c478bd9Sstevel@tonic-gate { 4877c478bd9Sstevel@tonic-gate if (szc >= mmu_page_sizes) 4887c478bd9Sstevel@tonic-gate panic("page_get_shift: out of range %d", szc); 4895d07b933Sdp return (PAGE_GET_SHIFT(szc)); 4907c478bd9Sstevel@tonic-gate } 4917c478bd9Sstevel@tonic-gate 4927c478bd9Sstevel@tonic-gate uint_t 4937c478bd9Sstevel@tonic-gate page_get_pagecolors(uint_t szc) 4947c478bd9Sstevel@tonic-gate { 4955d07b933Sdp if (szc >= mmu_page_sizes) 4965d07b933Sdp panic("page_get_pagecolors: out of range %d", szc); 4975d07b933Sdp return (PAGE_GET_PAGECOLORS(szc)); 4985d07b933Sdp } 4995d07b933Sdp 5005d07b933Sdp /* 5015d07b933Sdp * this assigns the desired equivalent color after a split 5025d07b933Sdp */ 5035d07b933Sdp uint_t 5045d07b933Sdp page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 5055d07b933Sdp uint_t ncolor, uint_t ceq_mask) 5065d07b933Sdp { 5075d07b933Sdp ASSERT(nszc > szc); 5085d07b933Sdp ASSERT(szc < mmu_page_sizes); 5095d07b933Sdp ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 5105d07b933Sdp ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 5115d07b933Sdp 5125d07b933Sdp color &= ceq_mask; 513ce8eb11aSdp ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 5145d07b933Sdp return (color | (ncolor & 
~ceq_mask));
}

/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 *
 * NOTE(review): the traversal order and the per-structure sizes computed
 * here must stay in lockstep with the carving order in page_ctrs_alloc()
 * below; change both together.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			/*
			 * hpm_counters is only allocated once per shared
			 * group; later mnodes reuse firstmn's array.
			 */
			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	/* one hw_page_map_t per mnode for each region size */
	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

/*
 * Called by startup().
 * Carve the region sized by page_ctrs_sz() (beginning at alloc_base) into
 * the page counter structures: page_counters, page_ctrs_cands and their
 * pcc_color_free arrays, ctr_mutex, the page list counts (PLCNT_INIT), the
 * hpm_color_current arrays and the (possibly shared) hpm_counters arrays.
 * Returns the first unused address past everything carved out here.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			/*
			 * Seed the per-color "current" search index for each
			 * color with the index of the first page of that
			 * color at or above r_base (0 if the iterator finds
			 * no valid pfn, or the index would fall off the end).
			 */
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			/*
			 * Region became completely free: record it as a
			 * coalescing candidate for its lock class / mrange.
			 */
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

/*
 * Locked wrapper for page_ctr_add_internal(): takes the ctr_mutex for
 * this page's lock class and mnode around the counter update.
 */
void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t *lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Mirror of page_ctr_add_internal() for page removal from a free list.
 * Caller must hold the appropriate ctr_mutex (or be single threaded).
 */
void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			/*
			 * Region was completely free before this decrement:
			 * remove it from the coalescing candidate counts.
			 */
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

/*
 * Locked wrapper for page_ctr_sub_internal(): takes the ctr_mutex for
 * this page's lock class and mnode around the counter update.
 */
void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t *lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays.
 * Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 *
 * Returns 0 on success, ENOMEM if any of the preallocations fail.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	/* i = -1 asks HPM_COUNTERS_LIMITS for this mnode's own limits */
	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	/* from here on, cands_cache_nranges reflects what must be freed */
	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			/* swap so old contents end up in cands_cache to free */
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come thru here to free memory when pre-alloc fails, and also to
	 * free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				/* only heap allocations are freeable */
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}

/*
 * Cleanup the hpm_counters field
 * in the page counters
 * array.
 */
void
page_ctrs_cleanup(void)
{
	int r;	/* region size */
	int i;	/* mnode index */

	/*
	 * Get the page counters write lock while we are
	 * setting the page hpm_counters field to NULL
	 * for non-existent mnodes.
	 */
	for (i = 0; i < max_mem_nodes; i++) {
		PAGE_CTRS_WRITE_LOCK(i);
		if (mem_node_config[i].exists) {
			PAGE_CTRS_WRITE_UNLOCK(i);
			continue;
		}
		for (r = 1; r < mmu_page_sizes; r++) {
			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
		}
		PAGE_CTRS_WRITE_UNLOCK(i);
	}
}

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		/* single page: must be a degenerate self-linked list */
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	/* root must be aligned to the large-page boundary */
	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages: each constituent page must be free, aged,
	 * physically contiguous with its neighbor, and share the root's
	 * NORELOC setting.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

/*
 * Acquire every freelist (FPC) and cachelist (CPC) mutex for mnode,
 * locking out all promotion/demotion activity on that node.
 */
void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

/*
 * Release the mutexes taken by page_freelist_lock().
 */
void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (ie. single
		 * threaded), add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}


#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/*
	 * Decrement page counters
	 */
	page_ctr_sub_internal(mnode, mtype, pp, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	/* mtype changes once the page is marked NORELOC */
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else	/* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

/*
 * Add a free large page (or, with PG_LIST_ISINIT during single-threaded
 * startup, a maximum-size page) to the freelists, then drop the exclusive
 * lock on each constituent page.
 */
void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

/*
 * Remove a free page of size up to szc (demoting it first if it is larger)
 * from the freelists. pp must be the page root and held EXCL by the caller.
 */
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t bin, mtype;
	int mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;	/* NULL means we hold the whole freelist lock */
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	/* only promote when every constituent page of the region is free */
	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.
If 19167c478bd9Sstevel@tonic-gate * we failed to lock the new large page, then we will return NULL to the 19177c478bd9Sstevel@tonic-gate * caller and put the large page on the freelist instead. 19187c478bd9Sstevel@tonic-gate * If flags is PC_FREE, then the large page will be placed on the freelist, 19197c478bd9Sstevel@tonic-gate * and NULL will be returned. 19207c478bd9Sstevel@tonic-gate * The caller is responsible for locking the freelist as well as any other 19217c478bd9Sstevel@tonic-gate * accounting which needs to be done for a returned page. 19227c478bd9Sstevel@tonic-gate * 19237c478bd9Sstevel@tonic-gate * RFE: For performance pass in pp instead of pfnum so 19247c478bd9Sstevel@tonic-gate * we can avoid excessive calls to page_numtopp_nolock(). 19257c478bd9Sstevel@tonic-gate * This would depend on an assumption that all contiguous 19267c478bd9Sstevel@tonic-gate * pages are in the same memseg so we can just add/dec 19277c478bd9Sstevel@tonic-gate * our pp. 19287c478bd9Sstevel@tonic-gate * 19297c478bd9Sstevel@tonic-gate * Lock ordering: 19307c478bd9Sstevel@tonic-gate * 19317c478bd9Sstevel@tonic-gate * There is a potential but rare deadlock situation 19327c478bd9Sstevel@tonic-gate * for page promotion and demotion operations. 
The problem 19337c478bd9Sstevel@tonic-gate * is there are two paths into the freelist manager and 19347c478bd9Sstevel@tonic-gate * they have different lock orders: 19357c478bd9Sstevel@tonic-gate * 19367c478bd9Sstevel@tonic-gate * page_create() 19377c478bd9Sstevel@tonic-gate * lock freelist 19387c478bd9Sstevel@tonic-gate * page_lock(EXCL) 19397c478bd9Sstevel@tonic-gate * unlock freelist 19407c478bd9Sstevel@tonic-gate * return 19417c478bd9Sstevel@tonic-gate * caller drops page_lock 19427c478bd9Sstevel@tonic-gate * 19437c478bd9Sstevel@tonic-gate * page_free() and page_reclaim() 19447c478bd9Sstevel@tonic-gate * caller grabs page_lock(EXCL) 19457c478bd9Sstevel@tonic-gate * 19467c478bd9Sstevel@tonic-gate * lock freelist 19477c478bd9Sstevel@tonic-gate * unlock freelist 19487c478bd9Sstevel@tonic-gate * drop page_lock 19497c478bd9Sstevel@tonic-gate * 19507c478bd9Sstevel@tonic-gate * What prevents a thread in page_create() from deadlocking 19517c478bd9Sstevel@tonic-gate * with a thread freeing or reclaiming the same page is the 19527c478bd9Sstevel@tonic-gate * page_trylock() in page_get_freelist(). If the trylock fails 19537c478bd9Sstevel@tonic-gate * it skips the page. 19547c478bd9Sstevel@tonic-gate * 19557c478bd9Sstevel@tonic-gate * The lock ordering for promotion and demotion is the same as 19567c478bd9Sstevel@tonic-gate * for page_create(). Since the same deadlock could occur during 19577c478bd9Sstevel@tonic-gate * page promotion and freeing or reclaiming of a page on the 19587c478bd9Sstevel@tonic-gate * cache list we might have to fail the operation and undo what 19597c478bd9Sstevel@tonic-gate * have done so far. Again this is rare. 
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t *pp, *pplist, *tpp, *start_pp;
	pgcnt_t new_npgs, npgs;
	uint_t bin;
	pgcnt_t tmpnpgs, pages_left;
	uint_t noreloc;
	int which_list;
	ulong_t index;
	kmutex_t *phm;

	/*
	 * General algorithm:
	 * Find the starting page.
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			/* mixed relocatability; cannot build one large page */
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 *
			 * p_szc != 0 pages live on the size-indexed
			 * vp lists; PAGESIZE pages on the plain lists.
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 * Returns a page whose pfn is < pfnmax
 */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/* take the whole cur_szc page off its freelist before splitting */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Return this page to the caller only if it has the
			 * requested color, a page hasn't been claimed yet,
			 * it satisfies the pfnmax bound (pfnmax == 0 means
			 * no bound), and it can be exclusively locked.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			/* pp has wrapped back to the head of the sublist here */
			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}

/* set nonzero to disable all large-page coalescing (debug/tuning knob) */
int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int	r = szc;		/* region size */
	int	mrange;
	uint_t	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
	MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	hi++;

	/* cap the upper bound of the search at pfnhi, if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	if (lo == (pfn_t)-1) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
		    &it);
	}

	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/* calculate the number of page candidates and initial search index */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	/* start from the stashed index if it is within [lo, hi) */
	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else {
		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
		if (pfnum == (pfn_t)-1) {
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			ASSERT(pfnum != (pfn_t)-1);
		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
			/* invalid color, get the closest correct pfn */
			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask, &it);
			if (pfnum >= hi) {
				pfnum = lo;
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			}
		}
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;			/* page corresponding to idx0 */
	nhi = 0;			/* search kcage ranges */
#endif

	/* scan counters, wrapping around to idx0 at most once */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
				    ceq_mask)
					goto next;
				if (interleaved_mnodes &&
				    PFN_2_MEM_NODE(pfnum) != mnode)
					goto next;
			}
		}
#endif

		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 * brutal than locking the entire freelist. So far
		 * this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		/* re-check under the lock; the counter may have changed */
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated. Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask, &it);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}

/*
 * For the given mnode, promote as many small pages to large pages as possible.
 * mnode can be -1, which means do them all
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	size_t	len;
	int doall = interleaved_mnodes || mnode < 0;
	int mlo = doall ? 0 : mnode;
	int mhi = doall ? max_mem_nodes : (mnode + 1);

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.
	 */
	for (mnode = mlo; mnode < mhi; mnode++) {
		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
		page_freelist_lock(mnode);
	}
	for (r = mmu_page_sizes - 1; r > 0; r--) {
		for (mnode = mlo; mnode < mhi; mnode++) {
			pgcnt_t cands = 0;
			int mrange, nranges = mnode_nranges[mnode];

			/* skip the mnode when no range has any candidates */
			for (mrange = 0; mrange < nranges; mrange++) {
				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
				if (cands != 0)
					break;
			}
			if (cands == 0) {
				VM_STAT_ADD(vmm_vmstats.
				    page_ctrs_cands_skip_all);
				continue;
			}

			full = FULL_REGION_CNT(r);
			len = PAGE_COUNTERS_ENTRIES(mnode, r);

			for (idx = 0; idx < len; idx++) {
				if (PAGE_COUNTERS(mnode, r, idx) == full) {
					pfn_t pfnum =
					    IDX_TO_PNUM(mnode, r, idx);
					int tmnode = interleaved_mnodes ?
					    PFN_2_MEM_NODE(pfnum) : mnode;

					ASSERT(pfnum >=
					    mem_node_config[tmnode].physbase &&
					    pfnum <
					    mem_node_config[tmnode].physmax);

					(void) page_promote(tmnode,
					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
				}
			}
			/* shared hpm_counters covers all mnodes, so we quit */
			if (interleaved_mnodes)
				break;
		}
	}
	for (mnode = mlo; mnode < mhi; mnode++) {
		page_freelist_unlock(mnode);
		rw_exit(&page_ctrs_rwlock[mnode]);
	}
}

/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns a page on success, NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */

/*
 * Attempt to satisfy an allocation of size code 'szc' / color 'color' by
 * demoting a page from the next larger size freelist (walking further up
 * the size hierarchy as needed, guided by the page-list walker state in
 * 'plw').  If 'pfnlo'/'pfnhi' are not PFNNULL, only pages whose pfn lies
 * in [pfnlo, pfnhi) are eligible.  Returns the demoted page on success,
 * NULL if no larger page could be split.
 */
page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t	color_mask;

	/* already at the largest size -- nothing to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	/* translate the requested color into the next-size color space */
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for a large page
			 * below pfnhi; likewise, if pfnlo is not PFNNULL,
			 * only accept pages at or above pfnlo.  PFNNULL
			 * signifies no pfn requirement.  Walk the circular
			 * p_vpnext list until a page in range is found or
			 * we come back around to firstpp.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					/*
					 * Keep the cage freemem count in sync
					 * when handing out a NORELOC page.
					 */
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}

/*
 * Helper routine used only by the freelist code to lock
 * a page. If the page is a large page then it succeeds in
 * locking all the constituent pages or none at all.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t	*tpp, *first_pp = pp;

	/*
	 * Fail if can't lock first or only page.
	 */
	if (!page_trylock(pp, se)) {
		return (0);
	}

	/*
	 * PAGESIZE: common case.
	 */
	if (pp->p_szc == 0) {
		return (1);
	}

	/*
	 * Large page case: walk the circular p_next list of constituent
	 * pages, trylocking each in turn.
	 */
	tpp = pp->p_next;
	while (tpp != pp) {
		if (!page_trylock(tpp, se)) {
			/*
			 * On failure unlock what we have locked so far.
			 * We want to avoid attempting to capture these
			 * pages as the pcm mutex may be held which could
			 * lead to a recursive mutex panic.
			 */
			while (first_pp != tpp) {
				page_unlock_nocapture(first_pp);
				first_pp = first_pp->p_next;
			}
			return (0);
		}
		tpp = tpp->p_next;
	}
	return (1);
}

/*
 * init context for walking page lists
 * Called when a page of the given szc is unavailable. Sets markers
 * for the beginning of the search to detect when search has
 * completed a full cycle. Sets flags for splitting larger pages
 * and coalescing smaller pages. Page walking proceeds until a page
 * of the desired equivalent color is found.
 */
void
page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
    int use_ceq, page_list_walker_t *plw)
{
	uint_t	nszc, ceq_mask, colors;
	uchar_t	ceq = use_ceq ? colorequivszc[szc] : 0;

	ASSERT(szc < mmu_page_sizes);
	colors = PAGE_GET_PAGECOLORS(szc);

	plw->plw_colors = colors;
	plw->plw_color_mask = colors - 1;
	plw->plw_bin_marker = plw->plw_bin0 = bin;
	plw->plw_bin_split_prev = bin;
	/* PAGESIZE bins advance by vac_colors so vac color stays constant */
	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;

	/*
	 * if vac aliasing is possible make sure lower order color
	 * bits are never ignored
	 */
	if (vac_colors > 1)
		ceq &= 0xf0;

	/*
	 * calculate the number of non-equivalent colors and
	 * color equivalency mask
	 * (high nibble of ceq = shift count, low nibble = mask shift)
	 */
	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
	ASSERT(plw->plw_ceq_dif > 0);
	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);

	if (flags & PG_MATCH_COLOR) {
		if (cpu_page_colors < 0) {
			/*
			 * this is a heterogeneous machine with different CPUs
			 * having different size e$ (not supported for
			 * ni2/rock)
			 */
			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
			cpucolors = MAX(cpucolors, 1);
			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
			plw->plw_ceq_mask[szc] =
			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
		}
		/* exact color match: only one equivalence class to search */
		plw->plw_ceq_dif = 1;
	}

	/* we can split pages in the freelist, but not the cachelist */
	if (can_split) {
		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;

		/* set next szc color masks and number of free list bins */
		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
			    plw->plw_ceq_mask[szc]);
			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
		}
		plw->plw_ceq_mask[nszc] = INVALID_MASK;
		plw->plw_bins[nszc] = 0;

	} else {
		ASSERT(szc == 0);
		plw->plw_do_split = 0;
		plw->plw_bins[1] = 0;
		plw->plw_ceq_mask[1] = INVALID_MASK;
	}
}

/*
 * set mark to flag where next split should occur
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
	plw->plw_split_next =						     \
		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
		plw->plw_split_next =					     \
		INC_MASKED(plw->plw_split_next,				     \
		    neq_mask, plw->plw_color_mask);			     \
	}								     \
}

/*
 * Advance the page-list walk to the next bin to search, updating the
 * walker state ('plw') set up by page_list_walk_init().  Also decides
 * when a split of a larger page should be attempted (plw_do_split).
 * Returns the next bin index.
 */
uint_t
page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
{
	uint_t	neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
	uint_t	bin0_nsz, nbin_nsz, nbin0, nbin;
	uchar_t	nszc = szc + 1;

	nbin = ADD_MASKED(bin,
	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);

	if (plw->plw_do_split) {
		plw->plw_bin_split_prev = bin;
		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
		plw->plw_do_split = 0;
	}

	if (szc == 0) {
		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			if (nbin == plw->plw_bin0 &&
			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
				    neq_mask, plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
			}

			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
				plw->plw_bin_marker =
				    nbin = INC_MASKED(nbin, neq_mask,
				    plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
				/*
				 * large pages all have the same vac color
				 * so by now we should be done with next
				 * size page splitting process
				 */
				ASSERT(plw->plw_bins[1] == 0);
				plw->plw_do_split = 0;
				return (nbin);
			}

		} else {
			uint_t bin_jump = (vac_colors == 1) ?
			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;

			bin_jump &= ~(vac_colors - 1);

			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
			    plw->plw_color_mask);

			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {

				plw->plw_bin_marker = nbin = nbin0;

				if (plw->plw_bins[nszc] != 0) {
					/*
					 * check if next page size bin is the
					 * same as the next page size bin for
					 * bin0
					 */
					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
					    nbin);
					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
					    plw->plw_bin0);

					if ((bin0_nsz ^ nbin_nsz) &
					    plw->plw_ceq_mask[nszc])
						plw->plw_do_split = 1;
				}
				return (nbin);
			}
		}
	}

	if (plw->plw_bins[nszc] != 0) {
		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
		if (!((plw->plw_split_next ^ nbin_nsz) &
		    plw->plw_ceq_mask[nszc]))
			plw->plw_do_split = 1;
	}

	return (nbin);
}

/*
 * Allocate a free page of size code 'szc' / color 'bin' from memory node
 * 'mnode', memory type 'mtype'.  Walks equivalent-color bins, then falls
 * back to splitting larger pages or coalescing smaller ones, and finally
 * cycles through additional mtypes if 'flags' permit.  Returns the page
 * locked SE_EXCL, or NULL if no page could be found.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}
try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {
			/* unlocked peek before taking the bin mutex */
			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			/*
			 * These were set before the page
			 * was put on the free list,
			 * they must still be set.
			 */
			ASSERT(PP_ISFREE(pp));
			ASSERT(PP_ISAGED(pp));
			ASSERT(pp->p_vnode == NULL);
			ASSERT(pp->p_hash == NULL);
			ASSERT(pp->p_offset == (u_offset_t)-1);
			ASSERT(pp->p_szc == szc);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

			/*
			 * Walk down the hash chain.
			 * 8k pages are linked on p_next
			 * and p_prev fields. Large pages
			 * are a contiguous group of
			 * constituent pages linked together
			 * on their p_next and p_prev fields.
			 * The large pages are linked together
			 * on the hash chain using p_vpnext
			 * p_vpprev of the base constituent
			 * page of each large page.
			 *
			 * Pages excluded by IS_DUMP_PAGE() are skipped
			 * as well as pages we fail to trylock.
			 */
			first_pp = pp;
			while (!page_trylock_cons(pp, SE_EXCL) ||
			    IS_DUMP_PAGE(pp)) {
				if (szc == 0) {
					pp = pp->p_next;
				} else {
					pp = pp->p_vpnext;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* wrapped around -- nothing lockable here */
				if (pp == first_pp)
					goto bin_empty_0;
			}

			ASSERT(pp != NULL);
			ASSERT(mtype == PP_2_MTYPE(pp));
			ASSERT(pp->p_szc == szc);
			if (szc == 0) {
				page_sub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			} else {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
				CHK_LPG(pp, szc);
			}
			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
				panic("free page is not. pp %p", (void *)pp);
			mutex_exit(pcm);

#if defined(__sparc)
			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
			    (flags & PG_NORELOC) == 0);

			if (PP_ISNORELOC(pp))
				kcage_freemem_sub(page_get_pagecnt(szc));
#endif
			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
			return (pp);

bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			/* lazily initialize the walker on the first miss */
			if (plw_initialized == 0) {
				page_list_walk_init(szc, flags, bin, 1, 1,
				    &plw);
				plw_initialized = 1;
				ASSERT(plw.plw_colors <=
				    PAGE_GET_PAGECOLORS(szc));
				ASSERT(plw.plw_colors > 0);
				ASSERT((plw.plw_colors &
				    (plw.plw_colors - 1)) == 0);
				ASSERT(bin < plw.plw_colors);
				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);

		/*
		 * color bins are all empty if color match. Try and
		 * satisfy the request by breaking up or coalescing
		 * pages from a different size freelist of the correct
		 * color that satisfies the ORIGINAL color requested.
		 * If that fails then try pages of the same size but
		 * different colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (plw.plw_do_split &&
		    (pp = page_freelist_split(szc, bin, mnode,
		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
			return (pp);

		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
			return (pp);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(szc, bin, &plw);
	}

	/* if allowed, cycle through additional mtypes */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}

/*
 * Returns the count of free pages for 'pp' with size code 'szc'.
 * Note: This function does not return an exact value as the page freelist
 * locks are not held and thus the values in the page_counters may be
 * changing as we walk through the data.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	/*
	 * NOTE(review): pgfree is a pgcnt_t returned through an int return
	 * type; a count exceeding INT_MAX would truncate -- confirm callers
	 * only use this as an approximate threshold.
	 */
	pgcnt_t	pgfree;
	pgcnt_t cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* Check for completely full region */
	if (cnt == range) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (pgfree);
	}

	/*
	 * Walk the smaller region sizes, adding in free pages not already
	 * counted as part of a full region at a larger size.
	 */
	while (--r > 0) {
		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
		full = FULL_REGION_CNT(r);
		for (i = 0; i < range; i++, idx++) {
			cnt = PAGE_COUNTERS(mnode, r, idx);
			/*
			 * If cnt here is full, that means we have already
			 * accounted for these pages earlier.
			 */
			if (cnt != full) {
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
		}
		range *= full;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}

/*
 * Called from page_geti_contig_pages to exclusively lock constituent pages
 * starting from 'spp' for page size code 'szc'.
 *
 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
 * region needs to be greater than or equal to the threshold.
 *
 * Returns 1 with all constituent pages locked SE_EXCL, or 0 with none
 * locked (any pages locked along the way are unlocked before returning).
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);


	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* attempt to trylock if there are sufficient already free pages */
	if (pgfree < pgcnt/ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			/* back out: unlock pages [0, i) in reverse order */
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock_nocapture(pp);
			return (0);
		}
		/* NORELOC (cage) pages cannot be claimed -- back out */
		if (PP_ISNORELOC(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				i--;
			}
			return (0);
		}
	}
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}

/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;
	page_t *pplist = NULL;

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
32257c478bd9Sstevel@tonic-gate */ 32267c478bd9Sstevel@tonic-gate if (PP_ISAGED(pp)) { 32277c478bd9Sstevel@tonic-gate page_list_sub_pages(pp, szc); 32287c478bd9Sstevel@tonic-gate if (pp->p_szc == szc) { 32297c478bd9Sstevel@tonic-gate return (pp); 32307c478bd9Sstevel@tonic-gate } 32317c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc < szc); 32327c478bd9Sstevel@tonic-gate npgs = page_get_pagecnt(pp->p_szc); 32337c478bd9Sstevel@tonic-gate hpp = pp; 32347c478bd9Sstevel@tonic-gate for (i = 0; i < npgs; i++, pp++) { 32357c478bd9Sstevel@tonic-gate pp->p_szc = szc; 32367c478bd9Sstevel@tonic-gate } 32377c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &hpp); 32387c478bd9Sstevel@tonic-gate pgcnt -= npgs; 32397c478bd9Sstevel@tonic-gate continue; 32407c478bd9Sstevel@tonic-gate } 32417c478bd9Sstevel@tonic-gate ASSERT(!PP_ISAGED(pp)); 32427c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 32437c478bd9Sstevel@tonic-gate page_list_sub(pp, PG_CACHE_LIST); 32447c478bd9Sstevel@tonic-gate page_hashout(pp, NULL); 32457c478bd9Sstevel@tonic-gate PP_SETAGED(pp); 32467c478bd9Sstevel@tonic-gate pp->p_szc = szc; 32477c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &pp); 32487c478bd9Sstevel@tonic-gate pp++; 32497c478bd9Sstevel@tonic-gate pgcnt--; 32507c478bd9Sstevel@tonic-gate continue; 32517c478bd9Sstevel@tonic-gate } 32527c478bd9Sstevel@tonic-gate npgs = page_get_pagecnt(pp->p_szc); 32537c478bd9Sstevel@tonic-gate 32547c478bd9Sstevel@tonic-gate /* 32557c478bd9Sstevel@tonic-gate * page_create_wait freemem accounting done by caller of 32567c478bd9Sstevel@tonic-gate * page_get_freelist and not necessary to call it prior to 32577c478bd9Sstevel@tonic-gate * calling page_get_replacement_page. 
32587c478bd9Sstevel@tonic-gate * 32597c478bd9Sstevel@tonic-gate * page_get_replacement_page can call page_get_contig_pages 32607c478bd9Sstevel@tonic-gate * to acquire a large page (szc > 0); the replacement must be 32617c478bd9Sstevel@tonic-gate * smaller than the contig page size to avoid looping or 32627c478bd9Sstevel@tonic-gate * szc == 0 and PGI_PGCPSZC0 is set. 32637c478bd9Sstevel@tonic-gate */ 32647c478bd9Sstevel@tonic-gate if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 32657c478bd9Sstevel@tonic-gate replpp = page_get_replacement_page(pp, NULL, 0); 32667c478bd9Sstevel@tonic-gate if (replpp) { 32677c478bd9Sstevel@tonic-gate npgs = page_get_pagecnt(pp->p_szc); 32687c478bd9Sstevel@tonic-gate ASSERT(npgs <= pgcnt); 32697c478bd9Sstevel@tonic-gate targpp = pp; 32707c478bd9Sstevel@tonic-gate } 32717c478bd9Sstevel@tonic-gate } 32727c478bd9Sstevel@tonic-gate 32737c478bd9Sstevel@tonic-gate /* 32747c478bd9Sstevel@tonic-gate * If replacement is NULL or do_page_relocate fails, fail 32757c478bd9Sstevel@tonic-gate * coalescing of pages. 32767c478bd9Sstevel@tonic-gate */ 32777c478bd9Sstevel@tonic-gate if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 32787c478bd9Sstevel@tonic-gate &npgs, NULL) != 0)) { 32797c478bd9Sstevel@tonic-gate /* 32807c478bd9Sstevel@tonic-gate * Unlock un-processed target list 32817c478bd9Sstevel@tonic-gate */ 32827c478bd9Sstevel@tonic-gate while (pgcnt--) { 32837c478bd9Sstevel@tonic-gate ASSERT(PAGE_EXCL(pp)); 32848b464eb8Smec page_unlock_nocapture(pp); 32857c478bd9Sstevel@tonic-gate pp++; 32867c478bd9Sstevel@tonic-gate } 32877c478bd9Sstevel@tonic-gate /* 32887c478bd9Sstevel@tonic-gate * Free the processed target list. 
32897c478bd9Sstevel@tonic-gate */ 32907c478bd9Sstevel@tonic-gate while (pplist) { 32917c478bd9Sstevel@tonic-gate pp = pplist; 32927c478bd9Sstevel@tonic-gate page_sub(&pplist, pp); 32937c478bd9Sstevel@tonic-gate ASSERT(PAGE_EXCL(pp)); 32947c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == szc); 32957c478bd9Sstevel@tonic-gate ASSERT(PP_ISFREE(pp)); 32967c478bd9Sstevel@tonic-gate ASSERT(PP_ISAGED(pp)); 32977c478bd9Sstevel@tonic-gate pp->p_szc = 0; 32987c478bd9Sstevel@tonic-gate page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 32998b464eb8Smec page_unlock_nocapture(pp); 33007c478bd9Sstevel@tonic-gate } 33017c478bd9Sstevel@tonic-gate 33027c478bd9Sstevel@tonic-gate if (replpp != NULL) 33037c478bd9Sstevel@tonic-gate page_free_replacement_page(replpp); 33047c478bd9Sstevel@tonic-gate 33057c478bd9Sstevel@tonic-gate return (NULL); 33067c478bd9Sstevel@tonic-gate } 33077c478bd9Sstevel@tonic-gate ASSERT(pp == targpp); 33087c478bd9Sstevel@tonic-gate 33097c478bd9Sstevel@tonic-gate /* LINTED */ 33107c478bd9Sstevel@tonic-gate ASSERT(hpp = pp); /* That's right, it's an assignment */ 33117c478bd9Sstevel@tonic-gate 33127c478bd9Sstevel@tonic-gate pp += npgs; 33137c478bd9Sstevel@tonic-gate pgcnt -= npgs; 33147c478bd9Sstevel@tonic-gate 33157c478bd9Sstevel@tonic-gate while (npgs--) { 33167c478bd9Sstevel@tonic-gate ASSERT(PAGE_EXCL(targpp)); 33177c478bd9Sstevel@tonic-gate ASSERT(!PP_ISFREE(targpp)); 33187c478bd9Sstevel@tonic-gate ASSERT(!PP_ISNORELOC(targpp)); 33197c478bd9Sstevel@tonic-gate PP_SETFREE(targpp); 33207c478bd9Sstevel@tonic-gate ASSERT(PP_ISAGED(targpp)); 33217c478bd9Sstevel@tonic-gate ASSERT(targpp->p_szc < szc || (szc == 0 && 33227c478bd9Sstevel@tonic-gate (flags & PGI_PGCPSZC0))); 33237c478bd9Sstevel@tonic-gate targpp->p_szc = szc; 33247c478bd9Sstevel@tonic-gate targpp = targpp->p_next; 33257c478bd9Sstevel@tonic-gate 33267c478bd9Sstevel@tonic-gate rpp = replpp; 33277c478bd9Sstevel@tonic-gate ASSERT(rpp != NULL); 33287c478bd9Sstevel@tonic-gate page_sub(&replpp, rpp); 
33297c478bd9Sstevel@tonic-gate ASSERT(PAGE_EXCL(rpp)); 33307c478bd9Sstevel@tonic-gate ASSERT(!PP_ISFREE(rpp)); 33318b464eb8Smec page_unlock_nocapture(rpp); 33327c478bd9Sstevel@tonic-gate } 33337c478bd9Sstevel@tonic-gate ASSERT(targpp == hpp); 33347c478bd9Sstevel@tonic-gate ASSERT(replpp == NULL); 33357c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &targpp); 33367c478bd9Sstevel@tonic-gate } 33377c478bd9Sstevel@tonic-gate CHK_LPG(pplist, szc); 33387c478bd9Sstevel@tonic-gate return (pplist); 33397c478bd9Sstevel@tonic-gate } 33407c478bd9Sstevel@tonic-gate 33417c478bd9Sstevel@tonic-gate /* 33427c478bd9Sstevel@tonic-gate * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 33437c478bd9Sstevel@tonic-gate * of 0 means nothing left after trim. 33447c478bd9Sstevel@tonic-gate */ 33457c478bd9Sstevel@tonic-gate int 33467c478bd9Sstevel@tonic-gate trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 33477c478bd9Sstevel@tonic-gate { 33487c478bd9Sstevel@tonic-gate pfn_t kcagepfn; 33497c478bd9Sstevel@tonic-gate int decr; 33507c478bd9Sstevel@tonic-gate int rc = 0; 33517c478bd9Sstevel@tonic-gate 33527c478bd9Sstevel@tonic-gate if (PP_ISNORELOC(mseg->pages)) { 33537c478bd9Sstevel@tonic-gate if (PP_ISNORELOC(mseg->epages - 1) == 0) { 33547c478bd9Sstevel@tonic-gate 33557c478bd9Sstevel@tonic-gate /* lower part of this mseg inside kernel cage */ 33567c478bd9Sstevel@tonic-gate decr = kcage_current_pfn(&kcagepfn); 33577c478bd9Sstevel@tonic-gate 33587c478bd9Sstevel@tonic-gate /* kernel cage may have transitioned past mseg */ 33597c478bd9Sstevel@tonic-gate if (kcagepfn >= mseg->pages_base && 33607c478bd9Sstevel@tonic-gate kcagepfn < mseg->pages_end) { 33617c478bd9Sstevel@tonic-gate ASSERT(decr == 0); 336278b03d3aSkchow *lo = MAX(kcagepfn, pfnlo); 336378b03d3aSkchow *hi = MIN(pfnhi, (mseg->pages_end - 1)); 33647c478bd9Sstevel@tonic-gate rc = 1; 33657c478bd9Sstevel@tonic-gate } 33667c478bd9Sstevel@tonic-gate } 33677c478bd9Sstevel@tonic-gate /* 
else entire mseg in the cage */ 33687c478bd9Sstevel@tonic-gate } else { 33697c478bd9Sstevel@tonic-gate if (PP_ISNORELOC(mseg->epages - 1)) { 33707c478bd9Sstevel@tonic-gate 33717c478bd9Sstevel@tonic-gate /* upper part of this mseg inside kernel cage */ 33727c478bd9Sstevel@tonic-gate decr = kcage_current_pfn(&kcagepfn); 33737c478bd9Sstevel@tonic-gate 33747c478bd9Sstevel@tonic-gate /* kernel cage may have transitioned past mseg */ 33757c478bd9Sstevel@tonic-gate if (kcagepfn >= mseg->pages_base && 33767c478bd9Sstevel@tonic-gate kcagepfn < mseg->pages_end) { 33777c478bd9Sstevel@tonic-gate ASSERT(decr); 337878b03d3aSkchow *hi = MIN(kcagepfn, pfnhi); 33797c478bd9Sstevel@tonic-gate *lo = MAX(pfnlo, mseg->pages_base); 33807c478bd9Sstevel@tonic-gate rc = 1; 33817c478bd9Sstevel@tonic-gate } 33827c478bd9Sstevel@tonic-gate } else { 33837c478bd9Sstevel@tonic-gate /* entire mseg outside of kernel cage */ 33847c478bd9Sstevel@tonic-gate *lo = MAX(pfnlo, mseg->pages_base); 33857c478bd9Sstevel@tonic-gate *hi = MIN(pfnhi, (mseg->pages_end - 1)); 33867c478bd9Sstevel@tonic-gate rc = 1; 33877c478bd9Sstevel@tonic-gate } 33887c478bd9Sstevel@tonic-gate } 33897c478bd9Sstevel@tonic-gate return (rc); 33907c478bd9Sstevel@tonic-gate } 33917c478bd9Sstevel@tonic-gate 33927c478bd9Sstevel@tonic-gate /* 33935d07b933Sdp * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 33947c478bd9Sstevel@tonic-gate * page with size code 'szc'. Claiming such a page requires acquiring 33957c478bd9Sstevel@tonic-gate * exclusive locks on all constituent pages (page_trylock_contig_pages), 33967c478bd9Sstevel@tonic-gate * relocating pages in use and concatenating these constituent pages into a 33977c478bd9Sstevel@tonic-gate * large page. 33987c478bd9Sstevel@tonic-gate * 33995d07b933Sdp * The page lists do not have such a large page and page_freelist_split has 34007c478bd9Sstevel@tonic-gate * already failed to demote larger pages and/or coalesce smaller free pages. 
34017c478bd9Sstevel@tonic-gate * 34027c478bd9Sstevel@tonic-gate * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 34037c478bd9Sstevel@tonic-gate * pages with the same color as 'bin'. 34047c478bd9Sstevel@tonic-gate * 34057c478bd9Sstevel@tonic-gate * 'pfnflag' specifies the subset of the pfn range to search. 34067c478bd9Sstevel@tonic-gate */ 34077c478bd9Sstevel@tonic-gate 34087c478bd9Sstevel@tonic-gate static page_t * 34097c478bd9Sstevel@tonic-gate page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 341083f9b804Skchow pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 34117c478bd9Sstevel@tonic-gate { 34127c478bd9Sstevel@tonic-gate struct memseg *mseg; 34137c478bd9Sstevel@tonic-gate pgcnt_t szcpgcnt = page_get_pagecnt(szc); 34147c478bd9Sstevel@tonic-gate pgcnt_t szcpgmask = szcpgcnt - 1; 34157c478bd9Sstevel@tonic-gate pfn_t randpfn; 34167c478bd9Sstevel@tonic-gate page_t *pp, *randpp, *endpp; 34175d07b933Sdp uint_t colors, ceq_mask; 34185d07b933Sdp /* LINTED : set but not used in function */ 34195d07b933Sdp uint_t color_mask; 34207c478bd9Sstevel@tonic-gate pfn_t hi, lo; 34217c478bd9Sstevel@tonic-gate uint_t skip; 3422ce8eb11aSdp MEM_NODE_ITERATOR_DECL(it); 34237c478bd9Sstevel@tonic-gate 34247c478bd9Sstevel@tonic-gate ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 34257c478bd9Sstevel@tonic-gate 342602bc52beSkchow pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 342702bc52beSkchow 342802bc52beSkchow if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 34297c478bd9Sstevel@tonic-gate return (NULL); 34307c478bd9Sstevel@tonic-gate 34317c478bd9Sstevel@tonic-gate ASSERT(szc < mmu_page_sizes); 34327c478bd9Sstevel@tonic-gate 34335d07b933Sdp colors = PAGE_GET_PAGECOLORS(szc); 34345d07b933Sdp color_mask = colors - 1; 34355d07b933Sdp if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 34365d07b933Sdp uchar_t ceq = colorequivszc[szc]; 34375d07b933Sdp uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 34385d07b933Sdp 34395d07b933Sdp ASSERT(ceq_dif > 0); 
34405d07b933Sdp ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 34415d07b933Sdp } else { 34425d07b933Sdp ceq_mask = 0; 34435d07b933Sdp } 34447c478bd9Sstevel@tonic-gate 34457c478bd9Sstevel@tonic-gate ASSERT(bin < colors); 34467c478bd9Sstevel@tonic-gate 34475d07b933Sdp /* clear "non-significant" color bits */ 34485d07b933Sdp bin &= ceq_mask; 34495d07b933Sdp 34507c478bd9Sstevel@tonic-gate /* 34517c478bd9Sstevel@tonic-gate * trim the pfn range to search based on pfnflag. pfnflag is set 34527c478bd9Sstevel@tonic-gate * when there have been previous page_get_contig_page failures to 34537c478bd9Sstevel@tonic-gate * limit the search. 34547c478bd9Sstevel@tonic-gate * 34557c478bd9Sstevel@tonic-gate * The high bit in pfnflag specifies the number of 'slots' in the 34567c478bd9Sstevel@tonic-gate * pfn range and the remainder of pfnflag specifies which slot. 34577c478bd9Sstevel@tonic-gate * For example, a value of 1010b would mean the second slot of 34587c478bd9Sstevel@tonic-gate * the pfn range that has been divided into 8 slots. 
34597c478bd9Sstevel@tonic-gate */ 34607c478bd9Sstevel@tonic-gate if (pfnflag > 1) { 34617c478bd9Sstevel@tonic-gate int slots = 1 << (highbit(pfnflag) - 1); 34627c478bd9Sstevel@tonic-gate int slotid = pfnflag & (slots - 1); 34637c478bd9Sstevel@tonic-gate pgcnt_t szcpages; 34647c478bd9Sstevel@tonic-gate int slotlen; 34657c478bd9Sstevel@tonic-gate 346602bc52beSkchow pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 34677c478bd9Sstevel@tonic-gate szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 34687c478bd9Sstevel@tonic-gate slotlen = howmany(szcpages, slots); 346902bc52beSkchow /* skip if 'slotid' slot is empty */ 347002bc52beSkchow if (slotid * slotlen >= szcpages) 347102bc52beSkchow return (NULL); 34727c478bd9Sstevel@tonic-gate pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 34737c478bd9Sstevel@tonic-gate ASSERT(pfnlo < pfnhi); 34747c478bd9Sstevel@tonic-gate if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 347502bc52beSkchow pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 34767c478bd9Sstevel@tonic-gate } 34777c478bd9Sstevel@tonic-gate 3478af4c679fSSean McEnroe /* 3479af4c679fSSean McEnroe * This routine is can be called recursively so we shouldn't 3480af4c679fSSean McEnroe * acquire a reader lock if a write request is pending. This 3481af4c679fSSean McEnroe * could lead to a deadlock with the DR thread. 3482af4c679fSSean McEnroe * 3483af4c679fSSean McEnroe * Returning NULL informs the caller that we could not get 3484af4c679fSSean McEnroe * a contig page with the required characteristics. 
3485af4c679fSSean McEnroe */ 3486af4c679fSSean McEnroe 3487af4c679fSSean McEnroe if (!memsegs_trylock(0)) 3488af4c679fSSean McEnroe return (NULL); 34897c478bd9Sstevel@tonic-gate 34907c478bd9Sstevel@tonic-gate /* 34917c478bd9Sstevel@tonic-gate * loop through memsegs to look for contig page candidates 34927c478bd9Sstevel@tonic-gate */ 34937c478bd9Sstevel@tonic-gate 34947c478bd9Sstevel@tonic-gate for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 34957c478bd9Sstevel@tonic-gate if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 34967c478bd9Sstevel@tonic-gate /* no overlap */ 34977c478bd9Sstevel@tonic-gate continue; 34987c478bd9Sstevel@tonic-gate } 34997c478bd9Sstevel@tonic-gate 35007c478bd9Sstevel@tonic-gate if (mseg->pages_end - mseg->pages_base < szcpgcnt) 35017c478bd9Sstevel@tonic-gate /* mseg too small */ 35027c478bd9Sstevel@tonic-gate continue; 35037c478bd9Sstevel@tonic-gate 350478b03d3aSkchow /* 350578b03d3aSkchow * trim off kernel cage pages from pfn range and check for 350678b03d3aSkchow * a trimmed pfn range returned that does not span the 350778b03d3aSkchow * desired large page size. 
350878b03d3aSkchow */ 35097c478bd9Sstevel@tonic-gate if (kcage_on) { 351078b03d3aSkchow if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3511d94d1888Skchow lo >= hi || ((hi - lo) + 1) < szcpgcnt) 35127c478bd9Sstevel@tonic-gate continue; 35137c478bd9Sstevel@tonic-gate } else { 35147c478bd9Sstevel@tonic-gate lo = MAX(pfnlo, mseg->pages_base); 35157c478bd9Sstevel@tonic-gate hi = MIN(pfnhi, (mseg->pages_end - 1)); 35167c478bd9Sstevel@tonic-gate } 35177c478bd9Sstevel@tonic-gate 35187c478bd9Sstevel@tonic-gate /* round to szcpgcnt boundaries */ 35197c478bd9Sstevel@tonic-gate lo = P2ROUNDUP(lo, szcpgcnt); 352002bc52beSkchow 3521b779d3e0Sdp MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 352202bc52beSkchow hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 35237c478bd9Sstevel@tonic-gate 35247c478bd9Sstevel@tonic-gate if (hi <= lo) 35257c478bd9Sstevel@tonic-gate continue; 35267c478bd9Sstevel@tonic-gate 35277c478bd9Sstevel@tonic-gate /* 35287c478bd9Sstevel@tonic-gate * set lo to point to the pfn for the desired bin. 
Large 35297c478bd9Sstevel@tonic-gate * page sizes may only have a single page color 35307c478bd9Sstevel@tonic-gate */ 35315d07b933Sdp skip = szcpgcnt; 3532ce8eb11aSdp if (ceq_mask > 0 || interleaved_mnodes) { 35335d07b933Sdp /* set lo to point at appropriate color */ 3534ce8eb11aSdp if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3535ce8eb11aSdp (interleaved_mnodes && 3536ce8eb11aSdp PFN_2_MEM_NODE(lo) != mnode)) { 3537ce8eb11aSdp PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3538ce8eb11aSdp color_mask, &it); 3539ce8eb11aSdp } 35405d07b933Sdp if (hi <= lo) 35415d07b933Sdp /* mseg cannot satisfy color request */ 35425d07b933Sdp continue; 35437c478bd9Sstevel@tonic-gate } 35447c478bd9Sstevel@tonic-gate 35457c478bd9Sstevel@tonic-gate /* randomly choose a point between lo and hi to begin search */ 35467c478bd9Sstevel@tonic-gate 35477c478bd9Sstevel@tonic-gate randpfn = (pfn_t)GETTICK(); 35487c478bd9Sstevel@tonic-gate randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3549b779d3e0Sdp MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3550b779d3e0Sdp if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3551a7c3ca36Sdp if (randpfn != (pfn_t)-1) { 3552ce8eb11aSdp PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3553ce8eb11aSdp ceq_mask, color_mask, &it); 3554a7c3ca36Sdp } 3555ce8eb11aSdp if (randpfn >= hi) { 3556ce8eb11aSdp randpfn = lo; 3557b779d3e0Sdp MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3558b779d3e0Sdp &it); 3559ce8eb11aSdp } 35605d07b933Sdp } 35617c478bd9Sstevel@tonic-gate randpp = mseg->pages + (randpfn - mseg->pages_base); 35627c478bd9Sstevel@tonic-gate 35637c478bd9Sstevel@tonic-gate ASSERT(randpp->p_pagenum == randpfn); 35647c478bd9Sstevel@tonic-gate 35657c478bd9Sstevel@tonic-gate pp = randpp; 356602bc52beSkchow endpp = mseg->pages + (hi - mseg->pages_base) + 1; 35677c478bd9Sstevel@tonic-gate 35687c478bd9Sstevel@tonic-gate ASSERT(randpp + szcpgcnt <= endpp); 35697c478bd9Sstevel@tonic-gate 35707c478bd9Sstevel@tonic-gate do { 
35717c478bd9Sstevel@tonic-gate ASSERT(!(pp->p_pagenum & szcpgmask)); 35725d07b933Sdp ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 35735d07b933Sdp 35747c478bd9Sstevel@tonic-gate if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 35757c478bd9Sstevel@tonic-gate /* pages unlocked by page_claim on failure */ 35767c478bd9Sstevel@tonic-gate if (page_claim_contig_pages(pp, szc, flags)) { 35777c478bd9Sstevel@tonic-gate memsegs_unlock(0); 35787c478bd9Sstevel@tonic-gate return (pp); 35797c478bd9Sstevel@tonic-gate } 35807c478bd9Sstevel@tonic-gate } 35817c478bd9Sstevel@tonic-gate 3582ce8eb11aSdp if (ceq_mask == 0 && !interleaved_mnodes) { 35835d07b933Sdp pp += skip; 35845d07b933Sdp } else { 35855d07b933Sdp pfn_t pfn = pp->p_pagenum; 35865d07b933Sdp 35875d07b933Sdp PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3588ce8eb11aSdp ceq_mask, color_mask, &it); 3589ce8eb11aSdp if (pfn == (pfn_t)-1) { 3590ce8eb11aSdp pp = endpp; 3591ce8eb11aSdp } else { 3592ce8eb11aSdp pp = mseg->pages + 3593ce8eb11aSdp (pfn - mseg->pages_base); 3594ce8eb11aSdp } 35955d07b933Sdp } 35967c478bd9Sstevel@tonic-gate if (pp >= endpp) { 35977c478bd9Sstevel@tonic-gate /* start from the beginning */ 3598b779d3e0Sdp MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 35997c478bd9Sstevel@tonic-gate pp = mseg->pages + (lo - mseg->pages_base); 36007c478bd9Sstevel@tonic-gate ASSERT(pp->p_pagenum == lo); 36017c478bd9Sstevel@tonic-gate ASSERT(pp + szcpgcnt <= endpp); 36027c478bd9Sstevel@tonic-gate } 36037c478bd9Sstevel@tonic-gate } while (pp != randpp); 36047c478bd9Sstevel@tonic-gate } 36057c478bd9Sstevel@tonic-gate memsegs_unlock(0); 36067c478bd9Sstevel@tonic-gate return (NULL); 36077c478bd9Sstevel@tonic-gate } 36087c478bd9Sstevel@tonic-gate 36097c478bd9Sstevel@tonic-gate 36107c478bd9Sstevel@tonic-gate /* 36117c478bd9Sstevel@tonic-gate * controlling routine that searches through physical memory in an attempt to 36127c478bd9Sstevel@tonic-gate * claim a large page based on the input parameters. 
36137c478bd9Sstevel@tonic-gate * on the page free lists. 36147c478bd9Sstevel@tonic-gate * 36157c478bd9Sstevel@tonic-gate * calls page_geti_contig_pages with an initial pfn range from the mnode 36167c478bd9Sstevel@tonic-gate * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 36177c478bd9Sstevel@tonic-gate * that overlaps with the kernel cage or does not match the requested page 36187c478bd9Sstevel@tonic-gate * color if PG_MATCH_COLOR is set. Since this search is very expensive, 36197c478bd9Sstevel@tonic-gate * page_geti_contig_pages may further limit the search range based on 36207c478bd9Sstevel@tonic-gate * previous failure counts (pgcpfailcnt[]). 36217c478bd9Sstevel@tonic-gate * 36227c478bd9Sstevel@tonic-gate * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 36237c478bd9Sstevel@tonic-gate * pagesize page that satisfies mtype. 36247c478bd9Sstevel@tonic-gate */ 36257c478bd9Sstevel@tonic-gate page_t * 36267c478bd9Sstevel@tonic-gate page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 36277c478bd9Sstevel@tonic-gate uint_t flags) 36287c478bd9Sstevel@tonic-gate { 36297c478bd9Sstevel@tonic-gate pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 36307c478bd9Sstevel@tonic-gate page_t *pp; 363183f9b804Skchow pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 36327c478bd9Sstevel@tonic-gate 36337c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 36347c478bd9Sstevel@tonic-gate 36350b5aa17bSmec /* no allocations from cage */ 36360b5aa17bSmec flags |= PGI_NOCAGE; 36370b5aa17bSmec 36387c478bd9Sstevel@tonic-gate /* LINTED */ 36397c478bd9Sstevel@tonic-gate MTYPE_START(mnode, mtype, flags); 36407c478bd9Sstevel@tonic-gate if (mtype < 0) { /* mnode does not have memory in mtype range */ 36417c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 36427c478bd9Sstevel@tonic-gate return (NULL); 36437c478bd9Sstevel@tonic-gate } 36447c478bd9Sstevel@tonic-gate 36457c478bd9Sstevel@tonic-gate 
ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 36467c478bd9Sstevel@tonic-gate 36477c478bd9Sstevel@tonic-gate /* do not limit search and ignore color if hi pri */ 36487c478bd9Sstevel@tonic-gate 36497c478bd9Sstevel@tonic-gate if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 36507c478bd9Sstevel@tonic-gate pfnflag = pgcpfailcnt[szc]; 36517c478bd9Sstevel@tonic-gate 36527c478bd9Sstevel@tonic-gate /* remove color match to improve chances */ 36537c478bd9Sstevel@tonic-gate 36547c478bd9Sstevel@tonic-gate if (flags & PGI_PGCPHIPRI || pfnflag) 36557c478bd9Sstevel@tonic-gate flags &= ~PG_MATCH_COLOR; 36567c478bd9Sstevel@tonic-gate 36577c478bd9Sstevel@tonic-gate do { 36587c478bd9Sstevel@tonic-gate /* get pfn range based on mnode and mtype */ 36597c478bd9Sstevel@tonic-gate MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 36607c478bd9Sstevel@tonic-gate 36617c478bd9Sstevel@tonic-gate ASSERT(pfnhi >= pfnlo); 36627c478bd9Sstevel@tonic-gate 36637c478bd9Sstevel@tonic-gate pp = page_geti_contig_pages(mnode, bin, szc, flags, 36647c478bd9Sstevel@tonic-gate pfnlo, pfnhi, pfnflag); 36657c478bd9Sstevel@tonic-gate 36667c478bd9Sstevel@tonic-gate if (pp != NULL) { 36677c478bd9Sstevel@tonic-gate pfnflag = pgcpfailcnt[szc]; 36687c478bd9Sstevel@tonic-gate if (pfnflag) { 36697c478bd9Sstevel@tonic-gate /* double the search size */ 36707c478bd9Sstevel@tonic-gate pgcpfailcnt[szc] = pfnflag >> 1; 36717c478bd9Sstevel@tonic-gate } 36727c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 36737c478bd9Sstevel@tonic-gate return (pp); 36747c478bd9Sstevel@tonic-gate } 3675affbd3ccSkchow MTYPE_NEXT(mnode, mtype, flags); 3676affbd3ccSkchow } while (mtype >= 0); 36777c478bd9Sstevel@tonic-gate 36787c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 36797c478bd9Sstevel@tonic-gate return (NULL); 36807c478bd9Sstevel@tonic-gate } 36817c478bd9Sstevel@tonic-gate 368278b03d3aSkchow #if defined(__i386) || defined(__amd64) 368378b03d3aSkchow /* 368478b03d3aSkchow * Determine the 
likelihood of finding/coalescing a szc page. 368578b03d3aSkchow * Return 0 if the likelihood is small otherwise return 1. 368678b03d3aSkchow * 368778b03d3aSkchow * For now, be conservative and check only 1g pages and return 0 368878b03d3aSkchow * if there had been previous coalescing failures and the szc pages 368978b03d3aSkchow * needed to satisfy request would exhaust most of freemem. 369078b03d3aSkchow */ 369178b03d3aSkchow int 369278b03d3aSkchow page_chk_freelist(uint_t szc) 369378b03d3aSkchow { 369478b03d3aSkchow pgcnt_t pgcnt; 369578b03d3aSkchow 369678b03d3aSkchow if (szc <= 1) 369778b03d3aSkchow return (1); 369878b03d3aSkchow 369978b03d3aSkchow pgcnt = page_get_pagecnt(szc); 370078b03d3aSkchow if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 370178b03d3aSkchow VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 370278b03d3aSkchow return (0); 370378b03d3aSkchow } 370478b03d3aSkchow VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 370578b03d3aSkchow return (1); 370678b03d3aSkchow } 370778b03d3aSkchow #endif 37087c478bd9Sstevel@tonic-gate 37097c478bd9Sstevel@tonic-gate /* 37107c478bd9Sstevel@tonic-gate * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 37117c478bd9Sstevel@tonic-gate * 37127c478bd9Sstevel@tonic-gate * Does its own locking and accounting. 37137c478bd9Sstevel@tonic-gate * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 37147c478bd9Sstevel@tonic-gate * pages of the proper color even if there are pages of a different color. 37157c478bd9Sstevel@tonic-gate * 37167c478bd9Sstevel@tonic-gate * Finds a page, removes it, THEN locks it. 
37177c478bd9Sstevel@tonic-gate */ 37187c478bd9Sstevel@tonic-gate 37197c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 37207c478bd9Sstevel@tonic-gate page_t * 37217c478bd9Sstevel@tonic-gate page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 37227c478bd9Sstevel@tonic-gate caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 37237c478bd9Sstevel@tonic-gate { 37247c478bd9Sstevel@tonic-gate struct as *as = seg->s_as; 37257c478bd9Sstevel@tonic-gate page_t *pp = NULL; 37267c478bd9Sstevel@tonic-gate ulong_t bin; 37277c478bd9Sstevel@tonic-gate uchar_t szc; 37287c478bd9Sstevel@tonic-gate int mnode; 37297c478bd9Sstevel@tonic-gate int mtype; 37307c478bd9Sstevel@tonic-gate page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 37317c478bd9Sstevel@tonic-gate lgrp_mnode_cookie_t lgrp_cookie; 37327c478bd9Sstevel@tonic-gate 37337c478bd9Sstevel@tonic-gate page_get_func = page_get_mnode_freelist; 37347c478bd9Sstevel@tonic-gate 37357c478bd9Sstevel@tonic-gate /* 37367c478bd9Sstevel@tonic-gate * If we aren't passed a specific lgroup, or passed a freed lgrp 37377c478bd9Sstevel@tonic-gate * assume we wish to allocate near to the current thread's home. 37387c478bd9Sstevel@tonic-gate */ 37397c478bd9Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp)) 37407c478bd9Sstevel@tonic-gate lgrp = lgrp_home_lgrp(); 37417c478bd9Sstevel@tonic-gate 37427c478bd9Sstevel@tonic-gate if (kcage_on) { 37437c478bd9Sstevel@tonic-gate if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 37447c478bd9Sstevel@tonic-gate kcage_freemem < kcage_throttlefree + btop(size) && 37457c478bd9Sstevel@tonic-gate curthread != kcage_cageout_thread) { 37467c478bd9Sstevel@tonic-gate /* 37477c478bd9Sstevel@tonic-gate * Set a "reserve" of kcage_throttlefree pages for 37487c478bd9Sstevel@tonic-gate * PG_PANIC and cageout thread allocations. 
37497c478bd9Sstevel@tonic-gate * 37507c478bd9Sstevel@tonic-gate * Everybody else has to serialize in 37517c478bd9Sstevel@tonic-gate * page_create_get_something() to get a cage page, so 37527c478bd9Sstevel@tonic-gate * that we don't deadlock cageout! 37537c478bd9Sstevel@tonic-gate */ 37547c478bd9Sstevel@tonic-gate return (NULL); 37557c478bd9Sstevel@tonic-gate } 37567c478bd9Sstevel@tonic-gate } else { 37577c478bd9Sstevel@tonic-gate flags &= ~PG_NORELOC; 37587c478bd9Sstevel@tonic-gate flags |= PGI_NOCAGE; 37597c478bd9Sstevel@tonic-gate } 37607c478bd9Sstevel@tonic-gate 37617c478bd9Sstevel@tonic-gate /* LINTED */ 376207ad560dSkchow MTYPE_INIT(mtype, vp, vaddr, flags, size); 37637c478bd9Sstevel@tonic-gate 37647c478bd9Sstevel@tonic-gate /* 37657c478bd9Sstevel@tonic-gate * Convert size to page size code. 37667c478bd9Sstevel@tonic-gate */ 37677c478bd9Sstevel@tonic-gate if ((szc = page_szc(size)) == (uchar_t)-1) 37687c478bd9Sstevel@tonic-gate panic("page_get_freelist: illegal page size request"); 37697c478bd9Sstevel@tonic-gate ASSERT(szc < mmu_page_sizes); 37707c478bd9Sstevel@tonic-gate 37717c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 37727c478bd9Sstevel@tonic-gate 37737c478bd9Sstevel@tonic-gate /* LINTED */ 37745d07b933Sdp AS_2_BIN(as, seg, vp, vaddr, bin, szc); 37757c478bd9Sstevel@tonic-gate 37765d07b933Sdp ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 37777c478bd9Sstevel@tonic-gate 37787c478bd9Sstevel@tonic-gate /* 37797c478bd9Sstevel@tonic-gate * Try to get a local page first, but try remote if we can't 37807c478bd9Sstevel@tonic-gate * get a page of the right color. 
37817c478bd9Sstevel@tonic-gate */ 37827c478bd9Sstevel@tonic-gate pgretry: 37837c478bd9Sstevel@tonic-gate LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 37847c478bd9Sstevel@tonic-gate while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 37857c478bd9Sstevel@tonic-gate pp = page_get_func(mnode, bin, mtype, szc, flags); 37867c478bd9Sstevel@tonic-gate if (pp != NULL) { 37877c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 37887c478bd9Sstevel@tonic-gate DTRACE_PROBE4(page__get, 37897c478bd9Sstevel@tonic-gate lgrp_t *, lgrp, 37907c478bd9Sstevel@tonic-gate int, mnode, 37917c478bd9Sstevel@tonic-gate ulong_t, bin, 37927c478bd9Sstevel@tonic-gate uint_t, flags); 37937c478bd9Sstevel@tonic-gate return (pp); 37947c478bd9Sstevel@tonic-gate } 37957c478bd9Sstevel@tonic-gate } 37967c478bd9Sstevel@tonic-gate ASSERT(pp == NULL); 37977c478bd9Sstevel@tonic-gate 37987c478bd9Sstevel@tonic-gate /* 37997c478bd9Sstevel@tonic-gate * for non-SZC0 PAGESIZE requests, check cachelist before checking 38007c478bd9Sstevel@tonic-gate * remote free lists. Caller expected to call page_get_cachelist which 38017c478bd9Sstevel@tonic-gate * will check local cache lists and remote free lists. 38027c478bd9Sstevel@tonic-gate */ 38037c478bd9Sstevel@tonic-gate if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 38047c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 38057c478bd9Sstevel@tonic-gate return (NULL); 38067c478bd9Sstevel@tonic-gate } 38077c478bd9Sstevel@tonic-gate 38087c478bd9Sstevel@tonic-gate ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 38097c478bd9Sstevel@tonic-gate 38107c478bd9Sstevel@tonic-gate lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 38117c478bd9Sstevel@tonic-gate 38122cb27123Saguzovsk if (!(flags & PG_LOCAL)) { 38132cb27123Saguzovsk /* 38142cb27123Saguzovsk * Try to get a non-local freelist page. 
38152cb27123Saguzovsk */ 38162cb27123Saguzovsk LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 38172cb27123Saguzovsk while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 38182cb27123Saguzovsk pp = page_get_func(mnode, bin, mtype, szc, flags); 38192cb27123Saguzovsk if (pp != NULL) { 38202cb27123Saguzovsk DTRACE_PROBE4(page__get, 38212cb27123Saguzovsk lgrp_t *, lgrp, 38222cb27123Saguzovsk int, mnode, 38232cb27123Saguzovsk ulong_t, bin, 38242cb27123Saguzovsk uint_t, flags); 38252cb27123Saguzovsk VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 38262cb27123Saguzovsk return (pp); 38272cb27123Saguzovsk } 38287c478bd9Sstevel@tonic-gate } 38292cb27123Saguzovsk ASSERT(pp == NULL); 38307c478bd9Sstevel@tonic-gate } 38317c478bd9Sstevel@tonic-gate 38327c478bd9Sstevel@tonic-gate /* 38337c478bd9Sstevel@tonic-gate * when the cage is off chances are page_get_contig_pages() will fail 38347c478bd9Sstevel@tonic-gate * to lock a large page chunk therefore when the cage is off it's not 38357c478bd9Sstevel@tonic-gate * called by default. this can be changed via /etc/system. 38367c478bd9Sstevel@tonic-gate * 38377c478bd9Sstevel@tonic-gate * page_get_contig_pages() also called to acquire a base pagesize page 38387c478bd9Sstevel@tonic-gate * for page_create_get_something(). 
38397c478bd9Sstevel@tonic-gate */ 38407c478bd9Sstevel@tonic-gate if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 38417c478bd9Sstevel@tonic-gate (kcage_on || pg_lpgcreate_nocage || szc == 0) && 38427c478bd9Sstevel@tonic-gate (page_get_func != page_get_contig_pages)) { 38437c478bd9Sstevel@tonic-gate 38447c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 38457c478bd9Sstevel@tonic-gate page_get_func = page_get_contig_pages; 38467c478bd9Sstevel@tonic-gate goto pgretry; 38477c478bd9Sstevel@tonic-gate } 38487c478bd9Sstevel@tonic-gate 38492cb27123Saguzovsk if (!(flags & PG_LOCAL) && pgcplimitsearch && 38502cb27123Saguzovsk page_get_func == page_get_contig_pages) 385183f9b804Skchow SETPGCPFAILCNT(szc); 38527c478bd9Sstevel@tonic-gate 38537c478bd9Sstevel@tonic-gate VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 38547c478bd9Sstevel@tonic-gate return (NULL); 38557c478bd9Sstevel@tonic-gate } 38567c478bd9Sstevel@tonic-gate 38577c478bd9Sstevel@tonic-gate /* 38587c478bd9Sstevel@tonic-gate * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 38597c478bd9Sstevel@tonic-gate * 38607c478bd9Sstevel@tonic-gate * Does its own locking. 38617c478bd9Sstevel@tonic-gate * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 38627c478bd9Sstevel@tonic-gate * pages of the proper color even if there are pages of a different color. 38637c478bd9Sstevel@tonic-gate * Otherwise, scan the bins for ones with pages. For each bin with pages, 38647c478bd9Sstevel@tonic-gate * try to lock one of them. If no page can be locked, try the 38657c478bd9Sstevel@tonic-gate * next bin. Return NULL if a page can not be found and locked. 38667c478bd9Sstevel@tonic-gate * 38677c478bd9Sstevel@tonic-gate * Finds a pages, trys to lock it, then removes it. 
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t		*pp;
	struct as	*as = seg->s_as;
	ulong_t		bin;
	/*LINTED*/
	int		mnode;
	int		mtype;
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		/* cage is disabled: drop NORELOC and remember that fact */
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	ASSERT(bin < PAGE_GET_PAGECOLORS(0));

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_freelist(mnode, bin, mtype,
		    0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}

/*
 * Allocate a free PAGESIZE page from the cachelists of memory node
 * 'mnode', starting at color bin 'bin' and restricted to the memory
 * type range 'mtype'.  Walks the bins of equivalent color (and then
 * the remaining mtype ranges via MTYPE_NEXT) until a page can be
 * locked SE_EXCL and pulled off its list, or everything is exhausted.
 *
 * The returned page is exclusively locked and still has its vnode
 * identity; callers that want an anonymous page hash it out themselves
 * (see page_get_replacement_page()).  Returns NULL on failure.
 */
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {

			/* unlocked peek first to avoid the mutex if empty */
			if (!PAGE_CACHELISTS(mnode, bin, mtype))
				goto bin_empty_1;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			first_pp = pp;
			ASSERT(pp->p_vnode);
			ASSERT(PP_ISAGED(pp) == 0);
			ASSERT(pp->p_szc == 0);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
			while (!page_trylock(pp, SE_EXCL)) {
				pp = pp->p_next;
				ASSERT(pp->p_szc == 0);
				if (pp == first_pp) {
					/*
					 * We have searched the complete list!
					 * And all of them (might only be one)
					 * are locked. This can happen since
					 * these pages can also be found via
					 * the hash list. When found via the
					 * hash list, they are locked first,
					 * then removed. We give up to let the
					 * other thread run.
					 */
					pp = NULL;
					break;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
				    mnode);
			}

			if (pp) {
				page_t	**ppp;
				/*
				 * Found and locked a page.
				 * Pull it off the list.
				 */
				ASSERT(mtype == PP_2_MTYPE(pp));
				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
				page_sub(ppp, pp);
				/*
				 * Subtract counters before releasing pcm mutex
				 * to avoid a race with page_freelist_coalesce
				 * and page_freelist_split.
				 */
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
				ASSERT(!kcage_on ||
				    (flags & PG_NORELOC) == 0 ||
				    PP_ISNORELOC(pp));
				if (PP_ISNORELOC(pp)) {
					kcage_freemem_sub(1);
				}
#endif
				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
				return (pp);
			}
bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(0, flags, bin, 0, 1, &plw);
				plw_initialized = 1;
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[0], plw.plw_color_mask);
		} while (sbin != bin);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(0, bin, &plw);
	}

	/* try the next memory type range, if any */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
/* replacement-page statistics, maintained on DEBUG kernels only */
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

/*
 * When set, large replacement requests that could not be satisfied from
 * the free/cache lists are retried via page_get_contig_pages() instead
 * of immediately falling back to discontiguous PAGESIZE pages.
 */
int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		/* replacing a cage page: replacement must be relocatable */
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		/* caller explicitly wants a non-relocatable (cage) page */
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					/* cachelist page: drop its identity */
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages cause page_freelist_coalesce() already
			 * failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;


			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			/*
			 * Move the pages just obtained onto the result
			 * list 'pl', clearing their free/aged state, and
			 * advance like_pp so the next iteration allocates
			 * for the next constituent page.
			 */
			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{

	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	/* re-check under the freelist lock; someone may have demoted it */
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		/* shift amount from colorequiv's low set bit, capped at 15 */
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			/* clamp the shift so at least one color remains */
			while ((colors >> a) == 0)
				a--;
			/* colorequivszc[] keeps the shift in its high nibble */
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}