xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision cb15d5d9)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50b5aa17bSmec  * Common Development and Distribution License (the "License").
60b5aa17bSmec  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*cb15d5d9SPeter Rival  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate  */
247c478bd9Sstevel@tonic-gate 
257c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
267c478bd9Sstevel@tonic-gate /*	All Rights Reserved   */
277c478bd9Sstevel@tonic-gate 
287c478bd9Sstevel@tonic-gate /*
297c478bd9Sstevel@tonic-gate  * Portions of this source code were derived from Berkeley 4.3 BSD
307c478bd9Sstevel@tonic-gate  * under license from the Regents of the University of California.
317c478bd9Sstevel@tonic-gate  */
327c478bd9Sstevel@tonic-gate 
337c478bd9Sstevel@tonic-gate 
347c478bd9Sstevel@tonic-gate /*
357c478bd9Sstevel@tonic-gate  * This file contains common functions to access and manage the page lists.
367c478bd9Sstevel@tonic-gate  * Many of these routines originated from platform dependent modules
377c478bd9Sstevel@tonic-gate  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
387c478bd9Sstevel@tonic-gate  * a platform independent manner.
397c478bd9Sstevel@tonic-gate  *
407c478bd9Sstevel@tonic-gate  * vm/vm_dep.h provides for platform specific support.
417c478bd9Sstevel@tonic-gate  */
427c478bd9Sstevel@tonic-gate 
437c478bd9Sstevel@tonic-gate #include <sys/types.h>
447c478bd9Sstevel@tonic-gate #include <sys/debug.h>
457c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
467c478bd9Sstevel@tonic-gate #include <sys/systm.h>
477c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
487c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
497c478bd9Sstevel@tonic-gate #include <vm/as.h>
507c478bd9Sstevel@tonic-gate #include <vm/page.h>
517c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
527c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
5378b03d3aSkchow #include <sys/vmsystm.h>
547c478bd9Sstevel@tonic-gate #include <sys/memnode.h>
557c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h>
567c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
577c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
587c478bd9Sstevel@tonic-gate #include <sys/callb.h>
597c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
607c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
61ca3e8d88SDave Plauger #include <sys/dumphdr.h>
62*cb15d5d9SPeter Rival #include <sys/swap.h>
637c478bd9Sstevel@tonic-gate 
647c478bd9Sstevel@tonic-gate extern uint_t	vac_colors;
657c478bd9Sstevel@tonic-gate 
666061ce8aSkchow #define	MAX_PRAGMA_ALIGN	128
676061ce8aSkchow 
686061ce8aSkchow /* vm_cpu_data0 for the boot cpu before kmem is initialized */
696061ce8aSkchow 
706061ce8aSkchow #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
71affbd3ccSkchow #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
726061ce8aSkchow #else
736061ce8aSkchow #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
746061ce8aSkchow #endif
75affbd3ccSkchow char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
76affbd3ccSkchow 
777c478bd9Sstevel@tonic-gate /*
787c478bd9Sstevel@tonic-gate  * number of page colors equivalent to requested color in page_get routines.
797c478bd9Sstevel@tonic-gate  * If set, keeps large pages intact longer and keeps MPO allocation
807c478bd9Sstevel@tonic-gate  * from the local mnode in favor of acquiring the 'correct' page color from
817c478bd9Sstevel@tonic-gate  * a demoted large page or from a remote mnode.
827c478bd9Sstevel@tonic-gate  */
835d07b933Sdp uint_t	colorequiv;
845d07b933Sdp 
855d07b933Sdp /*
865d07b933Sdp  * color equivalency mask for each page size.
875d07b933Sdp  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
885d07b933Sdp  * High 4 bits determine the number of high order bits of the color to ignore.
895d07b933Sdp  * Low 4 bits determines number of low order bits of color to ignore (it's only
905d07b933Sdp  * relevant for hashed index based page coloring).
915d07b933Sdp  */
925d07b933Sdp uchar_t colorequivszc[MMU_PAGE_SIZES];
937c478bd9Sstevel@tonic-gate 
947c478bd9Sstevel@tonic-gate /*
957c478bd9Sstevel@tonic-gate  * if set, specifies the percentage of large pages that are free from within
967c478bd9Sstevel@tonic-gate  * a large page region before attempting to lock those pages for
977c478bd9Sstevel@tonic-gate  * page_get_contig_pages processing.
987c478bd9Sstevel@tonic-gate  *
997c478bd9Sstevel@tonic-gate  * Should be turned on when kpr is available when page_trylock_contig_pages
1007c478bd9Sstevel@tonic-gate  * can be more selective.
1017c478bd9Sstevel@tonic-gate  */
1027c478bd9Sstevel@tonic-gate 
1037c478bd9Sstevel@tonic-gate int	ptcpthreshold;
1047c478bd9Sstevel@tonic-gate 
1057c478bd9Sstevel@tonic-gate /*
1067c478bd9Sstevel@tonic-gate  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
10783f9b804Skchow  * Enabled by default via pgcplimitsearch.
10883f9b804Skchow  *
10983f9b804Skchow  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
11083f9b804Skchow  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
11183f9b804Skchow  * bound. This upper bound range guarantees:
11283f9b804Skchow  *    - all large page 'slots' will be searched over time
11383f9b804Skchow  *    - the minimum (1) large page candidates considered on each pgcp call
11483f9b804Skchow  *    - count doesn't wrap around to 0
1157c478bd9Sstevel@tonic-gate  */
11683f9b804Skchow pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
1177c478bd9Sstevel@tonic-gate int	pgcplimitsearch = 1;
1187c478bd9Sstevel@tonic-gate 
11983f9b804Skchow #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
12083f9b804Skchow #define	SETPGCPFAILCNT(szc)						\
12183f9b804Skchow 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
12283f9b804Skchow 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
12383f9b804Skchow 
1247c478bd9Sstevel@tonic-gate #ifdef VM_STATS
1257c478bd9Sstevel@tonic-gate struct vmm_vmstats_str  vmm_vmstats;
1267c478bd9Sstevel@tonic-gate 
1277c478bd9Sstevel@tonic-gate #endif /* VM_STATS */
1287c478bd9Sstevel@tonic-gate 
1297c478bd9Sstevel@tonic-gate #if defined(__sparc)
1307c478bd9Sstevel@tonic-gate #define	LPGCREATE	0
1317c478bd9Sstevel@tonic-gate #else
1327c478bd9Sstevel@tonic-gate /* enable page_get_contig_pages */
1337c478bd9Sstevel@tonic-gate #define	LPGCREATE	1
1347c478bd9Sstevel@tonic-gate #endif
1357c478bd9Sstevel@tonic-gate 
1367c478bd9Sstevel@tonic-gate int pg_contig_disable;
1377c478bd9Sstevel@tonic-gate int pg_lpgcreate_nocage = LPGCREATE;
1387c478bd9Sstevel@tonic-gate 
1397c478bd9Sstevel@tonic-gate /*
14019397407SSherry Moore  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
1417c478bd9Sstevel@tonic-gate  */
1427c478bd9Sstevel@tonic-gate #define	PFNNULL		0
1437c478bd9Sstevel@tonic-gate 
1447c478bd9Sstevel@tonic-gate /* Flags involved in promotion and demotion routines */
1457c478bd9Sstevel@tonic-gate #define	PC_FREE		0x1	/* put page on freelist */
1467c478bd9Sstevel@tonic-gate #define	PC_ALLOC	0x2	/* return page for allocation */
1477c478bd9Sstevel@tonic-gate 
1487c478bd9Sstevel@tonic-gate /*
1497c478bd9Sstevel@tonic-gate  * Flag for page_demote to be used with PC_FREE to denote that we don't care
1507c478bd9Sstevel@tonic-gate  * what the color is as the color parameter to the function is ignored.
1517c478bd9Sstevel@tonic-gate  */
1527c478bd9Sstevel@tonic-gate #define	PC_NO_COLOR	(-1)
1537c478bd9Sstevel@tonic-gate 
1545d07b933Sdp /* mtype value for page_promote to use when mtype does not matter */
1555d07b933Sdp #define	PC_MTYPE_ANY	(-1)
1565d07b933Sdp 
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
	uint_t	pad[12];	/* NOTE(review): padding — presumably to keep */
				/* entries cache-line separated; confirm */
} pcc_info_t;
1697c478bd9Sstevel@tonic-gate 
1707c478bd9Sstevel@tonic-gate /*
1717c478bd9Sstevel@tonic-gate  * On big machines it can take a long time to check page_counters
1727c478bd9Sstevel@tonic-gate  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
1737c478bd9Sstevel@tonic-gate  * updated sum of all elements of the corresponding page_counters arrays.
1747c478bd9Sstevel@tonic-gate  * page_freelist_coalesce() searches page_counters only if an appropriate
1757c478bd9Sstevel@tonic-gate  * element of page_ctrs_cands array is greater than 0.
1767c478bd9Sstevel@tonic-gate  *
1775d07b933Sdp  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
1787c478bd9Sstevel@tonic-gate  */
1795d07b933Sdp pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
1807c478bd9Sstevel@tonic-gate 
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r).
 * The value is the sum over all NPC_MUTEX counter buckets.
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c).
 * Like PGCTRS_CANDS_GETVALUE, summed over all NPC_MUTEX buckets.
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
2067c478bd9Sstevel@tonic-gate 
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

/* map a page's pfn to the ctr_mutex index covering its physical range */
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/* sentinel values — NOTE(review): presumably "not yet computed"; see uses */
#define	INVALID_COLOR 0xffffffff
#define	INVALID_MASK  0xffffffff
2225d07b933Sdp 
2237c478bd9Sstevel@tonic-gate /*
2247c478bd9Sstevel@tonic-gate  * Local functions prototypes.
2257c478bd9Sstevel@tonic-gate  */
2267c478bd9Sstevel@tonic-gate 
227affbd3ccSkchow void page_ctr_add(int, int, page_t *, int);
228affbd3ccSkchow void page_ctr_add_internal(int, int, page_t *, int);
229affbd3ccSkchow void page_ctr_sub(int, int, page_t *, int);
2305d07b933Sdp void page_ctr_sub_internal(int, int, page_t *, int);
2317c478bd9Sstevel@tonic-gate void page_freelist_lock(int);
2327c478bd9Sstevel@tonic-gate void page_freelist_unlock(int);
2335d07b933Sdp page_t *page_promote(int, pfn_t, uchar_t, int, int);
23419397407SSherry Moore page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
2355d07b933Sdp page_t *page_freelist_split(uchar_t,
23619397407SSherry Moore     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
2377c478bd9Sstevel@tonic-gate page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
2387c478bd9Sstevel@tonic-gate static int page_trylock_cons(page_t *pp, se_t se);
2397c478bd9Sstevel@tonic-gate 
2407c478bd9Sstevel@tonic-gate /*
2417c478bd9Sstevel@tonic-gate  * The page_counters array below is used to keep track of free contiguous
2427c478bd9Sstevel@tonic-gate  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
2437c478bd9Sstevel@tonic-gate  * This contains an array of counters, the size of the array, a shift value
2447c478bd9Sstevel@tonic-gate  * used to convert a pagenum into a counter array index or vice versa, as
2457c478bd9Sstevel@tonic-gate  * well as a cache of the last successful index to be promoted to a larger
2467c478bd9Sstevel@tonic-gate  * page size.  As an optimization, we keep track of the last successful index
2477c478bd9Sstevel@tonic-gate  * to be promoted per page color for the given size region, and this is
2487c478bd9Sstevel@tonic-gate  * allocated dynamically based upon the number of colors for a given
2497c478bd9Sstevel@tonic-gate  * region size.
2507c478bd9Sstevel@tonic-gate  *
2517c478bd9Sstevel@tonic-gate  * Conceptually, the page counters are represented as:
2527c478bd9Sstevel@tonic-gate  *
2537c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode]
2547c478bd9Sstevel@tonic-gate  *
2557c478bd9Sstevel@tonic-gate  *	region_size:	size code of a candidate larger page made up
2567c478bd9Sstevel@tonic-gate  *			of contiguous free smaller pages.
2577c478bd9Sstevel@tonic-gate  *
2587c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode].hpm_counters[index]:
2597c478bd9Sstevel@tonic-gate  *		represents how many (region_size - 1) pages either
2607c478bd9Sstevel@tonic-gate  *		exist or can be created within the given index range.
2617c478bd9Sstevel@tonic-gate  *
2627c478bd9Sstevel@tonic-gate  * Let's look at a sparc example:
2637c478bd9Sstevel@tonic-gate  *	If we want to create a free 512k page, we look at region_size 2
2647c478bd9Sstevel@tonic-gate  *	for the mnode we want.  We calculate the index and look at a specific
2657c478bd9Sstevel@tonic-gate  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
2667c478bd9Sstevel@tonic-gate  *	this location, it means that 8 64k pages either exist or can be created
2677c478bd9Sstevel@tonic-gate  *	from 8K pages in order to make a single free 512k page at the given
2687c478bd9Sstevel@tonic-gate  *	index.  Note that when a region is full, it will contribute to the
2697c478bd9Sstevel@tonic-gate  *	counts in the region above it.  Thus we will not know what page
2707c478bd9Sstevel@tonic-gate  *	size the free pages will be which can be promoted to this new free
2717c478bd9Sstevel@tonic-gate  *	page unless we look at all regions below the current region.
2727c478bd9Sstevel@tonic-gate  */
2737c478bd9Sstevel@tonic-gate 
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
	uint_t		pad[4];	/* NOTE(review): padding — presumably */
				/* cache-line alignment on sparc; confirm */
#endif
} hw_page_map_t;
2967c478bd9Sstevel@tonic-gate 
2977c478bd9Sstevel@tonic-gate /*
2987c478bd9Sstevel@tonic-gate  * Element zero is not used, but is allocated for convenience.
2997c478bd9Sstevel@tonic-gate  */
3007c478bd9Sstevel@tonic-gate static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
3017c478bd9Sstevel@tonic-gate 
3025d07b933Sdp /*
3035d07b933Sdp  * Cached value of MNODE_RANGE_CNT(mnode).
3045d07b933Sdp  * This is a function call in x86.
3055d07b933Sdp  */
3065d07b933Sdp static int mnode_nranges[MAX_MEM_NODES];
3075d07b933Sdp static int mnode_maxmrange[MAX_MEM_NODES];
3085d07b933Sdp 
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
/* counter value at index idx for region size rg_szc on mnode */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

/* the counter array itself */
#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

/* shift used for pfn <-> counter index conversion */
#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

/* number of entries in the counter array */
#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

/* PFN mapped to counter index 0 */
#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

/* per-mrange array of last-successful indices, one entry per color */
#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* last counter index at which a large page of this color was created */
#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

/* convert a pfn to a counter array index */
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

/* convert a counter array index back to a pfn */
#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
3437c478bd9Sstevel@tonic-gate 
3447c478bd9Sstevel@tonic-gate /*
3457c478bd9Sstevel@tonic-gate  * Protects the hpm_counters and hpm_color_current memory from changing while
3467c478bd9Sstevel@tonic-gate  * looking at page counters information.
3477c478bd9Sstevel@tonic-gate  * Grab the write lock to modify what these fields point at.
3487c478bd9Sstevel@tonic-gate  * Grab the read lock to prevent any pointers from changing.
3497c478bd9Sstevel@tonic-gate  * The write lock can not be held during memory allocation due to a possible
3507c478bd9Sstevel@tonic-gate  * recursion deadlock with trying to grab the read lock while the
3517c478bd9Sstevel@tonic-gate  * write lock is already held.
3527c478bd9Sstevel@tonic-gate  */
3537c478bd9Sstevel@tonic-gate krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
3547c478bd9Sstevel@tonic-gate 
355affbd3ccSkchow 
356affbd3ccSkchow /*
357affbd3ccSkchow  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
358affbd3ccSkchow  */
359affbd3ccSkchow void
360affbd3ccSkchow cpu_vm_data_init(struct cpu *cp)
361affbd3ccSkchow {
362affbd3ccSkchow 	if (cp == CPU0) {
363affbd3ccSkchow 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
364affbd3ccSkchow 	} else {
365affbd3ccSkchow 		void	*kmptr;
3666061ce8aSkchow 		int	align;
3676061ce8aSkchow 		size_t	sz;
368affbd3ccSkchow 
3696061ce8aSkchow 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
3706061ce8aSkchow 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
3716061ce8aSkchow 		kmptr = kmem_zalloc(sz, KM_SLEEP);
372affbd3ccSkchow 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
373affbd3ccSkchow 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
3746061ce8aSkchow 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
375affbd3ccSkchow 	}
376affbd3ccSkchow }
377affbd3ccSkchow 
378affbd3ccSkchow /*
379affbd3ccSkchow  * free cpu_vm_data
380affbd3ccSkchow  */
381affbd3ccSkchow void
382affbd3ccSkchow cpu_vm_data_destroy(struct cpu *cp)
383affbd3ccSkchow {
384affbd3ccSkchow 	if (cp->cpu_seqid && cp->cpu_vm_data) {
385affbd3ccSkchow 		ASSERT(cp != CPU0);
386affbd3ccSkchow 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
3876061ce8aSkchow 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
388affbd3ccSkchow 	}
389affbd3ccSkchow 	cp->cpu_vm_data = NULL;
390affbd3ccSkchow }
391affbd3ccSkchow 
392affbd3ccSkchow 
3937c478bd9Sstevel@tonic-gate /*
3947c478bd9Sstevel@tonic-gate  * page size to page size code
3957c478bd9Sstevel@tonic-gate  */
3967c478bd9Sstevel@tonic-gate int
3977c478bd9Sstevel@tonic-gate page_szc(size_t pagesize)
3987c478bd9Sstevel@tonic-gate {
3997c478bd9Sstevel@tonic-gate 	int	i = 0;
4007c478bd9Sstevel@tonic-gate 
4017c478bd9Sstevel@tonic-gate 	while (hw_page_array[i].hp_size) {
4027c478bd9Sstevel@tonic-gate 		if (pagesize == hw_page_array[i].hp_size)
4037c478bd9Sstevel@tonic-gate 			return (i);
4047c478bd9Sstevel@tonic-gate 		i++;
4057c478bd9Sstevel@tonic-gate 	}
4067c478bd9Sstevel@tonic-gate 	return (-1);
4077c478bd9Sstevel@tonic-gate }
4087c478bd9Sstevel@tonic-gate 
4097c478bd9Sstevel@tonic-gate /*
4104abce959Smec  * page size to page size code with the restriction that it be a supported
4114abce959Smec  * user page size.  If it's not a supported user page size, -1 will be returned.
4127c478bd9Sstevel@tonic-gate  */
4137c478bd9Sstevel@tonic-gate int
4144abce959Smec page_szc_user_filtered(size_t pagesize)
4157c478bd9Sstevel@tonic-gate {
4167c478bd9Sstevel@tonic-gate 	int szc = page_szc(pagesize);
4174abce959Smec 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
4184abce959Smec 		return (szc);
4194abce959Smec 	}
4207c478bd9Sstevel@tonic-gate 	return (-1);
4217c478bd9Sstevel@tonic-gate }
4227c478bd9Sstevel@tonic-gate 
4237c478bd9Sstevel@tonic-gate /*
4247c478bd9Sstevel@tonic-gate  * Return how many page sizes are available for the user to use.  This is
4257c478bd9Sstevel@tonic-gate  * what the hardware supports and not based upon how the OS implements the
4267c478bd9Sstevel@tonic-gate  * support of different page sizes.
42702bc52beSkchow  *
42802bc52beSkchow  * If legacy is non-zero, return the number of pagesizes available to legacy
42902bc52beSkchow  * applications. The number of legacy page sizes might be less than the
43002bc52beSkchow  * exported user page sizes. This is to prevent legacy applications that
43102bc52beSkchow  * use the largest page size returned from getpagesizes(3c) from inadvertantly
43202bc52beSkchow  * using the 'new' large pagesizes.
4337c478bd9Sstevel@tonic-gate  */
4347c478bd9Sstevel@tonic-gate uint_t
43502bc52beSkchow page_num_user_pagesizes(int legacy)
4367c478bd9Sstevel@tonic-gate {
43702bc52beSkchow 	if (legacy)
43802bc52beSkchow 		return (mmu_legacy_page_sizes);
4397c478bd9Sstevel@tonic-gate 	return (mmu_exported_page_sizes);
4407c478bd9Sstevel@tonic-gate }
4417c478bd9Sstevel@tonic-gate 
4427c478bd9Sstevel@tonic-gate uint_t
4437c478bd9Sstevel@tonic-gate page_num_pagesizes(void)
4447c478bd9Sstevel@tonic-gate {
4457c478bd9Sstevel@tonic-gate 	return (mmu_page_sizes);
4467c478bd9Sstevel@tonic-gate }
4477c478bd9Sstevel@tonic-gate 
4487c478bd9Sstevel@tonic-gate /*
4497c478bd9Sstevel@tonic-gate  * returns the count of the number of base pagesize pages associated with szc
4507c478bd9Sstevel@tonic-gate  */
4517c478bd9Sstevel@tonic-gate pgcnt_t
4527c478bd9Sstevel@tonic-gate page_get_pagecnt(uint_t szc)
4537c478bd9Sstevel@tonic-gate {
4547c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4557c478bd9Sstevel@tonic-gate 		panic("page_get_pagecnt: out of range %d", szc);
4567c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_pgcnt);
4577c478bd9Sstevel@tonic-gate }
4587c478bd9Sstevel@tonic-gate 
4597c478bd9Sstevel@tonic-gate size_t
4607c478bd9Sstevel@tonic-gate page_get_pagesize(uint_t szc)
4617c478bd9Sstevel@tonic-gate {
4627c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4637c478bd9Sstevel@tonic-gate 		panic("page_get_pagesize: out of range %d", szc);
4647c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4657c478bd9Sstevel@tonic-gate }
4667c478bd9Sstevel@tonic-gate 
4677c478bd9Sstevel@tonic-gate /*
4687c478bd9Sstevel@tonic-gate  * Return the size of a page based upon the index passed in.  An index of
4697c478bd9Sstevel@tonic-gate  * zero refers to the smallest page size in the system, and as index increases
4707c478bd9Sstevel@tonic-gate  * it refers to the next larger supported page size in the system.
4717c478bd9Sstevel@tonic-gate  * Note that szc and userszc may not be the same due to unsupported szc's on
4727c478bd9Sstevel@tonic-gate  * some systems.
4737c478bd9Sstevel@tonic-gate  */
4747c478bd9Sstevel@tonic-gate size_t
4757c478bd9Sstevel@tonic-gate page_get_user_pagesize(uint_t userszc)
4767c478bd9Sstevel@tonic-gate {
4777c478bd9Sstevel@tonic-gate 	uint_t szc = USERSZC_2_SZC(userszc);
4787c478bd9Sstevel@tonic-gate 
4797c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4807c478bd9Sstevel@tonic-gate 		panic("page_get_user_pagesize: out of range %d", szc);
4817c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4827c478bd9Sstevel@tonic-gate }
4837c478bd9Sstevel@tonic-gate 
4847c478bd9Sstevel@tonic-gate uint_t
4857c478bd9Sstevel@tonic-gate page_get_shift(uint_t szc)
4867c478bd9Sstevel@tonic-gate {
4877c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4887c478bd9Sstevel@tonic-gate 		panic("page_get_shift: out of range %d", szc);
4895d07b933Sdp 	return (PAGE_GET_SHIFT(szc));
4907c478bd9Sstevel@tonic-gate }
4917c478bd9Sstevel@tonic-gate 
4927c478bd9Sstevel@tonic-gate uint_t
4937c478bd9Sstevel@tonic-gate page_get_pagecolors(uint_t szc)
4947c478bd9Sstevel@tonic-gate {
4955d07b933Sdp 	if (szc >= mmu_page_sizes)
4965d07b933Sdp 		panic("page_get_pagecolors: out of range %d", szc);
4975d07b933Sdp 	return (PAGE_GET_PAGECOLORS(szc));
4985d07b933Sdp }
4995d07b933Sdp 
5005d07b933Sdp /*
5015d07b933Sdp  * this assigns the desired equivalent color after a split
5025d07b933Sdp  */
5035d07b933Sdp uint_t
5045d07b933Sdp page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
5055d07b933Sdp     uint_t ncolor, uint_t ceq_mask)
5065d07b933Sdp {
5075d07b933Sdp 	ASSERT(nszc > szc);
5085d07b933Sdp 	ASSERT(szc < mmu_page_sizes);
5095d07b933Sdp 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
5105d07b933Sdp 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
5115d07b933Sdp 
5125d07b933Sdp 	color &= ceq_mask;
513ce8eb11aSdp 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
5145d07b933Sdp 	return (color | (ncolor & ~ceq_mask));
5157c478bd9Sstevel@tonic-gate }
5167c478bd9Sstevel@tonic-gate 
517ce8eb11aSdp /*
518ce8eb11aSdp  * The interleaved_mnodes flag is set when mnodes overlap in
519ce8eb11aSdp  * the physbase..physmax range, but have disjoint slices.
520ce8eb11aSdp  * In this case hpm_counters is shared by all mnodes.
521ce8eb11aSdp  * This flag is set dynamically by the platform.
522ce8eb11aSdp  */
523ce8eb11aSdp int interleaved_mnodes = 0;
524ce8eb11aSdp 
5257c478bd9Sstevel@tonic-gate /*
5267c478bd9Sstevel@tonic-gate  * Called by startup().
5277c478bd9Sstevel@tonic-gate  * Size up the per page size free list counters based on physmax
5287c478bd9Sstevel@tonic-gate  * of each node and max_mem_nodes.
529ce8eb11aSdp  *
530ce8eb11aSdp  * If interleaved_mnodes is set we need to find the first mnode that
531ce8eb11aSdp  * exists. hpm_counters for the first mnode will then be shared by
532ce8eb11aSdp  * all other mnodes. If interleaved_mnodes is not set, just set
533ce8eb11aSdp  * first=mnode each time. That means there will be no sharing.
5347c478bd9Sstevel@tonic-gate  */
5357c478bd9Sstevel@tonic-gate size_t
5367c478bd9Sstevel@tonic-gate page_ctrs_sz(void)
5377c478bd9Sstevel@tonic-gate {
5387c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
5397c478bd9Sstevel@tonic-gate 	int	mnode;
540ce8eb11aSdp 	int	firstmn;	/* first mnode that exists */
5415d07b933Sdp 	int	nranges;
542ce8eb11aSdp 	pfn_t	physbase;
543ce8eb11aSdp 	pfn_t	physmax;
5447c478bd9Sstevel@tonic-gate 	uint_t	ctrs_sz = 0;
5457c478bd9Sstevel@tonic-gate 	int 	i;
5467c478bd9Sstevel@tonic-gate 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
5477c478bd9Sstevel@tonic-gate 
5487c478bd9Sstevel@tonic-gate 	/*
5497c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
5507c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
5517c478bd9Sstevel@tonic-gate 	 * arrays.
5527c478bd9Sstevel@tonic-gate 	 */
5535d07b933Sdp 	for (i = 0; i < mmu_page_sizes; i++) {
5545d07b933Sdp 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
5557c478bd9Sstevel@tonic-gate 	}
5567c478bd9Sstevel@tonic-gate 
557ce8eb11aSdp 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
5587c478bd9Sstevel@tonic-gate 
5597c478bd9Sstevel@tonic-gate 		pgcnt_t r_pgcnt;
5607c478bd9Sstevel@tonic-gate 		pfn_t   r_base;
5617c478bd9Sstevel@tonic-gate 		pgcnt_t r_align;
5627c478bd9Sstevel@tonic-gate 
5637c478bd9Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
5647c478bd9Sstevel@tonic-gate 			continue;
5657c478bd9Sstevel@tonic-gate 
566ce8eb11aSdp 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
5675d07b933Sdp 		nranges = MNODE_RANGE_CNT(mnode);
5685d07b933Sdp 		mnode_nranges[mnode] = nranges;
5695d07b933Sdp 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
5705d07b933Sdp 
5717c478bd9Sstevel@tonic-gate 		/*
5727c478bd9Sstevel@tonic-gate 		 * determine size needed for page counter arrays with
5737c478bd9Sstevel@tonic-gate 		 * base aligned to large page size.
5747c478bd9Sstevel@tonic-gate 		 */
5757c478bd9Sstevel@tonic-gate 		for (r = 1; r < mmu_page_sizes; r++) {
576ce8eb11aSdp 			/* add in space for hpm_color_current */
577ce8eb11aSdp 			ctrs_sz += sizeof (size_t) *
578ce8eb11aSdp 			    colors_per_szc[r] * nranges;
579ce8eb11aSdp 
580ce8eb11aSdp 			if (firstmn != mnode)
581ce8eb11aSdp 				continue;
582ce8eb11aSdp 
5837c478bd9Sstevel@tonic-gate 			/* add in space for hpm_counters */
5847c478bd9Sstevel@tonic-gate 			r_align = page_get_pagecnt(r);
585ce8eb11aSdp 			r_base = physbase;
5867c478bd9Sstevel@tonic-gate 			r_base &= ~(r_align - 1);
587ce8eb11aSdp 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
588ce8eb11aSdp 
5897c478bd9Sstevel@tonic-gate 			/*
5907c478bd9Sstevel@tonic-gate 			 * Round up to always allocate on pointer sized
5917c478bd9Sstevel@tonic-gate 			 * boundaries.
5927c478bd9Sstevel@tonic-gate 			 */
5937c478bd9Sstevel@tonic-gate 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
5947c478bd9Sstevel@tonic-gate 			    sizeof (hpmctr_t *));
5957c478bd9Sstevel@tonic-gate 		}
5967c478bd9Sstevel@tonic-gate 	}
5977c478bd9Sstevel@tonic-gate 
5987c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
5997c478bd9Sstevel@tonic-gate 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
6005d07b933Sdp 	}
6015d07b933Sdp 
6025d07b933Sdp 	/* add in space for page_ctrs_cands and pcc_color_free */
6035d07b933Sdp 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
6045d07b933Sdp 	    mmu_page_sizes * NPC_MUTEX;
6055d07b933Sdp 
6065d07b933Sdp 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
6075d07b933Sdp 
6085d07b933Sdp 		if (mem_node_config[mnode].exists == 0)
6095d07b933Sdp 			continue;
6107c478bd9Sstevel@tonic-gate 
6115d07b933Sdp 		nranges = mnode_nranges[mnode];
6125d07b933Sdp 		ctrs_sz += sizeof (pcc_info_t) * nranges *
6135d07b933Sdp 		    mmu_page_sizes * NPC_MUTEX;
6145d07b933Sdp 		for (r = 1; r < mmu_page_sizes; r++) {
6155d07b933Sdp 			ctrs_sz += sizeof (pgcnt_t) * nranges *
6165d07b933Sdp 			    colors_per_szc[r] * NPC_MUTEX;
6175d07b933Sdp 		}
6187c478bd9Sstevel@tonic-gate 	}
6197c478bd9Sstevel@tonic-gate 
6207c478bd9Sstevel@tonic-gate 	/* ctr_mutex */
6217c478bd9Sstevel@tonic-gate 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
6227c478bd9Sstevel@tonic-gate 
6237c478bd9Sstevel@tonic-gate 	/* size for page list counts */
6247c478bd9Sstevel@tonic-gate 	PLCNT_SZ(ctrs_sz);
6257c478bd9Sstevel@tonic-gate 
6267c478bd9Sstevel@tonic-gate 	/*
6277c478bd9Sstevel@tonic-gate 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
6287c478bd9Sstevel@tonic-gate 	 * address of the counters to ecache_alignsize boundary for every
6297c478bd9Sstevel@tonic-gate 	 * memory node.
6307c478bd9Sstevel@tonic-gate 	 */
6317c478bd9Sstevel@tonic-gate 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
6327c478bd9Sstevel@tonic-gate }
6337c478bd9Sstevel@tonic-gate 
6347c478bd9Sstevel@tonic-gate caddr_t
6357c478bd9Sstevel@tonic-gate page_ctrs_alloc(caddr_t alloc_base)
6367c478bd9Sstevel@tonic-gate {
6377c478bd9Sstevel@tonic-gate 	int	mnode;
6385d07b933Sdp 	int	mrange, nranges;
6397c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
6407c478bd9Sstevel@tonic-gate 	int	i;
641ce8eb11aSdp 	int	firstmn;	/* first mnode that exists */
642ce8eb11aSdp 	pfn_t	physbase;
643ce8eb11aSdp 	pfn_t	physmax;
6447c478bd9Sstevel@tonic-gate 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
6457c478bd9Sstevel@tonic-gate 
6467c478bd9Sstevel@tonic-gate 	/*
6477c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
6487c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
6497c478bd9Sstevel@tonic-gate 	 * arrays.
6507c478bd9Sstevel@tonic-gate 	 */
6515d07b933Sdp 	for (i = 0; i < mmu_page_sizes; i++) {
6525d07b933Sdp 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
6537c478bd9Sstevel@tonic-gate 	}
6547c478bd9Sstevel@tonic-gate 
6557c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
6567c478bd9Sstevel@tonic-gate 		page_counters[r] = (hw_page_map_t *)alloc_base;
6577c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
6587c478bd9Sstevel@tonic-gate 	}
6597c478bd9Sstevel@tonic-gate 
6605d07b933Sdp 	/* page_ctrs_cands and pcc_color_free array */
6615d07b933Sdp 	for (i = 0; i < NPC_MUTEX; i++) {
6625d07b933Sdp 		for (r = 1; r < mmu_page_sizes; r++) {
6637c478bd9Sstevel@tonic-gate 
6645d07b933Sdp 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
6655d07b933Sdp 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
6667c478bd9Sstevel@tonic-gate 
6677c478bd9Sstevel@tonic-gate 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
6685d07b933Sdp 				pcc_info_t *pi;
6695d07b933Sdp 
6705d07b933Sdp 				if (mem_node_config[mnode].exists == 0)
6715d07b933Sdp 					continue;
6725d07b933Sdp 
6735d07b933Sdp 				nranges = mnode_nranges[mnode];
6745d07b933Sdp 
6755d07b933Sdp 				pi = (pcc_info_t *)alloc_base;
6765d07b933Sdp 				alloc_base += sizeof (pcc_info_t) * nranges;
6775d07b933Sdp 				page_ctrs_cands[i][r][mnode] = pi;
6785d07b933Sdp 
6795d07b933Sdp 				for (mrange = 0; mrange < nranges; mrange++) {
6805d07b933Sdp 					pi->pcc_color_free =
6815d07b933Sdp 					    (pgcnt_t *)alloc_base;
6825d07b933Sdp 					alloc_base += sizeof (pgcnt_t) *
6835d07b933Sdp 					    colors_per_szc[r];
6845d07b933Sdp 					pi++;
6855d07b933Sdp 				}
6867c478bd9Sstevel@tonic-gate 			}
6877c478bd9Sstevel@tonic-gate 		}
6887c478bd9Sstevel@tonic-gate 	}
6897c478bd9Sstevel@tonic-gate 
6907c478bd9Sstevel@tonic-gate 	/* ctr_mutex */
6917c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
6927c478bd9Sstevel@tonic-gate 		ctr_mutex[i] = (kmutex_t *)alloc_base;
6937c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
6947c478bd9Sstevel@tonic-gate 	}
6957c478bd9Sstevel@tonic-gate 
6967c478bd9Sstevel@tonic-gate 	/* initialize page list counts */
6977c478bd9Sstevel@tonic-gate 	PLCNT_INIT(alloc_base);
6987c478bd9Sstevel@tonic-gate 
699ce8eb11aSdp 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
7007c478bd9Sstevel@tonic-gate 
7017c478bd9Sstevel@tonic-gate 		pgcnt_t r_pgcnt;
7027c478bd9Sstevel@tonic-gate 		pfn_t	r_base;
7037c478bd9Sstevel@tonic-gate 		pgcnt_t r_align;
7047c478bd9Sstevel@tonic-gate 		int	r_shift;
7055d07b933Sdp 		int	nranges = mnode_nranges[mnode];
7067c478bd9Sstevel@tonic-gate 
7077c478bd9Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
7087c478bd9Sstevel@tonic-gate 			continue;
7097c478bd9Sstevel@tonic-gate 
710ce8eb11aSdp 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
711ce8eb11aSdp 
7127c478bd9Sstevel@tonic-gate 		for (r = 1; r < mmu_page_sizes; r++) {
7137c478bd9Sstevel@tonic-gate 			/*
7147c478bd9Sstevel@tonic-gate 			 * the page_counters base has to be aligned to the
7157c478bd9Sstevel@tonic-gate 			 * page count of page size code r otherwise the counts
7167c478bd9Sstevel@tonic-gate 			 * will cross large page boundaries.
7177c478bd9Sstevel@tonic-gate 			 */
7187c478bd9Sstevel@tonic-gate 			r_align = page_get_pagecnt(r);
719ce8eb11aSdp 			r_base = physbase;
7207c478bd9Sstevel@tonic-gate 			/* base needs to be aligned - lower to aligned value */
7217c478bd9Sstevel@tonic-gate 			r_base &= ~(r_align - 1);
722ce8eb11aSdp 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
7237c478bd9Sstevel@tonic-gate 			r_shift = PAGE_BSZS_SHIFT(r);
7247c478bd9Sstevel@tonic-gate 
7257c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
7267c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
7277c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
7285d07b933Sdp 			for (mrange = 0; mrange < nranges; mrange++) {
7295d07b933Sdp 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
7305d07b933Sdp 				    r, mrange) = (size_t *)alloc_base;
7315d07b933Sdp 				alloc_base += sizeof (size_t) *
7325d07b933Sdp 				    colors_per_szc[r];
7335d07b933Sdp 			}
7347c478bd9Sstevel@tonic-gate 			for (i = 0; i < colors_per_szc[r]; i++) {
7355d07b933Sdp 				uint_t color_mask = colors_per_szc[r] - 1;
7365d07b933Sdp 				pfn_t  pfnum = r_base;
7375d07b933Sdp 				size_t idx;
7385d07b933Sdp 				int mrange;
739ce8eb11aSdp 				MEM_NODE_ITERATOR_DECL(it);
7405d07b933Sdp 
741b779d3e0Sdp 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
742b779d3e0Sdp 				if (pfnum == (pfn_t)-1) {
743b779d3e0Sdp 					idx = 0;
744b779d3e0Sdp 				} else {
745b779d3e0Sdp 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
746b779d3e0Sdp 					    color_mask, color_mask, &it);
747b779d3e0Sdp 					idx = PNUM_TO_IDX(mnode, r, pfnum);
748b779d3e0Sdp 					idx = (idx >= r_pgcnt) ? 0 : idx;
749b779d3e0Sdp 				}
7505d07b933Sdp 				for (mrange = 0; mrange < nranges; mrange++) {
7515d07b933Sdp 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
7525d07b933Sdp 					    r, i, mrange) = idx;
7535d07b933Sdp 				}
7547c478bd9Sstevel@tonic-gate 			}
755ce8eb11aSdp 
756ce8eb11aSdp 			/* hpm_counters may be shared by all mnodes */
757ce8eb11aSdp 			if (firstmn == mnode) {
758ce8eb11aSdp 				PAGE_COUNTERS_COUNTERS(mnode, r) =
759ce8eb11aSdp 				    (hpmctr_t *)alloc_base;
760ce8eb11aSdp 				alloc_base +=
761ce8eb11aSdp 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
762ce8eb11aSdp 				    sizeof (hpmctr_t *));
763ce8eb11aSdp 			} else {
764ce8eb11aSdp 				PAGE_COUNTERS_COUNTERS(mnode, r) =
765ce8eb11aSdp 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
766ce8eb11aSdp 			}
7677c478bd9Sstevel@tonic-gate 
7687c478bd9Sstevel@tonic-gate 			/*
7697c478bd9Sstevel@tonic-gate 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
7707c478bd9Sstevel@tonic-gate 			 * satisfy the identity requirement.
7717c478bd9Sstevel@tonic-gate 			 * We should be able to go from one to the other
7727c478bd9Sstevel@tonic-gate 			 * and get consistent values.
7737c478bd9Sstevel@tonic-gate 			 */
7747c478bd9Sstevel@tonic-gate 			ASSERT(PNUM_TO_IDX(mnode, r,
7757c478bd9Sstevel@tonic-gate 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
7767c478bd9Sstevel@tonic-gate 			ASSERT(IDX_TO_PNUM(mnode, r,
7777c478bd9Sstevel@tonic-gate 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
7787c478bd9Sstevel@tonic-gate 		}
7797c478bd9Sstevel@tonic-gate 		/*
7807c478bd9Sstevel@tonic-gate 		 * Roundup the start address of the page_counters to
7817c478bd9Sstevel@tonic-gate 		 * cache aligned boundary for every memory node.
7827c478bd9Sstevel@tonic-gate 		 * page_ctrs_sz() has added some slop for these roundups.
7837c478bd9Sstevel@tonic-gate 		 */
7847c478bd9Sstevel@tonic-gate 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
785ce8eb11aSdp 		    L2CACHE_ALIGN);
7867c478bd9Sstevel@tonic-gate 	}
7877c478bd9Sstevel@tonic-gate 
7887c478bd9Sstevel@tonic-gate 	/* Initialize other page counter specific data structures. */
7897c478bd9Sstevel@tonic-gate 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
7907c478bd9Sstevel@tonic-gate 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
7917c478bd9Sstevel@tonic-gate 	}
7927c478bd9Sstevel@tonic-gate 
7937c478bd9Sstevel@tonic-gate 	return (alloc_base);
7947c478bd9Sstevel@tonic-gate }
7957c478bd9Sstevel@tonic-gate 
7967c478bd9Sstevel@tonic-gate /*
7977c478bd9Sstevel@tonic-gate  * Functions to adjust region counters for each size free list.
7987c478bd9Sstevel@tonic-gate  * Caller is responsible to acquire the ctr_mutex lock if necessary and
7997c478bd9Sstevel@tonic-gate  * thus can be called during startup without locks.
8007c478bd9Sstevel@tonic-gate  */
8017c478bd9Sstevel@tonic-gate /* ARGSUSED */
8027c478bd9Sstevel@tonic-gate void
803affbd3ccSkchow page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
8047c478bd9Sstevel@tonic-gate {
8057c478bd9Sstevel@tonic-gate 	ssize_t		r;	/* region size */
8067c478bd9Sstevel@tonic-gate 	ssize_t		idx;
8077c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
8087c478bd9Sstevel@tonic-gate 	int		lckidx;
8097c478bd9Sstevel@tonic-gate 
810affbd3ccSkchow 	ASSERT(mnode == PP_2_MEM_NODE(pp));
811affbd3ccSkchow 	ASSERT(mtype == PP_2_MTYPE(pp));
812affbd3ccSkchow 
8137c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc < mmu_page_sizes);
8147c478bd9Sstevel@tonic-gate 
815affbd3ccSkchow 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
8167c478bd9Sstevel@tonic-gate 
8177c478bd9Sstevel@tonic-gate 	/* no counter update needed for largest page size */
8187c478bd9Sstevel@tonic-gate 	if (pp->p_szc >= mmu_page_sizes - 1) {
8197c478bd9Sstevel@tonic-gate 		return;
8207c478bd9Sstevel@tonic-gate 	}
8217c478bd9Sstevel@tonic-gate 
8227c478bd9Sstevel@tonic-gate 	r = pp->p_szc + 1;
8237c478bd9Sstevel@tonic-gate 	pfnum = pp->p_pagenum;
8247c478bd9Sstevel@tonic-gate 	lckidx = PP_CTR_LOCK_INDX(pp);
8257c478bd9Sstevel@tonic-gate 
8267c478bd9Sstevel@tonic-gate 	/*
8277c478bd9Sstevel@tonic-gate 	 * Increment the count of free pages for the current
8287c478bd9Sstevel@tonic-gate 	 * region. Continue looping up in region size incrementing
8297c478bd9Sstevel@tonic-gate 	 * count if the preceeding region is full.
8307c478bd9Sstevel@tonic-gate 	 */
8317c478bd9Sstevel@tonic-gate 	while (r < mmu_page_sizes) {
8327c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pfnum);
8337c478bd9Sstevel@tonic-gate 
8347c478bd9Sstevel@tonic-gate 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
8357c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
8367c478bd9Sstevel@tonic-gate 
8375d07b933Sdp 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
8387c478bd9Sstevel@tonic-gate 			break;
8395d07b933Sdp 		} else {
8405d07b933Sdp 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
8415d07b933Sdp 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
8425d07b933Sdp 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
8437c478bd9Sstevel@tonic-gate 
8445d07b933Sdp 			cand->pcc_pages_free++;
8455d07b933Sdp 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
8465d07b933Sdp 		}
8477c478bd9Sstevel@tonic-gate 		r++;
8487c478bd9Sstevel@tonic-gate 	}
8497c478bd9Sstevel@tonic-gate }
8507c478bd9Sstevel@tonic-gate 
8517c478bd9Sstevel@tonic-gate void
852affbd3ccSkchow page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
8537c478bd9Sstevel@tonic-gate {
8547c478bd9Sstevel@tonic-gate 	int		lckidx = PP_CTR_LOCK_INDX(pp);
8557c478bd9Sstevel@tonic-gate 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
8567c478bd9Sstevel@tonic-gate 
8577c478bd9Sstevel@tonic-gate 	mutex_enter(lock);
858affbd3ccSkchow 	page_ctr_add_internal(mnode, mtype, pp, flags);
8597c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
8607c478bd9Sstevel@tonic-gate }
8617c478bd9Sstevel@tonic-gate 
8627c478bd9Sstevel@tonic-gate void
8635d07b933Sdp page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
8647c478bd9Sstevel@tonic-gate {
8657c478bd9Sstevel@tonic-gate 	int		lckidx;
8667c478bd9Sstevel@tonic-gate 	ssize_t		r;	/* region size */
8677c478bd9Sstevel@tonic-gate 	ssize_t		idx;
8687c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
8697c478bd9Sstevel@tonic-gate 
870affbd3ccSkchow 	ASSERT(mnode == PP_2_MEM_NODE(pp));
871affbd3ccSkchow 	ASSERT(mtype == PP_2_MTYPE(pp));
872affbd3ccSkchow 
8737c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc < mmu_page_sizes);
8747c478bd9Sstevel@tonic-gate 
875affbd3ccSkchow 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
8767c478bd9Sstevel@tonic-gate 
8777c478bd9Sstevel@tonic-gate 	/* no counter update needed for largest page size */
8787c478bd9Sstevel@tonic-gate 	if (pp->p_szc >= mmu_page_sizes - 1) {
8797c478bd9Sstevel@tonic-gate 		return;
8807c478bd9Sstevel@tonic-gate 	}
8817c478bd9Sstevel@tonic-gate 
8827c478bd9Sstevel@tonic-gate 	r = pp->p_szc + 1;
8837c478bd9Sstevel@tonic-gate 	pfnum = pp->p_pagenum;
8847c478bd9Sstevel@tonic-gate 	lckidx = PP_CTR_LOCK_INDX(pp);
8857c478bd9Sstevel@tonic-gate 
8867c478bd9Sstevel@tonic-gate 	/*
8877c478bd9Sstevel@tonic-gate 	 * Decrement the count of free pages for the current
8887c478bd9Sstevel@tonic-gate 	 * region. Continue looping up in region size decrementing
8897c478bd9Sstevel@tonic-gate 	 * count if the preceeding region was full.
8907c478bd9Sstevel@tonic-gate 	 */
8917c478bd9Sstevel@tonic-gate 	while (r < mmu_page_sizes) {
8927c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pfnum);
8937c478bd9Sstevel@tonic-gate 
8947c478bd9Sstevel@tonic-gate 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
8957c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
8967c478bd9Sstevel@tonic-gate 
8977c478bd9Sstevel@tonic-gate 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
8987c478bd9Sstevel@tonic-gate 			break;
8995d07b933Sdp 		} else {
9005d07b933Sdp 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
9015d07b933Sdp 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
9025d07b933Sdp 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
9037c478bd9Sstevel@tonic-gate 
9045d07b933Sdp 			ASSERT(cand->pcc_pages_free != 0);
9055d07b933Sdp 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
9065d07b933Sdp 
9075d07b933Sdp 			cand->pcc_pages_free--;
9085d07b933Sdp 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
9095d07b933Sdp 		}
9107c478bd9Sstevel@tonic-gate 		r++;
9117c478bd9Sstevel@tonic-gate 	}
9125d07b933Sdp }
9135d07b933Sdp 
9145d07b933Sdp void
9155d07b933Sdp page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
9165d07b933Sdp {
9175d07b933Sdp 	int		lckidx = PP_CTR_LOCK_INDX(pp);
9185d07b933Sdp 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
9195d07b933Sdp 
9205d07b933Sdp 	mutex_enter(lock);
9215d07b933Sdp 	page_ctr_sub_internal(mnode, mtype, pp, flags);
9227c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
9237c478bd9Sstevel@tonic-gate }
9247c478bd9Sstevel@tonic-gate 
9257c478bd9Sstevel@tonic-gate /*
9267c478bd9Sstevel@tonic-gate  * Adjust page counters following a memory attach, since typically the
9277c478bd9Sstevel@tonic-gate  * size of the array needs to change, and the PFN to counter index
9287c478bd9Sstevel@tonic-gate  * mapping needs to change.
9295d07b933Sdp  *
9305d07b933Sdp  * It is possible this mnode did not exist at startup. In that case
9315d07b933Sdp  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
9325d07b933Sdp  * to change (a theoretical possibility on x86), which means pcc_color_free
9335d07b933Sdp  * arrays must be extended.
9347c478bd9Sstevel@tonic-gate  */
9357c478bd9Sstevel@tonic-gate uint_t
9367c478bd9Sstevel@tonic-gate page_ctrs_adjust(int mnode)
9377c478bd9Sstevel@tonic-gate {
9387c478bd9Sstevel@tonic-gate 	pgcnt_t npgs;
9397c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
9407c478bd9Sstevel@tonic-gate 	int	i;
9417c478bd9Sstevel@tonic-gate 	size_t	pcsz, old_csz;
9427c478bd9Sstevel@tonic-gate 	hpmctr_t *new_ctr, *old_ctr;
9437c478bd9Sstevel@tonic-gate 	pfn_t	oldbase, newbase;
944ce8eb11aSdp 	pfn_t	physbase, physmax;
9457c478bd9Sstevel@tonic-gate 	size_t	old_npgs;
9467c478bd9Sstevel@tonic-gate 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
9477c478bd9Sstevel@tonic-gate 	size_t	size_cache[MMU_PAGE_SIZES];
9485d07b933Sdp 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
9495d07b933Sdp 	size_t	*old_color_array[MAX_MNODE_MRANGES];
9507c478bd9Sstevel@tonic-gate 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
9515d07b933Sdp 	pcc_info_t **cands_cache;
9525d07b933Sdp 	pcc_info_t *old_pi, *pi;
9535d07b933Sdp 	pgcnt_t *pgcntp;
9545d07b933Sdp 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
9555d07b933Sdp 	int cands_cache_nranges;
9565d07b933Sdp 	int old_maxmrange, new_maxmrange;
9575d07b933Sdp 	int rc = 0;
9589853d9e8SJason Beloro 	int oldmnode;
9597c478bd9Sstevel@tonic-gate 
9605d07b933Sdp 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
9615d07b933Sdp 	    MMU_PAGE_SIZES, KM_NOSLEEP);
9625d07b933Sdp 	if (cands_cache == NULL)
9635d07b933Sdp 		return (ENOMEM);
9645d07b933Sdp 
965ce8eb11aSdp 	i = -1;
966ce8eb11aSdp 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
967ce8eb11aSdp 
968ce8eb11aSdp 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
969ce8eb11aSdp 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
970ce8eb11aSdp 
9715d07b933Sdp 	/* prepare to free non-null pointers on the way out */
9725d07b933Sdp 	cands_cache_nranges = nranges;
9735d07b933Sdp 	bzero(ctr_cache, sizeof (ctr_cache));
9745d07b933Sdp 	bzero(color_cache, sizeof (color_cache));
9755d07b933Sdp 
9767c478bd9Sstevel@tonic-gate 	/*
9777c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
9787c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
9797c478bd9Sstevel@tonic-gate 	 * arrays.
9807c478bd9Sstevel@tonic-gate 	 */
9815d07b933Sdp 	for (r = 0; r < mmu_page_sizes; r++) {
9825d07b933Sdp 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
9837c478bd9Sstevel@tonic-gate 	}
9847c478bd9Sstevel@tonic-gate 
9857c478bd9Sstevel@tonic-gate 	/*
9867c478bd9Sstevel@tonic-gate 	 * Preallocate all of the new hpm_counters arrays as we can't
9877c478bd9Sstevel@tonic-gate 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
9887c478bd9Sstevel@tonic-gate 	 * If we can't allocate all of the arrays, undo our work so far
9897c478bd9Sstevel@tonic-gate 	 * and return failure.
9907c478bd9Sstevel@tonic-gate 	 */
9917c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
9927c478bd9Sstevel@tonic-gate 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
9935d07b933Sdp 		size_cache[r] = pcsz;
9947c478bd9Sstevel@tonic-gate 		ctr_cache[r] = kmem_zalloc(pcsz *
9957c478bd9Sstevel@tonic-gate 		    sizeof (hpmctr_t), KM_NOSLEEP);
9967c478bd9Sstevel@tonic-gate 		if (ctr_cache[r] == NULL) {
9975d07b933Sdp 			rc = ENOMEM;
9985d07b933Sdp 			goto cleanup;
9997c478bd9Sstevel@tonic-gate 		}
10007c478bd9Sstevel@tonic-gate 	}
10015d07b933Sdp 
10027c478bd9Sstevel@tonic-gate 	/*
10037c478bd9Sstevel@tonic-gate 	 * Preallocate all of the new color current arrays as we can't
10047c478bd9Sstevel@tonic-gate 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
10057c478bd9Sstevel@tonic-gate 	 * If we can't allocate all of the arrays, undo our work so far
10067c478bd9Sstevel@tonic-gate 	 * and return failure.
10077c478bd9Sstevel@tonic-gate 	 */
10087c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
10095d07b933Sdp 		for (mrange = 0; mrange < nranges; mrange++) {
10105d07b933Sdp 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
10115d07b933Sdp 			    colors_per_szc[r], KM_NOSLEEP);
10125d07b933Sdp 			if (color_cache[r][mrange] == NULL) {
10135d07b933Sdp 				rc = ENOMEM;
10145d07b933Sdp 				goto cleanup;
10155d07b933Sdp 			}
10165d07b933Sdp 		}
10175d07b933Sdp 	}
10185d07b933Sdp 
10195d07b933Sdp 	/*
10205d07b933Sdp 	 * Preallocate all of the new pcc_info_t arrays as we can't
10215d07b933Sdp 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
10225d07b933Sdp 	 * If we can't allocate all of the arrays, undo our work so far
10235d07b933Sdp 	 * and return failure.
10245d07b933Sdp 	 */
10255d07b933Sdp 	for (r = 1; r < mmu_page_sizes; r++) {
10265d07b933Sdp 		for (i = 0; i < NPC_MUTEX; i++) {
10275d07b933Sdp 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
10285d07b933Sdp 			    KM_NOSLEEP);
10295d07b933Sdp 			if (pi == NULL) {
10305d07b933Sdp 				rc = ENOMEM;
10315d07b933Sdp 				goto cleanup;
10327c478bd9Sstevel@tonic-gate 			}
10335d07b933Sdp 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
10345d07b933Sdp 
10355d07b933Sdp 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
10365d07b933Sdp 				pgcntp = kmem_zalloc(colors_per_szc[r] *
10375d07b933Sdp 				    sizeof (pgcnt_t), KM_NOSLEEP);
10385d07b933Sdp 				if (pgcntp == NULL) {
10395d07b933Sdp 					rc = ENOMEM;
10405d07b933Sdp 					goto cleanup;
10415d07b933Sdp 				}
10425d07b933Sdp 				pi->pcc_color_free = pgcntp;
10437c478bd9Sstevel@tonic-gate 			}
10447c478bd9Sstevel@tonic-gate 		}
10457c478bd9Sstevel@tonic-gate 	}
10467c478bd9Sstevel@tonic-gate 
10477c478bd9Sstevel@tonic-gate 	/*
10487c478bd9Sstevel@tonic-gate 	 * Grab the write lock to prevent others from walking these arrays
10497c478bd9Sstevel@tonic-gate 	 * while we are modifying them.
10507c478bd9Sstevel@tonic-gate 	 */
1051ce8eb11aSdp 	PAGE_CTRS_WRITE_LOCK(mnode);
10525d07b933Sdp 
10539853d9e8SJason Beloro 	/*
10549853d9e8SJason Beloro 	 * For interleaved mnodes, find the first mnode
10559853d9e8SJason Beloro 	 * with valid page counters since the current
10569853d9e8SJason Beloro 	 * mnode may have just been added and not have
10579853d9e8SJason Beloro 	 * valid page counters.
10589853d9e8SJason Beloro 	 */
10599853d9e8SJason Beloro 	if (interleaved_mnodes) {
10609853d9e8SJason Beloro 		for (i = 0; i < max_mem_nodes; i++)
10619853d9e8SJason Beloro 			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
10629853d9e8SJason Beloro 				break;
10639853d9e8SJason Beloro 		ASSERT(i < max_mem_nodes);
10649853d9e8SJason Beloro 		oldmnode = i;
10659853d9e8SJason Beloro 	} else
10669853d9e8SJason Beloro 		oldmnode = mnode;
10679853d9e8SJason Beloro 
10685d07b933Sdp 	old_nranges = mnode_nranges[mnode];
10695d07b933Sdp 	cands_cache_nranges = old_nranges;
10705d07b933Sdp 	mnode_nranges[mnode] = nranges;
10715d07b933Sdp 	old_maxmrange = mnode_maxmrange[mnode];
10725d07b933Sdp 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
10735d07b933Sdp 	new_maxmrange = mnode_maxmrange[mnode];
10745d07b933Sdp 
10757c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
10767c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
10779853d9e8SJason Beloro 		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
10789853d9e8SJason Beloro 		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
10799853d9e8SJason Beloro 		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
10809853d9e8SJason Beloro 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
10815d07b933Sdp 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
10825d07b933Sdp 			old_color_array[mrange] =
10835d07b933Sdp 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1084ce8eb11aSdp 			    r, mrange);
10855d07b933Sdp 		}
10867c478bd9Sstevel@tonic-gate 
10877c478bd9Sstevel@tonic-gate 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
10887c478bd9Sstevel@tonic-gate 		new_ctr = ctr_cache[r];
10897c478bd9Sstevel@tonic-gate 		ctr_cache[r] = NULL;
10907c478bd9Sstevel@tonic-gate 		if (old_ctr != NULL &&
10917c478bd9Sstevel@tonic-gate 		    (oldbase + old_npgs > newbase) &&
10927c478bd9Sstevel@tonic-gate 		    (newbase + npgs > oldbase)) {
10937c478bd9Sstevel@tonic-gate 			/*
10947c478bd9Sstevel@tonic-gate 			 * Map the intersection of the old and new
10957c478bd9Sstevel@tonic-gate 			 * counters into the new array.
10967c478bd9Sstevel@tonic-gate 			 */
10977c478bd9Sstevel@tonic-gate 			size_t offset;
10987c478bd9Sstevel@tonic-gate 			if (newbase > oldbase) {
10997c478bd9Sstevel@tonic-gate 				offset = (newbase - oldbase) >>
11007c478bd9Sstevel@tonic-gate 				    PAGE_COUNTERS_SHIFT(mnode, r);
11017c478bd9Sstevel@tonic-gate 				bcopy(old_ctr + offset, new_ctr,
11027c478bd9Sstevel@tonic-gate 				    MIN(pcsz, (old_csz - offset)) *
11037c478bd9Sstevel@tonic-gate 				    sizeof (hpmctr_t));
11047c478bd9Sstevel@tonic-gate 			} else {
11057c478bd9Sstevel@tonic-gate 				offset = (oldbase - newbase) >>
11067c478bd9Sstevel@tonic-gate 				    PAGE_COUNTERS_SHIFT(mnode, r);
11077c478bd9Sstevel@tonic-gate 				bcopy(old_ctr, new_ctr + offset,
11087c478bd9Sstevel@tonic-gate 				    MIN(pcsz - offset, old_csz) *
11097c478bd9Sstevel@tonic-gate 				    sizeof (hpmctr_t));
11107c478bd9Sstevel@tonic-gate 			}
11117c478bd9Sstevel@tonic-gate 		}
11127c478bd9Sstevel@tonic-gate 
11137c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
11147c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
11157c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1116ce8eb11aSdp 
1117ce8eb11aSdp 		/* update shared hpm_counters in other mnodes */
1118ce8eb11aSdp 		if (interleaved_mnodes) {
1119ce8eb11aSdp 			for (i = 0; i < max_mem_nodes; i++) {
1120af4c679fSSean McEnroe 				if ((i == mnode) ||
1121af4c679fSSean McEnroe 				    (mem_node_config[i].exists == 0))
1122ce8eb11aSdp 					continue;
11239853d9e8SJason Beloro 				ASSERT(
11249853d9e8SJason Beloro 				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
11259853d9e8SJason Beloro 				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1126ce8eb11aSdp 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1127ce8eb11aSdp 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1128ce8eb11aSdp 				PAGE_COUNTERS_BASE(i, r) = newbase;
1129ce8eb11aSdp 			}
1130ce8eb11aSdp 		}
1131ce8eb11aSdp 
11325d07b933Sdp 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
11335d07b933Sdp 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
11345d07b933Sdp 			    color_cache[r][mrange];
11355d07b933Sdp 			color_cache[r][mrange] = NULL;
11365d07b933Sdp 		}
11377c478bd9Sstevel@tonic-gate 		/*
11387c478bd9Sstevel@tonic-gate 		 * for now, just reset on these events as it's probably
11397c478bd9Sstevel@tonic-gate 		 * not worthwhile to try and optimize this.
11407c478bd9Sstevel@tonic-gate 		 */
11417c478bd9Sstevel@tonic-gate 		for (i = 0; i < colors_per_szc[r]; i++) {
11425d07b933Sdp 			uint_t color_mask = colors_per_szc[r] - 1;
1143ce8eb11aSdp 			int mlo = interleaved_mnodes ? 0 : mnode;
1144ce8eb11aSdp 			int mhi = interleaved_mnodes ? max_mem_nodes :
1145ce8eb11aSdp 			    (mnode + 1);
1146ce8eb11aSdp 			int m;
11479853d9e8SJason Beloro 			pfn_t  pfnum;
11485d07b933Sdp 			size_t idx;
1149ce8eb11aSdp 			MEM_NODE_ITERATOR_DECL(it);
11505d07b933Sdp 
1151ce8eb11aSdp 			for (m = mlo; m < mhi; m++) {
1152ce8eb11aSdp 				if (mem_node_config[m].exists == 0)
1153ce8eb11aSdp 					continue;
11549853d9e8SJason Beloro 				pfnum = newbase;
1155b779d3e0Sdp 				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1156b779d3e0Sdp 				if (pfnum == (pfn_t)-1) {
1157b779d3e0Sdp 					idx = 0;
1158b779d3e0Sdp 				} else {
1159b779d3e0Sdp 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1160b779d3e0Sdp 					    color_mask, color_mask, &it);
1161b779d3e0Sdp 					idx = PNUM_TO_IDX(m, r, pfnum);
1162b779d3e0Sdp 					idx = (idx < pcsz) ? idx : 0;
1163b779d3e0Sdp 				}
1164ce8eb11aSdp 				for (mrange = 0; mrange < nranges; mrange++) {
11659853d9e8SJason Beloro 					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
11669853d9e8SJason Beloro 					    r, mrange) != NULL)
11679853d9e8SJason Beloro 						PAGE_COUNTERS_CURRENT_COLOR(m,
11689853d9e8SJason Beloro 						    r, i, mrange) = idx;
1169ce8eb11aSdp 				}
11705d07b933Sdp 			}
11717c478bd9Sstevel@tonic-gate 		}
11727c478bd9Sstevel@tonic-gate 
11737c478bd9Sstevel@tonic-gate 		/* cache info for freeing out of the critical path */
11747c478bd9Sstevel@tonic-gate 		if ((caddr_t)old_ctr >= kernelheap &&
11757c478bd9Sstevel@tonic-gate 		    (caddr_t)old_ctr < ekernelheap) {
11767c478bd9Sstevel@tonic-gate 			ctr_cache[r] = old_ctr;
11777c478bd9Sstevel@tonic-gate 			size_cache[r] = old_csz;
11787c478bd9Sstevel@tonic-gate 		}
11795d07b933Sdp 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
11805d07b933Sdp 			size_t *tmp = old_color_array[mrange];
11815d07b933Sdp 			if ((caddr_t)tmp >= kernelheap &&
11825d07b933Sdp 			    (caddr_t)tmp < ekernelheap) {
11835d07b933Sdp 				color_cache[r][mrange] = tmp;
11845d07b933Sdp 			}
11857c478bd9Sstevel@tonic-gate 		}
11867c478bd9Sstevel@tonic-gate 		/*
11877c478bd9Sstevel@tonic-gate 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
11887c478bd9Sstevel@tonic-gate 		 * satisfy the identity requirement.
11897c478bd9Sstevel@tonic-gate 		 * We should be able to go from one to the other
11907c478bd9Sstevel@tonic-gate 		 * and get consistent values.
11917c478bd9Sstevel@tonic-gate 		 */
11927c478bd9Sstevel@tonic-gate 		ASSERT(PNUM_TO_IDX(mnode, r,
11937c478bd9Sstevel@tonic-gate 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
11947c478bd9Sstevel@tonic-gate 		ASSERT(IDX_TO_PNUM(mnode, r,
11957c478bd9Sstevel@tonic-gate 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
11965d07b933Sdp 
11975d07b933Sdp 		/* pcc_info_t and pcc_color_free */
11985d07b933Sdp 		for (i = 0; i < NPC_MUTEX; i++) {
11995d07b933Sdp 			pcc_info_t *epi;
12005d07b933Sdp 			pcc_info_t *eold_pi;
12015d07b933Sdp 
12025d07b933Sdp 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
12035d07b933Sdp 			old_pi = page_ctrs_cands[i][r][mnode];
12045d07b933Sdp 			page_ctrs_cands[i][r][mnode] = pi;
12055d07b933Sdp 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
12065d07b933Sdp 
12075d07b933Sdp 			/* preserve old pcc_color_free values, if any */
12085d07b933Sdp 			if (old_pi == NULL)
12095d07b933Sdp 				continue;
12105d07b933Sdp 
12115d07b933Sdp 			/*
12125d07b933Sdp 			 * when/if x86 does DR, must account for
12135d07b933Sdp 			 * possible change in range index when
12145d07b933Sdp 			 * preserving pcc_info
12155d07b933Sdp 			 */
12165d07b933Sdp 			epi = &pi[nranges];
12175d07b933Sdp 			eold_pi = &old_pi[old_nranges];
12185d07b933Sdp 			if (new_maxmrange > old_maxmrange) {
12195d07b933Sdp 				pi += new_maxmrange - old_maxmrange;
12205d07b933Sdp 			} else if (new_maxmrange < old_maxmrange) {
12215d07b933Sdp 				old_pi += old_maxmrange - new_maxmrange;
12225d07b933Sdp 			}
12235d07b933Sdp 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
12245d07b933Sdp 				pcc_info_t tmp = *pi;
12255d07b933Sdp 				*pi = *old_pi;
12265d07b933Sdp 				*old_pi = tmp;
12275d07b933Sdp 			}
12285d07b933Sdp 		}
12297c478bd9Sstevel@tonic-gate 	}
1230ce8eb11aSdp 	PAGE_CTRS_WRITE_UNLOCK(mnode);
12317c478bd9Sstevel@tonic-gate 
12327c478bd9Sstevel@tonic-gate 	/*
12337c478bd9Sstevel@tonic-gate 	 * Now that we have dropped the write lock, it is safe to free all
12347c478bd9Sstevel@tonic-gate 	 * of the memory we have cached above.
12355d07b933Sdp 	 * We come thru here to free memory when pre-alloc fails, and also to
12365d07b933Sdp 	 * free old pointers which were recorded while locked.
12377c478bd9Sstevel@tonic-gate 	 */
12385d07b933Sdp cleanup:
12397c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
12407c478bd9Sstevel@tonic-gate 		if (ctr_cache[r] != NULL) {
12417c478bd9Sstevel@tonic-gate 			kmem_free(ctr_cache[r],
12427c478bd9Sstevel@tonic-gate 			    size_cache[r] * sizeof (hpmctr_t));
12437c478bd9Sstevel@tonic-gate 		}
12445d07b933Sdp 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
12455d07b933Sdp 			if (color_cache[r][mrange] != NULL) {
12465d07b933Sdp 				kmem_free(color_cache[r][mrange],
12475d07b933Sdp 				    colors_per_szc[r] * sizeof (size_t));
12485d07b933Sdp 			}
12495d07b933Sdp 		}
12505d07b933Sdp 		for (i = 0; i < NPC_MUTEX; i++) {
12515d07b933Sdp 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
12525d07b933Sdp 			if (pi == NULL)
12535d07b933Sdp 				continue;
12545d07b933Sdp 			nr = cands_cache_nranges;
12555d07b933Sdp 			for (mrange = 0; mrange < nr; mrange++, pi++) {
12565d07b933Sdp 				pgcntp = pi->pcc_color_free;
12575d07b933Sdp 				if (pgcntp == NULL)
12585d07b933Sdp 					continue;
12595d07b933Sdp 				if ((caddr_t)pgcntp >= kernelheap &&
12605d07b933Sdp 				    (caddr_t)pgcntp < ekernelheap) {
12615d07b933Sdp 					kmem_free(pgcntp,
12625d07b933Sdp 					    colors_per_szc[r] *
12635d07b933Sdp 					    sizeof (pgcnt_t));
12645d07b933Sdp 				}
12655d07b933Sdp 			}
12665d07b933Sdp 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
12675d07b933Sdp 			if ((caddr_t)pi >= kernelheap &&
12685d07b933Sdp 			    (caddr_t)pi < ekernelheap) {
12695d07b933Sdp 				kmem_free(pi, nr * sizeof (pcc_info_t));
12705d07b933Sdp 			}
12717c478bd9Sstevel@tonic-gate 		}
12727c478bd9Sstevel@tonic-gate 	}
12737c478bd9Sstevel@tonic-gate 
12745d07b933Sdp 	kmem_free(cands_cache,
12755d07b933Sdp 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
12765d07b933Sdp 	return (rc);
12777c478bd9Sstevel@tonic-gate }
12787c478bd9Sstevel@tonic-gate 
1279af4c679fSSean McEnroe /*
1280af4c679fSSean McEnroe  * Cleanup the hpm_counters field in the page counters
1281af4c679fSSean McEnroe  * array.
1282af4c679fSSean McEnroe  */
1283af4c679fSSean McEnroe void
1284af4c679fSSean McEnroe page_ctrs_cleanup(void)
1285af4c679fSSean McEnroe {
1286af4c679fSSean McEnroe 	int r;	/* region size */
1287af4c679fSSean McEnroe 	int i;	/* mnode index */
1288af4c679fSSean McEnroe 
1289af4c679fSSean McEnroe 	/*
1290af4c679fSSean McEnroe 	 * Get the page counters write lock while we are
1291af4c679fSSean McEnroe 	 * setting the page hpm_counters field to NULL
1292af4c679fSSean McEnroe 	 * for non-existent mnodes.
1293af4c679fSSean McEnroe 	 */
1294af4c679fSSean McEnroe 	for (i = 0; i < max_mem_nodes; i++) {
1295af4c679fSSean McEnroe 		PAGE_CTRS_WRITE_LOCK(i);
1296af4c679fSSean McEnroe 		if (mem_node_config[i].exists) {
1297af4c679fSSean McEnroe 			PAGE_CTRS_WRITE_UNLOCK(i);
1298af4c679fSSean McEnroe 			continue;
1299af4c679fSSean McEnroe 		}
1300af4c679fSSean McEnroe 		for (r = 1; r < mmu_page_sizes; r++) {
1301af4c679fSSean McEnroe 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1302af4c679fSSean McEnroe 		}
1303af4c679fSSean McEnroe 		PAGE_CTRS_WRITE_UNLOCK(i);
1304af4c679fSSean McEnroe 	}
1305af4c679fSSean McEnroe }
13065d07b933Sdp 
13077c478bd9Sstevel@tonic-gate #ifdef DEBUG
13087c478bd9Sstevel@tonic-gate 
13097c478bd9Sstevel@tonic-gate /*
13107c478bd9Sstevel@tonic-gate  * confirm pp is a large page corresponding to szc
13117c478bd9Sstevel@tonic-gate  */
13127c478bd9Sstevel@tonic-gate void
13137c478bd9Sstevel@tonic-gate chk_lpg(page_t *pp, uchar_t szc)
13147c478bd9Sstevel@tonic-gate {
13157c478bd9Sstevel@tonic-gate 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
13167c478bd9Sstevel@tonic-gate 	uint_t noreloc;
13177c478bd9Sstevel@tonic-gate 
13187c478bd9Sstevel@tonic-gate 	if (npgs == 1) {
13197c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
13207c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_next == pp);
13217c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_prev == pp);
13227c478bd9Sstevel@tonic-gate 		return;
13237c478bd9Sstevel@tonic-gate 	}
13247c478bd9Sstevel@tonic-gate 
13257c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
13267c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
13277c478bd9Sstevel@tonic-gate 
13287c478bd9Sstevel@tonic-gate 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
13297c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
13307c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
13317c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
13327c478bd9Sstevel@tonic-gate 
13337c478bd9Sstevel@tonic-gate 	/*
13347c478bd9Sstevel@tonic-gate 	 * Check list of pages.
13357c478bd9Sstevel@tonic-gate 	 */
13367c478bd9Sstevel@tonic-gate 	noreloc = PP_ISNORELOC(pp);
13377c478bd9Sstevel@tonic-gate 	while (npgs--) {
13387c478bd9Sstevel@tonic-gate 		if (npgs != 0) {
13397c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
13407c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_next == (pp + 1));
13417c478bd9Sstevel@tonic-gate 		}
13427c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == szc);
13437c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
13447c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
13457c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
13467c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
13477c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode  == NULL);
13487c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISNORELOC(pp) == noreloc);
13497c478bd9Sstevel@tonic-gate 
13507c478bd9Sstevel@tonic-gate 		pp = pp->p_next;
13517c478bd9Sstevel@tonic-gate 	}
13527c478bd9Sstevel@tonic-gate }
13537c478bd9Sstevel@tonic-gate #endif /* DEBUG */
13547c478bd9Sstevel@tonic-gate 
13557c478bd9Sstevel@tonic-gate void
13567c478bd9Sstevel@tonic-gate page_freelist_lock(int mnode)
13577c478bd9Sstevel@tonic-gate {
13587c478bd9Sstevel@tonic-gate 	int i;
13597c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
13607c478bd9Sstevel@tonic-gate 		mutex_enter(FPC_MUTEX(mnode, i));
13617c478bd9Sstevel@tonic-gate 		mutex_enter(CPC_MUTEX(mnode, i));
13627c478bd9Sstevel@tonic-gate 	}
13637c478bd9Sstevel@tonic-gate }
13647c478bd9Sstevel@tonic-gate 
13657c478bd9Sstevel@tonic-gate void
13667c478bd9Sstevel@tonic-gate page_freelist_unlock(int mnode)
13677c478bd9Sstevel@tonic-gate {
13687c478bd9Sstevel@tonic-gate 	int i;
13697c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
13707c478bd9Sstevel@tonic-gate 		mutex_exit(FPC_MUTEX(mnode, i));
13717c478bd9Sstevel@tonic-gate 		mutex_exit(CPC_MUTEX(mnode, i));
13727c478bd9Sstevel@tonic-gate 	}
13737c478bd9Sstevel@tonic-gate }
13747c478bd9Sstevel@tonic-gate 
13757c478bd9Sstevel@tonic-gate /*
13767c478bd9Sstevel@tonic-gate  * add pp to the specified page list. Defaults to head of the page list
13777c478bd9Sstevel@tonic-gate  * unless PG_LIST_TAIL is specified.
13787c478bd9Sstevel@tonic-gate  */
13797c478bd9Sstevel@tonic-gate void
13807c478bd9Sstevel@tonic-gate page_list_add(page_t *pp, int flags)
13817c478bd9Sstevel@tonic-gate {
13827c478bd9Sstevel@tonic-gate 	page_t		**ppp;
13837c478bd9Sstevel@tonic-gate 	kmutex_t	*pcm;
13847c478bd9Sstevel@tonic-gate 	uint_t		bin, mtype;
13857c478bd9Sstevel@tonic-gate 	int		mnode;
13867c478bd9Sstevel@tonic-gate 
13877c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
13887c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
13897c478bd9Sstevel@tonic-gate 	ASSERT(!hat_page_is_mapped(pp));
13907c478bd9Sstevel@tonic-gate 	ASSERT(hat_page_getshare(pp) == 0);
13917c478bd9Sstevel@tonic-gate 
13927c478bd9Sstevel@tonic-gate 	/*
13937c478bd9Sstevel@tonic-gate 	 * Large pages should be freed via page_list_add_pages().
13947c478bd9Sstevel@tonic-gate 	 */
13957c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
13967c478bd9Sstevel@tonic-gate 
13977c478bd9Sstevel@tonic-gate 	/*
13987c478bd9Sstevel@tonic-gate 	 * Don't need to lock the freelist first here
13997c478bd9Sstevel@tonic-gate 	 * because the page isn't on the freelist yet.
14007c478bd9Sstevel@tonic-gate 	 * This means p_szc can't change on us.
14017c478bd9Sstevel@tonic-gate 	 */
14027c478bd9Sstevel@tonic-gate 
14037c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
14047c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
14057c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
14067c478bd9Sstevel@tonic-gate 
14077c478bd9Sstevel@tonic-gate 	if (flags & PG_LIST_ISINIT) {
14087c478bd9Sstevel@tonic-gate 		/*
14097c478bd9Sstevel@tonic-gate 		 * PG_LIST_ISINIT is set during system startup (ie. single
14107c478bd9Sstevel@tonic-gate 		 * threaded), add a page to the free list and add to the
14117c478bd9Sstevel@tonic-gate 		 * the free region counters w/o any locking
14127c478bd9Sstevel@tonic-gate 		 */
14137c478bd9Sstevel@tonic-gate 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
14147c478bd9Sstevel@tonic-gate 
14157c478bd9Sstevel@tonic-gate 		/* inline version of page_add() */
14167c478bd9Sstevel@tonic-gate 		if (*ppp != NULL) {
14177c478bd9Sstevel@tonic-gate 			pp->p_next = *ppp;
14187c478bd9Sstevel@tonic-gate 			pp->p_prev = (*ppp)->p_prev;
14197c478bd9Sstevel@tonic-gate 			(*ppp)->p_prev = pp;
14207c478bd9Sstevel@tonic-gate 			pp->p_prev->p_next = pp;
14217c478bd9Sstevel@tonic-gate 		} else
14227c478bd9Sstevel@tonic-gate 			*ppp = pp;
14237c478bd9Sstevel@tonic-gate 
1424affbd3ccSkchow 		page_ctr_add_internal(mnode, mtype, pp, flags);
1425affbd3ccSkchow 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
14267c478bd9Sstevel@tonic-gate 	} else {
14277c478bd9Sstevel@tonic-gate 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
14287c478bd9Sstevel@tonic-gate 
14297c478bd9Sstevel@tonic-gate 		if (flags & PG_FREE_LIST) {
1430affbd3ccSkchow 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
14317c478bd9Sstevel@tonic-gate 			ASSERT(PP_ISAGED(pp));
14327c478bd9Sstevel@tonic-gate 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
14337c478bd9Sstevel@tonic-gate 
14347c478bd9Sstevel@tonic-gate 		} else {
1435affbd3ccSkchow 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
14367c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_vnode);
14377c478bd9Sstevel@tonic-gate 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
14387c478bd9Sstevel@tonic-gate 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
14397c478bd9Sstevel@tonic-gate 		}
14407c478bd9Sstevel@tonic-gate 		mutex_enter(pcm);
14417c478bd9Sstevel@tonic-gate 		page_add(ppp, pp);
14427c478bd9Sstevel@tonic-gate 
14437c478bd9Sstevel@tonic-gate 		if (flags & PG_LIST_TAIL)
14447c478bd9Sstevel@tonic-gate 			*ppp = (*ppp)->p_next;
14457c478bd9Sstevel@tonic-gate 		/*
14467c478bd9Sstevel@tonic-gate 		 * Add counters before releasing pcm mutex to avoid a race with
14475d07b933Sdp 		 * page_freelist_coalesce and page_freelist_split.
14487c478bd9Sstevel@tonic-gate 		 */
1449affbd3ccSkchow 		page_ctr_add(mnode, mtype, pp, flags);
14507c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
14517c478bd9Sstevel@tonic-gate 	}
14527c478bd9Sstevel@tonic-gate 
14537c478bd9Sstevel@tonic-gate 
14547c478bd9Sstevel@tonic-gate #if defined(__sparc)
14557c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
14567c478bd9Sstevel@tonic-gate 		kcage_freemem_add(1);
14577c478bd9Sstevel@tonic-gate 	}
14587c478bd9Sstevel@tonic-gate #endif
14597c478bd9Sstevel@tonic-gate 	/*
14607c478bd9Sstevel@tonic-gate 	 * It is up to the caller to unlock the page!
14617c478bd9Sstevel@tonic-gate 	 */
14627c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
14637c478bd9Sstevel@tonic-gate }
14647c478bd9Sstevel@tonic-gate 
14657c478bd9Sstevel@tonic-gate 
14667c478bd9Sstevel@tonic-gate #ifdef __sparc
14677c478bd9Sstevel@tonic-gate /*
14687c478bd9Sstevel@tonic-gate  * This routine is only used by kcage_init during system startup.
14697c478bd9Sstevel@tonic-gate  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
14707c478bd9Sstevel@tonic-gate  * without the overhead of taking locks and updating counters.
14717c478bd9Sstevel@tonic-gate  */
14727c478bd9Sstevel@tonic-gate void
14737c478bd9Sstevel@tonic-gate page_list_noreloc_startup(page_t *pp)
14747c478bd9Sstevel@tonic-gate {
14757c478bd9Sstevel@tonic-gate 	page_t		**ppp;
14767c478bd9Sstevel@tonic-gate 	uint_t		bin;
14777c478bd9Sstevel@tonic-gate 	int		mnode;
14787c478bd9Sstevel@tonic-gate 	int		mtype;
1479e21bae1bSkchow 	int		flags = 0;
14807c478bd9Sstevel@tonic-gate 
14817c478bd9Sstevel@tonic-gate 	/*
14827c478bd9Sstevel@tonic-gate 	 * If this is a large page on the freelist then
14837c478bd9Sstevel@tonic-gate 	 * break it up into smaller pages.
14847c478bd9Sstevel@tonic-gate 	 */
14857c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0)
14867c478bd9Sstevel@tonic-gate 		page_boot_demote(pp);
14877c478bd9Sstevel@tonic-gate 
14887c478bd9Sstevel@tonic-gate 	/*
14897c478bd9Sstevel@tonic-gate 	 * Get list page is currently on.
14907c478bd9Sstevel@tonic-gate 	 */
14917c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
14927c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
14937c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
14947c478bd9Sstevel@tonic-gate 	ASSERT(mtype == MTYPE_RELOC);
14957c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
14967c478bd9Sstevel@tonic-gate 
14977c478bd9Sstevel@tonic-gate 	if (PP_ISAGED(pp)) {
14987c478bd9Sstevel@tonic-gate 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
14997c478bd9Sstevel@tonic-gate 		flags |= PG_FREE_LIST;
15007c478bd9Sstevel@tonic-gate 	} else {
15017c478bd9Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
15027c478bd9Sstevel@tonic-gate 		flags |= PG_CACHE_LIST;
15037c478bd9Sstevel@tonic-gate 	}
15047c478bd9Sstevel@tonic-gate 
15057c478bd9Sstevel@tonic-gate 	ASSERT(*ppp != NULL);
15067c478bd9Sstevel@tonic-gate 
15077c478bd9Sstevel@tonic-gate 	/*
15087c478bd9Sstevel@tonic-gate 	 * Delete page from current list.
15097c478bd9Sstevel@tonic-gate 	 */
15107c478bd9Sstevel@tonic-gate 	if (*ppp == pp)
15117c478bd9Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
15127c478bd9Sstevel@tonic-gate 	if (*ppp == pp) {
15137c478bd9Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
15147c478bd9Sstevel@tonic-gate 	} else {
15157c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
15167c478bd9Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
15177c478bd9Sstevel@tonic-gate 	}
15187c478bd9Sstevel@tonic-gate 
15195d07b933Sdp 	/*
15205d07b933Sdp 	 * Decrement page counters
15215d07b933Sdp 	 */
15225d07b933Sdp 	page_ctr_sub_internal(mnode, mtype, pp, flags);
15237c478bd9Sstevel@tonic-gate 
15247c478bd9Sstevel@tonic-gate 	/*
15257c478bd9Sstevel@tonic-gate 	 * Set no reloc for cage initted pages.
15267c478bd9Sstevel@tonic-gate 	 */
15277c478bd9Sstevel@tonic-gate 	PP_SETNORELOC(pp);
15287c478bd9Sstevel@tonic-gate 
15297c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
15307c478bd9Sstevel@tonic-gate 	ASSERT(mtype == MTYPE_NORELOC);
15317c478bd9Sstevel@tonic-gate 
15327c478bd9Sstevel@tonic-gate 	/*
15337c478bd9Sstevel@tonic-gate 	 * Get new list for page.
15347c478bd9Sstevel@tonic-gate 	 */
15357c478bd9Sstevel@tonic-gate 	if (PP_ISAGED(pp)) {
15367c478bd9Sstevel@tonic-gate 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
15377c478bd9Sstevel@tonic-gate 	} else {
15387c478bd9Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
15397c478bd9Sstevel@tonic-gate 	}
15407c478bd9Sstevel@tonic-gate 
15417c478bd9Sstevel@tonic-gate 	/*
15427c478bd9Sstevel@tonic-gate 	 * Insert page on new list.
15437c478bd9Sstevel@tonic-gate 	 */
15447c478bd9Sstevel@tonic-gate 	if (*ppp == NULL) {
15457c478bd9Sstevel@tonic-gate 		*ppp = pp;
15467c478bd9Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
15477c478bd9Sstevel@tonic-gate 	} else {
15487c478bd9Sstevel@tonic-gate 		pp->p_next = *ppp;
15497c478bd9Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
15507c478bd9Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
15517c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
15527c478bd9Sstevel@tonic-gate 	}
15537c478bd9Sstevel@tonic-gate 
15545d07b933Sdp 	/*
15555d07b933Sdp 	 * Increment page counters
15565d07b933Sdp 	 */
15575d07b933Sdp 	page_ctr_add_internal(mnode, mtype, pp, flags);
15587c478bd9Sstevel@tonic-gate 
15597c478bd9Sstevel@tonic-gate 	/*
15607c478bd9Sstevel@tonic-gate 	 * Update cage freemem counter
15617c478bd9Sstevel@tonic-gate 	 */
15627c478bd9Sstevel@tonic-gate 	atomic_add_long(&kcage_freemem, 1);
15637c478bd9Sstevel@tonic-gate }
15647c478bd9Sstevel@tonic-gate #else	/* __sparc */
15657c478bd9Sstevel@tonic-gate 
15667c478bd9Sstevel@tonic-gate /* ARGSUSED */
15677c478bd9Sstevel@tonic-gate void
15687c478bd9Sstevel@tonic-gate page_list_noreloc_startup(page_t *pp)
15697c478bd9Sstevel@tonic-gate {
15707c478bd9Sstevel@tonic-gate 	panic("page_list_noreloc_startup: should be here only for sparc");
15717c478bd9Sstevel@tonic-gate }
15727c478bd9Sstevel@tonic-gate #endif
15737c478bd9Sstevel@tonic-gate 
15747c478bd9Sstevel@tonic-gate void
15757c478bd9Sstevel@tonic-gate page_list_add_pages(page_t *pp, int flags)
15767c478bd9Sstevel@tonic-gate {
15777c478bd9Sstevel@tonic-gate 	kmutex_t *pcm;
15787c478bd9Sstevel@tonic-gate 	pgcnt_t	pgcnt;
15797c478bd9Sstevel@tonic-gate 	uint_t	bin, mtype, i;
15807c478bd9Sstevel@tonic-gate 	int	mnode;
15817c478bd9Sstevel@tonic-gate 
15827c478bd9Sstevel@tonic-gate 	/* default to freelist/head */
15837c478bd9Sstevel@tonic-gate 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
15847c478bd9Sstevel@tonic-gate 
15857c478bd9Sstevel@tonic-gate 	CHK_LPG(pp, pp->p_szc);
1586affbd3ccSkchow 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
15877c478bd9Sstevel@tonic-gate 
15887c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
15897c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
15907c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
15917c478bd9Sstevel@tonic-gate 
15927c478bd9Sstevel@tonic-gate 	if (flags & PG_LIST_ISINIT) {
15937c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
15947c478bd9Sstevel@tonic-gate 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
15957c478bd9Sstevel@tonic-gate 		ASSERT(!PP_ISNORELOC(pp));
1596affbd3ccSkchow 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
15977c478bd9Sstevel@tonic-gate 	} else {
15987c478bd9Sstevel@tonic-gate 
15997c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
16007c478bd9Sstevel@tonic-gate 
16017c478bd9Sstevel@tonic-gate 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
16027c478bd9Sstevel@tonic-gate 
16037c478bd9Sstevel@tonic-gate 		mutex_enter(pcm);
16047c478bd9Sstevel@tonic-gate 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1605affbd3ccSkchow 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
16067c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
16077c478bd9Sstevel@tonic-gate 
16087c478bd9Sstevel@tonic-gate 		pgcnt = page_get_pagecnt(pp->p_szc);
16097c478bd9Sstevel@tonic-gate #if defined(__sparc)
16107c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(pp))
16117c478bd9Sstevel@tonic-gate 			kcage_freemem_add(pgcnt);
16127c478bd9Sstevel@tonic-gate #endif
16137c478bd9Sstevel@tonic-gate 		for (i = 0; i < pgcnt; i++, pp++)
16148b464eb8Smec 			page_unlock_nocapture(pp);
16157c478bd9Sstevel@tonic-gate 	}
16167c478bd9Sstevel@tonic-gate }
16177c478bd9Sstevel@tonic-gate 
16187c478bd9Sstevel@tonic-gate /*
16197c478bd9Sstevel@tonic-gate  * During boot, need to demote a large page to base
16207c478bd9Sstevel@tonic-gate  * pagesize pages for seg_kmem for use in boot_alloc()
16217c478bd9Sstevel@tonic-gate  */
16227c478bd9Sstevel@tonic-gate void
16237c478bd9Sstevel@tonic-gate page_boot_demote(page_t *pp)
16247c478bd9Sstevel@tonic-gate {
16257c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0);
16267c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16277c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
16287c478bd9Sstevel@tonic-gate 
16297c478bd9Sstevel@tonic-gate 	(void) page_demote(PP_2_MEM_NODE(pp),
163019397407SSherry Moore 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
16317c478bd9Sstevel@tonic-gate 	    PC_FREE);
16327c478bd9Sstevel@tonic-gate 
16337c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16347c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
16357c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
16367c478bd9Sstevel@tonic-gate }
16377c478bd9Sstevel@tonic-gate 
16387c478bd9Sstevel@tonic-gate /*
16397c478bd9Sstevel@tonic-gate  * Take a particular page off of whatever freelist the page
16407c478bd9Sstevel@tonic-gate  * is claimed to be on.
16417c478bd9Sstevel@tonic-gate  *
16427c478bd9Sstevel@tonic-gate  * NOTE: Only used for PAGESIZE pages.
16437c478bd9Sstevel@tonic-gate  */
16447c478bd9Sstevel@tonic-gate void
16457c478bd9Sstevel@tonic-gate page_list_sub(page_t *pp, int flags)
16467c478bd9Sstevel@tonic-gate {
16477c478bd9Sstevel@tonic-gate 	int		bin;
16487c478bd9Sstevel@tonic-gate 	uint_t		mtype;
16497c478bd9Sstevel@tonic-gate 	int		mnode;
16507c478bd9Sstevel@tonic-gate 	kmutex_t	*pcm;
16517c478bd9Sstevel@tonic-gate 	page_t		**ppp;
16527c478bd9Sstevel@tonic-gate 
16537c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
16547c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16557c478bd9Sstevel@tonic-gate 
16567c478bd9Sstevel@tonic-gate 	/*
16577c478bd9Sstevel@tonic-gate 	 * The p_szc field can only be changed by page_promote()
16587c478bd9Sstevel@tonic-gate 	 * and page_demote(). Only free pages can be promoted and
16597c478bd9Sstevel@tonic-gate 	 * demoted and the free list MUST be locked during these
16607c478bd9Sstevel@tonic-gate 	 * operations. So to prevent a race in page_list_sub()
16617c478bd9Sstevel@tonic-gate 	 * between computing which bin of the freelist lock to
16627c478bd9Sstevel@tonic-gate 	 * grab and actually grabing the lock we check again that
16637c478bd9Sstevel@tonic-gate 	 * the bin we locked is still the correct one. Notice that
16647c478bd9Sstevel@tonic-gate 	 * the p_szc field could have actually changed on us but
16657c478bd9Sstevel@tonic-gate 	 * if the bin happens to still be the same we are safe.
16667c478bd9Sstevel@tonic-gate 	 */
16677c478bd9Sstevel@tonic-gate try_again:
16687c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
16697c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
16707c478bd9Sstevel@tonic-gate 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
16717c478bd9Sstevel@tonic-gate 	mutex_enter(pcm);
16727c478bd9Sstevel@tonic-gate 	if (PP_2_BIN(pp) != bin) {
16737c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
16747c478bd9Sstevel@tonic-gate 		goto try_again;
16757c478bd9Sstevel@tonic-gate 	}
16767c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
16777c478bd9Sstevel@tonic-gate 
16787c478bd9Sstevel@tonic-gate 	if (flags & PG_FREE_LIST) {
1679affbd3ccSkchow 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
16807c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
16817c478bd9Sstevel@tonic-gate 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
16827c478bd9Sstevel@tonic-gate 	} else {
1683affbd3ccSkchow 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
16847c478bd9Sstevel@tonic-gate 		ASSERT(!PP_ISAGED(pp));
16857c478bd9Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
16867c478bd9Sstevel@tonic-gate 	}
16877c478bd9Sstevel@tonic-gate 
16887c478bd9Sstevel@tonic-gate 	/*
16897c478bd9Sstevel@tonic-gate 	 * Common PAGESIZE case.
16907c478bd9Sstevel@tonic-gate 	 *
16917c478bd9Sstevel@tonic-gate 	 * Note that we locked the freelist. This prevents
16927c478bd9Sstevel@tonic-gate 	 * any page promotion/demotion operations. Therefore
16937c478bd9Sstevel@tonic-gate 	 * the p_szc will not change until we drop pcm mutex.
16947c478bd9Sstevel@tonic-gate 	 */
16957c478bd9Sstevel@tonic-gate 	if (pp->p_szc == 0) {
16967c478bd9Sstevel@tonic-gate 		page_sub(ppp, pp);
16977c478bd9Sstevel@tonic-gate 		/*
16987c478bd9Sstevel@tonic-gate 		 * Subtract counters before releasing pcm mutex
16997c478bd9Sstevel@tonic-gate 		 * to avoid race with page_freelist_coalesce.
17007c478bd9Sstevel@tonic-gate 		 */
1701affbd3ccSkchow 		page_ctr_sub(mnode, mtype, pp, flags);
17027c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
17037c478bd9Sstevel@tonic-gate 
17047c478bd9Sstevel@tonic-gate #if defined(__sparc)
17057c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(pp)) {
17067c478bd9Sstevel@tonic-gate 			kcage_freemem_sub(1);
17077c478bd9Sstevel@tonic-gate 		}
17087c478bd9Sstevel@tonic-gate #endif
17097c478bd9Sstevel@tonic-gate 		return;
17107c478bd9Sstevel@tonic-gate 	}
17117c478bd9Sstevel@tonic-gate 
17127c478bd9Sstevel@tonic-gate 	/*
17137c478bd9Sstevel@tonic-gate 	 * Large pages on the cache list are not supported.
17147c478bd9Sstevel@tonic-gate 	 */
17157c478bd9Sstevel@tonic-gate 	if (flags & PG_CACHE_LIST)
17167c478bd9Sstevel@tonic-gate 		panic("page_list_sub: large page on cachelist");
17177c478bd9Sstevel@tonic-gate 
17187c478bd9Sstevel@tonic-gate 	/*
17197c478bd9Sstevel@tonic-gate 	 * Slow but rare.
17207c478bd9Sstevel@tonic-gate 	 *
17217c478bd9Sstevel@tonic-gate 	 * Somebody wants this particular page which is part
17227c478bd9Sstevel@tonic-gate 	 * of a large page. In this case we just demote the page
17237c478bd9Sstevel@tonic-gate 	 * if it's on the freelist.
17247c478bd9Sstevel@tonic-gate 	 *
17257c478bd9Sstevel@tonic-gate 	 * We have to drop pcm before locking the entire freelist.
17267c478bd9Sstevel@tonic-gate 	 * Once we have re-locked the freelist check to make sure
17277c478bd9Sstevel@tonic-gate 	 * the page hasn't already been demoted or completely
17287c478bd9Sstevel@tonic-gate 	 * freed.
17297c478bd9Sstevel@tonic-gate 	 */
17307c478bd9Sstevel@tonic-gate 	mutex_exit(pcm);
17317c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
17327c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0) {
17337c478bd9Sstevel@tonic-gate 		/*
17347c478bd9Sstevel@tonic-gate 		 * Large page is on freelist.
17357c478bd9Sstevel@tonic-gate 		 */
17367c478bd9Sstevel@tonic-gate 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
173719397407SSherry Moore 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
17387c478bd9Sstevel@tonic-gate 	}
17397c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
17407c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
17417c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
17427c478bd9Sstevel@tonic-gate 
17437c478bd9Sstevel@tonic-gate 	/*
17447c478bd9Sstevel@tonic-gate 	 * Subtract counters before releasing pcm mutex
17457c478bd9Sstevel@tonic-gate 	 * to avoid race with page_freelist_coalesce.
17467c478bd9Sstevel@tonic-gate 	 */
17477c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
17487c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
17497c478bd9Sstevel@tonic-gate 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
17507c478bd9Sstevel@tonic-gate 
17517c478bd9Sstevel@tonic-gate 	page_sub(ppp, pp);
1752affbd3ccSkchow 	page_ctr_sub(mnode, mtype, pp, flags);
17537c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
17547c478bd9Sstevel@tonic-gate 
17557c478bd9Sstevel@tonic-gate #if defined(__sparc)
17567c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
17577c478bd9Sstevel@tonic-gate 		kcage_freemem_sub(1);
17587c478bd9Sstevel@tonic-gate 	}
17597c478bd9Sstevel@tonic-gate #endif
17607c478bd9Sstevel@tonic-gate }
17617c478bd9Sstevel@tonic-gate 
17627c478bd9Sstevel@tonic-gate void
17637c478bd9Sstevel@tonic-gate page_list_sub_pages(page_t *pp, uint_t szc)
17647c478bd9Sstevel@tonic-gate {
17657c478bd9Sstevel@tonic-gate 	kmutex_t *pcm;
17667c478bd9Sstevel@tonic-gate 	uint_t	bin, mtype;
17677c478bd9Sstevel@tonic-gate 	int	mnode;
17687c478bd9Sstevel@tonic-gate 
17697c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
17707c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
17717c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
17727c478bd9Sstevel@tonic-gate 
17737c478bd9Sstevel@tonic-gate 	/*
17747c478bd9Sstevel@tonic-gate 	 * See comment in page_list_sub().
17757c478bd9Sstevel@tonic-gate 	 */
17767c478bd9Sstevel@tonic-gate try_again:
17777c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
17787c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
17797c478bd9Sstevel@tonic-gate 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
17807c478bd9Sstevel@tonic-gate 	mutex_enter(pcm);
17817c478bd9Sstevel@tonic-gate 	if (PP_2_BIN(pp) != bin) {
17827c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
17837c478bd9Sstevel@tonic-gate 		goto	try_again;
17847c478bd9Sstevel@tonic-gate 	}
17857c478bd9Sstevel@tonic-gate 
17867c478bd9Sstevel@tonic-gate 	/*
17877c478bd9Sstevel@tonic-gate 	 * If we're called with a page larger than szc or it got
17887c478bd9Sstevel@tonic-gate 	 * promoted above szc before we locked the freelist then
17897c478bd9Sstevel@tonic-gate 	 * drop pcm and re-lock entire freelist. If page still larger
17907c478bd9Sstevel@tonic-gate 	 * than szc then demote it.
17917c478bd9Sstevel@tonic-gate 	 */
17927c478bd9Sstevel@tonic-gate 	if (pp->p_szc > szc) {
17937c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
17947c478bd9Sstevel@tonic-gate 		pcm = NULL;
17957c478bd9Sstevel@tonic-gate 		page_freelist_lock(mnode);
17967c478bd9Sstevel@tonic-gate 		if (pp->p_szc > szc) {
1797affbd3ccSkchow 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
17987c478bd9Sstevel@tonic-gate 			(void) page_demote(mnode,
179919397407SSherry Moore 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
18007c478bd9Sstevel@tonic-gate 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
18017c478bd9Sstevel@tonic-gate 		}
18027c478bd9Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
18037c478bd9Sstevel@tonic-gate 	}
18047c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
18057c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
18067c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc <= szc);
18077c478bd9Sstevel@tonic-gate 	ASSERT(pp == PP_PAGEROOT(pp));
18087c478bd9Sstevel@tonic-gate 
1809affbd3ccSkchow 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1810affbd3ccSkchow 
18117c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
18127c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0) {
18137c478bd9Sstevel@tonic-gate 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
18147c478bd9Sstevel@tonic-gate 		CHK_LPG(pp, pp->p_szc);
18157c478bd9Sstevel@tonic-gate 	} else {
1816affbd3ccSkchow 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
18177c478bd9Sstevel@tonic-gate 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
18187c478bd9Sstevel@tonic-gate 	}
1819affbd3ccSkchow 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
18207c478bd9Sstevel@tonic-gate 
18217c478bd9Sstevel@tonic-gate 	if (pcm != NULL) {
18227c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
18237c478bd9Sstevel@tonic-gate 	} else {
18247c478bd9Sstevel@tonic-gate 		page_freelist_unlock(mnode);
18257c478bd9Sstevel@tonic-gate 	}
18267c478bd9Sstevel@tonic-gate 
18277c478bd9Sstevel@tonic-gate #if defined(__sparc)
18287c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
18297c478bd9Sstevel@tonic-gate 		pgcnt_t	pgcnt;
18307c478bd9Sstevel@tonic-gate 
18317c478bd9Sstevel@tonic-gate 		pgcnt = page_get_pagecnt(pp->p_szc);
18327c478bd9Sstevel@tonic-gate 		kcage_freemem_sub(pgcnt);
18337c478bd9Sstevel@tonic-gate 	}
18347c478bd9Sstevel@tonic-gate #endif
18357c478bd9Sstevel@tonic-gate }
18367c478bd9Sstevel@tonic-gate 
18377c478bd9Sstevel@tonic-gate /*
18387c478bd9Sstevel@tonic-gate  * Add the page to the front of a linked list of pages
18397c478bd9Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
18407c478bd9Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
18417c478bd9Sstevel@tonic-gate  */
18427c478bd9Sstevel@tonic-gate void
18437c478bd9Sstevel@tonic-gate mach_page_add(page_t **ppp, page_t *pp)
18447c478bd9Sstevel@tonic-gate {
18457c478bd9Sstevel@tonic-gate 	if (*ppp == NULL) {
18467c478bd9Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
18477c478bd9Sstevel@tonic-gate 	} else {
18487c478bd9Sstevel@tonic-gate 		pp->p_next = *ppp;
18497c478bd9Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
18507c478bd9Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
18517c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
18527c478bd9Sstevel@tonic-gate 	}
18537c478bd9Sstevel@tonic-gate 	*ppp = pp;
18547c478bd9Sstevel@tonic-gate }
18557c478bd9Sstevel@tonic-gate 
18567c478bd9Sstevel@tonic-gate /*
18577c478bd9Sstevel@tonic-gate  * Remove this page from a linked list of pages
18587c478bd9Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
18597c478bd9Sstevel@tonic-gate  *
18607c478bd9Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
18617c478bd9Sstevel@tonic-gate  */
18627c478bd9Sstevel@tonic-gate void
18637c478bd9Sstevel@tonic-gate mach_page_sub(page_t **ppp, page_t *pp)
18647c478bd9Sstevel@tonic-gate {
18657c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
18667c478bd9Sstevel@tonic-gate 
18677c478bd9Sstevel@tonic-gate 	if (*ppp == NULL || pp == NULL)
18687c478bd9Sstevel@tonic-gate 		panic("mach_page_sub");
18697c478bd9Sstevel@tonic-gate 
18707c478bd9Sstevel@tonic-gate 	if (*ppp == pp)
18717c478bd9Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
18727c478bd9Sstevel@tonic-gate 
18737c478bd9Sstevel@tonic-gate 	if (*ppp == pp)
18747c478bd9Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
18757c478bd9Sstevel@tonic-gate 	else {
18767c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
18777c478bd9Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
18787c478bd9Sstevel@tonic-gate 	}
18797c478bd9Sstevel@tonic-gate 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
18807c478bd9Sstevel@tonic-gate }
18817c478bd9Sstevel@tonic-gate 
18827c478bd9Sstevel@tonic-gate /*
18837c478bd9Sstevel@tonic-gate  * Routine fsflush uses to gradually coalesce the free list into larger pages.
18847c478bd9Sstevel@tonic-gate  */
18857c478bd9Sstevel@tonic-gate void
18867c478bd9Sstevel@tonic-gate page_promote_size(page_t *pp, uint_t cur_szc)
18877c478bd9Sstevel@tonic-gate {
18887c478bd9Sstevel@tonic-gate 	pfn_t pfn;
18897c478bd9Sstevel@tonic-gate 	int mnode;
18907c478bd9Sstevel@tonic-gate 	int idx;
18917c478bd9Sstevel@tonic-gate 	int new_szc = cur_szc + 1;
18927c478bd9Sstevel@tonic-gate 	int full = FULL_REGION_CNT(new_szc);
18937c478bd9Sstevel@tonic-gate 
18947c478bd9Sstevel@tonic-gate 	pfn = page_pptonum(pp);
18957c478bd9Sstevel@tonic-gate 	mnode = PFN_2_MEM_NODE(pfn);
18967c478bd9Sstevel@tonic-gate 
18977c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
18987c478bd9Sstevel@tonic-gate 
18997c478bd9Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
19007c478bd9Sstevel@tonic-gate 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
19015d07b933Sdp 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
19027c478bd9Sstevel@tonic-gate 
19037c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
19047c478bd9Sstevel@tonic-gate }
19057c478bd9Sstevel@tonic-gate 
/*
 * Diagnostic counters: page_promote_err counts every failed promotion
 * attempt; page_promote_noreloc_err additionally counts the subset that
 * failed because the constituent pages disagreed on PP_ISNORELOC().
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
19087c478bd9Sstevel@tonic-gate 
19097c478bd9Sstevel@tonic-gate /*
19107c478bd9Sstevel@tonic-gate  * Create a single larger page (of szc new_szc) from smaller contiguous pages
19117c478bd9Sstevel@tonic-gate  * for the given mnode starting at pfnum. Pages involved are on the freelist
19127c478bd9Sstevel@tonic-gate  * before the call and may be returned to the caller if requested, otherwise
19137c478bd9Sstevel@tonic-gate  * they will be placed back on the freelist.
19147c478bd9Sstevel@tonic-gate  * If flags is PC_ALLOC, then the large page will be returned to the user in
19157c478bd9Sstevel@tonic-gate  * a state which is consistent with a page being taken off the freelist.  If
19167c478bd9Sstevel@tonic-gate  * we failed to lock the new large page, then we will return NULL to the
19177c478bd9Sstevel@tonic-gate  * caller and put the large page on the freelist instead.
19187c478bd9Sstevel@tonic-gate  * If flags is PC_FREE, then the large page will be placed on the freelist,
19197c478bd9Sstevel@tonic-gate  * and NULL will be returned.
19207c478bd9Sstevel@tonic-gate  * The caller is responsible for locking the freelist as well as any other
19217c478bd9Sstevel@tonic-gate  * accounting which needs to be done for a returned page.
19227c478bd9Sstevel@tonic-gate  *
19237c478bd9Sstevel@tonic-gate  * RFE: For performance pass in pp instead of pfnum so
19247c478bd9Sstevel@tonic-gate  * 	we can avoid excessive calls to page_numtopp_nolock().
19257c478bd9Sstevel@tonic-gate  *	This would depend on an assumption that all contiguous
19267c478bd9Sstevel@tonic-gate  *	pages are in the same memseg so we can just add/dec
19277c478bd9Sstevel@tonic-gate  *	our pp.
19287c478bd9Sstevel@tonic-gate  *
19297c478bd9Sstevel@tonic-gate  * Lock ordering:
19307c478bd9Sstevel@tonic-gate  *
19317c478bd9Sstevel@tonic-gate  *	There is a potential but rare deadlock situation
19327c478bd9Sstevel@tonic-gate  *	for page promotion and demotion operations. The problem
19337c478bd9Sstevel@tonic-gate  *	is there are two paths into the freelist manager and
19347c478bd9Sstevel@tonic-gate  *	they have different lock orders:
19357c478bd9Sstevel@tonic-gate  *
19367c478bd9Sstevel@tonic-gate  *	page_create()
19377c478bd9Sstevel@tonic-gate  *		lock freelist
19387c478bd9Sstevel@tonic-gate  *		page_lock(EXCL)
19397c478bd9Sstevel@tonic-gate  *		unlock freelist
19407c478bd9Sstevel@tonic-gate  *		return
19417c478bd9Sstevel@tonic-gate  *		caller drops page_lock
19427c478bd9Sstevel@tonic-gate  *
19437c478bd9Sstevel@tonic-gate  *	page_free() and page_reclaim()
19447c478bd9Sstevel@tonic-gate  *		caller grabs page_lock(EXCL)
19457c478bd9Sstevel@tonic-gate  *
19467c478bd9Sstevel@tonic-gate  *		lock freelist
19477c478bd9Sstevel@tonic-gate  *		unlock freelist
19487c478bd9Sstevel@tonic-gate  *		drop page_lock
19497c478bd9Sstevel@tonic-gate  *
19507c478bd9Sstevel@tonic-gate  *	What prevents a thread in page_create() from deadlocking
19517c478bd9Sstevel@tonic-gate  *	with a thread freeing or reclaiming the same page is the
19527c478bd9Sstevel@tonic-gate  *	page_trylock() in page_get_freelist(). If the trylock fails
19537c478bd9Sstevel@tonic-gate  *	it skips the page.
19547c478bd9Sstevel@tonic-gate  *
19557c478bd9Sstevel@tonic-gate  *	The lock ordering for promotion and demotion is the same as
19567c478bd9Sstevel@tonic-gate  *	for page_create(). Since the same deadlock could occur during
19577c478bd9Sstevel@tonic-gate  *	page promotion and freeing or reclaiming of a page on the
19587c478bd9Sstevel@tonic-gate  *	cache list we might have to fail the operation and undo what
19597c478bd9Sstevel@tonic-gate  *	have done so far. Again this is rare.
19607c478bd9Sstevel@tonic-gate  */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int 		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
			return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 * Note: the loop walks page_t structs directly (start_pp + i),
	 * which relies on the constituent pages being contiguous in the
	 * page_t array — see the RFE in the block comment above.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			/* cachelist pages are always PAGESIZE */
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		/* keep the per-mnode/mtype page counters in sync */
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		/* advance by page_t pointer arithmetic (same memseg) */
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done sofar and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		/* undo the size change and put the page back where it was */
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}
21327c478bd9Sstevel@tonic-gate 
21337c478bd9Sstevel@tonic-gate /*
21347c478bd9Sstevel@tonic-gate  * Break up a large page into smaller size pages.
21357c478bd9Sstevel@tonic-gate  * Pages involved are on the freelist before the call and may
21367c478bd9Sstevel@tonic-gate  * be returned to the caller if requested, otherwise they will
21377c478bd9Sstevel@tonic-gate  * be placed back on the freelist.
21387c478bd9Sstevel@tonic-gate  * The caller is responsible for locking the freelist as well as any other
21397c478bd9Sstevel@tonic-gate  * accounting which needs to be done for a returned page.
21407c478bd9Sstevel@tonic-gate  * If flags is not PC_ALLOC, the color argument is ignored, and thus
21417c478bd9Sstevel@tonic-gate  * technically, any value may be passed in but PC_NO_COLOR is the standard
21427c478bd9Sstevel@tonic-gate  * which should be followed for clarity's sake.
214319397407SSherry Moore  * Returns a page whose pfn is < pfnmax
21447c478bd9Sstevel@tonic-gate  */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/* pull the whole large page off its freelist before splitting */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Hand the page back to the caller only if it
			 * matches the requested color, an allocation was
			 * asked for, nothing was returned yet, it is
			 * below pfnmax (0 means no limit), and we can
			 * lock it; otherwise refree it.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			/*
			 * NOTE(review): after npgs advances around the
			 * sublist, pp appears to have wrapped back to its
			 * head (circular list), making the extra pfnmax
			 * test on pp redundant with count == npgs —
			 * verify against page_list_break() semantics.
			 */
			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			/* move on to the remainder of the original page */
			pplist = npplist;
		}
	}
	return (ret_pp);
}
22567c478bd9Sstevel@tonic-gate 
22577c478bd9Sstevel@tonic-gate int mpss_coalesce_disable = 0;
22587c478bd9Sstevel@tonic-gate 
22597c478bd9Sstevel@tonic-gate /*
22607c478bd9Sstevel@tonic-gate  * Coalesce free pages into a page of the given szc and color if possible.
22617c478bd9Sstevel@tonic-gate  * Return the pointer to the page created, otherwise, return NULL.
22625d07b933Sdp  *
22635d07b933Sdp  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
22647c478bd9Sstevel@tonic-gate  */
22655d07b933Sdp page_t *
22665d07b933Sdp page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
22675d07b933Sdp     int mtype, pfn_t pfnhi)
22687c478bd9Sstevel@tonic-gate {
22695d07b933Sdp 	int 	r = szc;		/* region size */
22705d07b933Sdp 	int	mrange;
22715d07b933Sdp 	uint_t 	full, bin, color_mask, wrap = 0;
22725d07b933Sdp 	pfn_t	pfnum, lo, hi;
22735d07b933Sdp 	size_t	len, idx, idx0;
22745d07b933Sdp 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
22757c478bd9Sstevel@tonic-gate 	page_t	*ret_pp;
2276ce8eb11aSdp 	MEM_NODE_ITERATOR_DECL(it);
22775d07b933Sdp #if defined(__sparc)
22785d07b933Sdp 	pfn_t pfnum0, nlo, nhi;
22795d07b933Sdp #endif
22807c478bd9Sstevel@tonic-gate 
22817c478bd9Sstevel@tonic-gate 	if (mpss_coalesce_disable) {
22825d07b933Sdp 		ASSERT(szc < MMU_PAGE_SIZES);
22835d07b933Sdp 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
22847c478bd9Sstevel@tonic-gate 		return (NULL);
22857c478bd9Sstevel@tonic-gate 	}
22867c478bd9Sstevel@tonic-gate 
22875d07b933Sdp 	ASSERT(szc < mmu_page_sizes);
22885d07b933Sdp 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
22895d07b933Sdp 	ASSERT(ceq_mask <= color_mask);
22905d07b933Sdp 	ASSERT(color <= color_mask);
22915d07b933Sdp 	color &= ceq_mask;
22925d07b933Sdp 
22935d07b933Sdp 	/* Prevent page_counters dynamic memory from being freed */
22945d07b933Sdp 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
22955d07b933Sdp 
22965d07b933Sdp 	mrange = MTYPE_2_MRANGE(mnode, mtype);
22975d07b933Sdp 	ASSERT(mrange < mnode_nranges[mnode]);
22985d07b933Sdp 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
22995d07b933Sdp 
23005d07b933Sdp 	/* get pfn range for mtype */
23015d07b933Sdp 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
23025d07b933Sdp 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
23035d07b933Sdp 	hi++;
23045d07b933Sdp 
23055d07b933Sdp 	/* use lower limit if given */
23065d07b933Sdp 	if (pfnhi != PFNNULL && pfnhi < hi)
23075d07b933Sdp 		hi = pfnhi;
23085d07b933Sdp 
23095d07b933Sdp 	/* round to szcpgcnt boundaries */
23105d07b933Sdp 	lo = P2ROUNDUP(lo, szcpgcnt);
2311b779d3e0Sdp 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2312b779d3e0Sdp 	if (lo == (pfn_t)-1) {
2313b779d3e0Sdp 		rw_exit(&page_ctrs_rwlock[mnode]);
2314b779d3e0Sdp 		return (NULL);
2315b779d3e0Sdp 	}
23165d07b933Sdp 	hi = hi & ~(szcpgcnt - 1);
23175d07b933Sdp 
23185d07b933Sdp 	/* set lo to the closest pfn of the right color */
2319ce8eb11aSdp 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2320ce8eb11aSdp 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2321ce8eb11aSdp 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2322ce8eb11aSdp 		    &it);
23235d07b933Sdp 	}
23245d07b933Sdp 
23255d07b933Sdp 	if (hi <= lo) {
23265d07b933Sdp 		rw_exit(&page_ctrs_rwlock[mnode]);
23277c478bd9Sstevel@tonic-gate 		return (NULL);
23287c478bd9Sstevel@tonic-gate 	}
23295d07b933Sdp 
23307c478bd9Sstevel@tonic-gate 	full = FULL_REGION_CNT(r);
23317c478bd9Sstevel@tonic-gate 
23325d07b933Sdp 	/* calculate the number of page candidates and initial search index */
23335d07b933Sdp 	bin = color;
23345d07b933Sdp 	idx0 = (size_t)(-1);
23355d07b933Sdp 	do {
23365d07b933Sdp 		pgcnt_t acand;
23375d07b933Sdp 
23385d07b933Sdp 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
23395d07b933Sdp 		if (acand) {
23405d07b933Sdp 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
23415d07b933Sdp 			    r, bin, mrange);
23425d07b933Sdp 			idx0 = MIN(idx0, idx);
23435d07b933Sdp 			cands += acand;
23445d07b933Sdp 		}
23455d07b933Sdp 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
23465d07b933Sdp 	} while (bin != color);
23475d07b933Sdp 
23485d07b933Sdp 	if (cands == 0) {
23495d07b933Sdp 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
23505d07b933Sdp 		rw_exit(&page_ctrs_rwlock[mnode]);
23515d07b933Sdp 		return (NULL);
23525d07b933Sdp 	}
23535d07b933Sdp 
23545d07b933Sdp 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
23555d07b933Sdp 	if (pfnum < lo || pfnum >= hi) {
23565d07b933Sdp 		pfnum = lo;
2357ce8eb11aSdp 	} else {
2358b779d3e0Sdp 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2359ce8eb11aSdp 		if (pfnum == (pfn_t)-1) {
2360ce8eb11aSdp 			pfnum = lo;
2361b779d3e0Sdp 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2362ce8eb11aSdp 			ASSERT(pfnum != (pfn_t)-1);
2363ce8eb11aSdp 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2364ce8eb11aSdp 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2365ce8eb11aSdp 			/* invalid color, get the closest correct pfn */
2366ce8eb11aSdp 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2367ce8eb11aSdp 			    color_mask, &it);
2368ce8eb11aSdp 			if (pfnum >= hi) {
2369ce8eb11aSdp 				pfnum = lo;
2370b779d3e0Sdp 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2371ce8eb11aSdp 			}
2372ce8eb11aSdp 		}
23735d07b933Sdp 	}
23745d07b933Sdp 
23755d07b933Sdp 	/* set starting index */
23765d07b933Sdp 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
23775d07b933Sdp 	ASSERT(idx0 < len);
23785d07b933Sdp 
23795d07b933Sdp #if defined(__sparc)
23805d07b933Sdp 	pfnum0 = pfnum;		/* page corresponding to idx0 */
23815d07b933Sdp 	nhi = 0;		/* search kcage ranges */
23825d07b933Sdp #endif
23835d07b933Sdp 
23845d07b933Sdp 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
23855d07b933Sdp 
23865d07b933Sdp #if defined(__sparc)
23875d07b933Sdp 		/*
23885d07b933Sdp 		 * Find lowest intersection of kcage ranges and mnode.
23895d07b933Sdp 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
23905d07b933Sdp 		 */
23915d07b933Sdp 		if (nhi <= pfnum) {
23925d07b933Sdp 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
23935d07b933Sdp 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
23945d07b933Sdp 				goto wrapit;
23955d07b933Sdp 
23965d07b933Sdp 			/* jump to the next page in the range */
23975d07b933Sdp 			if (pfnum < nlo) {
23985d07b933Sdp 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2399b779d3e0Sdp 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
24005d07b933Sdp 				idx = PNUM_TO_IDX(mnode, r, pfnum);
24015d07b933Sdp 				if (idx >= len || pfnum >= hi)
24025d07b933Sdp 					goto wrapit;
2403ce8eb11aSdp 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
24045d07b933Sdp 				    ceq_mask)
24055d07b933Sdp 					goto next;
2406ce8eb11aSdp 				if (interleaved_mnodes &&
2407ce8eb11aSdp 				    PFN_2_MEM_NODE(pfnum) != mnode)
2408ce8eb11aSdp 					goto next;
24097c478bd9Sstevel@tonic-gate 			}
24105d07b933Sdp 		}
24115d07b933Sdp #endif
24125d07b933Sdp 
24135d07b933Sdp 		if (PAGE_COUNTERS(mnode, r, idx) != full)
24145d07b933Sdp 			goto next;
24155d07b933Sdp 
24165d07b933Sdp 		/*
24175d07b933Sdp 		 * RFE: For performance maybe we can do something less
24185d07b933Sdp 		 *	brutal than locking the entire freelist. So far
24195d07b933Sdp 		 * 	this doesn't seem to be a performance problem?
24205d07b933Sdp 		 */
24215d07b933Sdp 		page_freelist_lock(mnode);
24225d07b933Sdp 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
24235d07b933Sdp 			ret_pp =
24245d07b933Sdp 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
24257c478bd9Sstevel@tonic-gate 			if (ret_pp != NULL) {
24265d07b933Sdp 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
24275d07b933Sdp 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2428ce8eb11aSdp 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
24297c478bd9Sstevel@tonic-gate 				page_freelist_unlock(mnode);
24307c478bd9Sstevel@tonic-gate 				rw_exit(&page_ctrs_rwlock[mnode]);
24317c478bd9Sstevel@tonic-gate #if defined(__sparc)
24327c478bd9Sstevel@tonic-gate 				if (PP_ISNORELOC(ret_pp)) {
24337c478bd9Sstevel@tonic-gate 					pgcnt_t npgs;
24347c478bd9Sstevel@tonic-gate 
24357c478bd9Sstevel@tonic-gate 					npgs = page_get_pagecnt(ret_pp->p_szc);
24367c478bd9Sstevel@tonic-gate 					kcage_freemem_sub(npgs);
24377c478bd9Sstevel@tonic-gate 				}
24387c478bd9Sstevel@tonic-gate #endif
24397c478bd9Sstevel@tonic-gate 				return (ret_pp);
24407c478bd9Sstevel@tonic-gate 			}
24415d07b933Sdp 		} else {
24425d07b933Sdp 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
24435d07b933Sdp 		}
24445d07b933Sdp 
24455d07b933Sdp 		page_freelist_unlock(mnode);
24465d07b933Sdp 		/*
24475d07b933Sdp 		 * No point looking for another page if we've
24485d07b933Sdp 		 * already tried all of the ones that
24495d07b933Sdp 		 * page_ctr_cands indicated.  Stash off where we left
24505d07b933Sdp 		 * off.
24515d07b933Sdp 		 * Note: this is not exact since we don't hold the
24525d07b933Sdp 		 * page_freelist_locks before we initially get the
24535d07b933Sdp 		 * value of cands for performance reasons, but should
24545d07b933Sdp 		 * be a decent approximation.
24555d07b933Sdp 		 */
24565d07b933Sdp 		if (--cands == 0) {
24575d07b933Sdp 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
24585d07b933Sdp 			    idx;
24595d07b933Sdp 			break;
24605d07b933Sdp 		}
24615d07b933Sdp next:
24625d07b933Sdp 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2463ce8eb11aSdp 		    color_mask, &it);
24645d07b933Sdp 		idx = PNUM_TO_IDX(mnode, r, pfnum);
24655d07b933Sdp 		if (idx >= len || pfnum >= hi) {
24665d07b933Sdp wrapit:
24675d07b933Sdp 			pfnum = lo;
2468b779d3e0Sdp 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
24695d07b933Sdp 			idx = PNUM_TO_IDX(mnode, r, pfnum);
24705d07b933Sdp 			wrap++;
24715d07b933Sdp #if defined(__sparc)
24725d07b933Sdp 			nhi = 0;	/* search kcage ranges */
24735d07b933Sdp #endif
24747c478bd9Sstevel@tonic-gate 		}
24757c478bd9Sstevel@tonic-gate 	}
24765d07b933Sdp 
24777c478bd9Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
24785d07b933Sdp 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
24797c478bd9Sstevel@tonic-gate 	return (NULL);
24807c478bd9Sstevel@tonic-gate }
24817c478bd9Sstevel@tonic-gate 
24827c478bd9Sstevel@tonic-gate /*
24837c478bd9Sstevel@tonic-gate  * For the given mnode, promote as many small pages to large pages as possible.
2484ce8eb11aSdp  * mnode can be -1, which means do them all
24857c478bd9Sstevel@tonic-gate  */
24867c478bd9Sstevel@tonic-gate void
24877c478bd9Sstevel@tonic-gate page_freelist_coalesce_all(int mnode)
24887c478bd9Sstevel@tonic-gate {
24897c478bd9Sstevel@tonic-gate 	int 	r;		/* region size */
24907c478bd9Sstevel@tonic-gate 	int 	idx, full;
24917c478bd9Sstevel@tonic-gate 	size_t	len;
2492ce8eb11aSdp 	int doall = interleaved_mnodes || mnode < 0;
2493ce8eb11aSdp 	int mlo = doall ? 0 : mnode;
2494ce8eb11aSdp 	int mhi = doall ? max_mem_nodes : (mnode + 1);
24957c478bd9Sstevel@tonic-gate 
24967c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
24977c478bd9Sstevel@tonic-gate 
24987c478bd9Sstevel@tonic-gate 	if (mpss_coalesce_disable) {
24997c478bd9Sstevel@tonic-gate 		return;
25007c478bd9Sstevel@tonic-gate 	}
25017c478bd9Sstevel@tonic-gate 
25027c478bd9Sstevel@tonic-gate 	/*
25037c478bd9Sstevel@tonic-gate 	 * Lock the entire freelist and coalesce what we can.
25047c478bd9Sstevel@tonic-gate 	 *
25057c478bd9Sstevel@tonic-gate 	 * Always promote to the largest page possible
25067c478bd9Sstevel@tonic-gate 	 * first to reduce the number of page promotions.
25077c478bd9Sstevel@tonic-gate 	 */
2508ce8eb11aSdp 	for (mnode = mlo; mnode < mhi; mnode++) {
2509ce8eb11aSdp 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2510ce8eb11aSdp 		page_freelist_lock(mnode);
2511ce8eb11aSdp 	}
25127c478bd9Sstevel@tonic-gate 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2513ce8eb11aSdp 		for (mnode = mlo; mnode < mhi; mnode++) {
2514ce8eb11aSdp 			pgcnt_t cands = 0;
2515ce8eb11aSdp 			int mrange, nranges = mnode_nranges[mnode];
25167c478bd9Sstevel@tonic-gate 
2517ce8eb11aSdp 			for (mrange = 0; mrange < nranges; mrange++) {
2518ce8eb11aSdp 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2519ce8eb11aSdp 				if (cands != 0)
2520ce8eb11aSdp 					break;
2521ce8eb11aSdp 			}
2522ce8eb11aSdp 			if (cands == 0) {
2523ce8eb11aSdp 				VM_STAT_ADD(vmm_vmstats.
2524ce8eb11aSdp 				    page_ctrs_cands_skip_all);
2525ce8eb11aSdp 				continue;
2526ce8eb11aSdp 			}
25277c478bd9Sstevel@tonic-gate 
2528ce8eb11aSdp 			full = FULL_REGION_CNT(r);
2529ce8eb11aSdp 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2530ce8eb11aSdp 
2531ce8eb11aSdp 			for (idx = 0; idx < len; idx++) {
2532ce8eb11aSdp 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2533ce8eb11aSdp 					pfn_t pfnum =
2534ce8eb11aSdp 					    IDX_TO_PNUM(mnode, r, idx);
2535ce8eb11aSdp 					int tmnode = interleaved_mnodes ?
2536ce8eb11aSdp 					    PFN_2_MEM_NODE(pfnum) : mnode;
2537ce8eb11aSdp 
2538ce8eb11aSdp 					ASSERT(pfnum >=
2539ce8eb11aSdp 					    mem_node_config[tmnode].physbase &&
2540ce8eb11aSdp 					    pfnum <
2541ce8eb11aSdp 					    mem_node_config[tmnode].physmax);
2542ce8eb11aSdp 
2543ce8eb11aSdp 					(void) page_promote(tmnode,
2544ce8eb11aSdp 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2545ce8eb11aSdp 				}
25467c478bd9Sstevel@tonic-gate 			}
2547ce8eb11aSdp 			/* shared hpm_counters covers all mnodes, so we quit */
2548ce8eb11aSdp 			if (interleaved_mnodes)
2549ce8eb11aSdp 				break;
25507c478bd9Sstevel@tonic-gate 		}
25517c478bd9Sstevel@tonic-gate 	}
2552ce8eb11aSdp 	for (mnode = mlo; mnode < mhi; mnode++) {
2553ce8eb11aSdp 		page_freelist_unlock(mnode);
2554ce8eb11aSdp 		rw_exit(&page_ctrs_rwlock[mnode]);
2555ce8eb11aSdp 	}
25567c478bd9Sstevel@tonic-gate }
25577c478bd9Sstevel@tonic-gate 
25587c478bd9Sstevel@tonic-gate /*
25597c478bd9Sstevel@tonic-gate  * This is where all polices for moving pages around
25607c478bd9Sstevel@tonic-gate  * to different page size free lists is implemented.
25617c478bd9Sstevel@tonic-gate  * Returns 1 on success, 0 on failure.
25627c478bd9Sstevel@tonic-gate  *
25637c478bd9Sstevel@tonic-gate  * So far these are the priorities for this algorithm in descending
25647c478bd9Sstevel@tonic-gate  * order:
25657c478bd9Sstevel@tonic-gate  *
25667c478bd9Sstevel@tonic-gate  *	1) When servicing a request try to do so with a free page
25677c478bd9Sstevel@tonic-gate  *	   from next size up. Helps defer fragmentation as long
25687c478bd9Sstevel@tonic-gate  *	   as possible.
25697c478bd9Sstevel@tonic-gate  *
25707c478bd9Sstevel@tonic-gate  *	2) Page coalesce on demand. Only when a freelist
25717c478bd9Sstevel@tonic-gate  *	   larger than PAGESIZE is empty and step 1
25727c478bd9Sstevel@tonic-gate  *	   will not work since all larger size lists are
25737c478bd9Sstevel@tonic-gate  *	   also empty.
25747c478bd9Sstevel@tonic-gate  *
25757c478bd9Sstevel@tonic-gate  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
25767c478bd9Sstevel@tonic-gate  */
25775d07b933Sdp 
25787c478bd9Sstevel@tonic-gate page_t *
25795d07b933Sdp page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
258019397407SSherry Moore     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
25817c478bd9Sstevel@tonic-gate {
25827c478bd9Sstevel@tonic-gate 	uchar_t nszc = szc + 1;
25835d07b933Sdp 	uint_t 	bin, sbin, bin_prev;
25847c478bd9Sstevel@tonic-gate 	page_t	*pp, *firstpp;
25857c478bd9Sstevel@tonic-gate 	page_t	*ret_pp = NULL;
25865d07b933Sdp 	uint_t  color_mask;
25877c478bd9Sstevel@tonic-gate 
25885d07b933Sdp 	if (nszc == mmu_page_sizes)
25895d07b933Sdp 		return (NULL);
25907c478bd9Sstevel@tonic-gate 
25915d07b933Sdp 	ASSERT(nszc < mmu_page_sizes);
25925d07b933Sdp 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
25935d07b933Sdp 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
25945d07b933Sdp 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
25955d07b933Sdp 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
25965d07b933Sdp 
25975d07b933Sdp 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
25987c478bd9Sstevel@tonic-gate 	/*
25995d07b933Sdp 	 * First try to break up a larger page to fill current size freelist.
26007c478bd9Sstevel@tonic-gate 	 */
26015d07b933Sdp 	while (plw->plw_bins[nszc] != 0) {
26025d07b933Sdp 
26035d07b933Sdp 		ASSERT(nszc < mmu_page_sizes);
26045d07b933Sdp 
26057c478bd9Sstevel@tonic-gate 		/*
26067c478bd9Sstevel@tonic-gate 		 * If page found then demote it.
26077c478bd9Sstevel@tonic-gate 		 */
26087c478bd9Sstevel@tonic-gate 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
26097c478bd9Sstevel@tonic-gate 			page_freelist_lock(mnode);
26107c478bd9Sstevel@tonic-gate 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
26117c478bd9Sstevel@tonic-gate 
26127c478bd9Sstevel@tonic-gate 			/*
26137c478bd9Sstevel@tonic-gate 			 * If pfnhi is not PFNNULL, look for large page below
26147c478bd9Sstevel@tonic-gate 			 * pfnhi. PFNNULL signifies no pfn requirement.
26157c478bd9Sstevel@tonic-gate 			 */
26167196569bSSherry Moore 			if (pp &&
26177196569bSSherry Moore 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
26187196569bSSherry Moore 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
26197c478bd9Sstevel@tonic-gate 				do {
26207c478bd9Sstevel@tonic-gate 					pp = pp->p_vpnext;
26217c478bd9Sstevel@tonic-gate 					if (pp == firstpp) {
26227c478bd9Sstevel@tonic-gate 						pp = NULL;
26237c478bd9Sstevel@tonic-gate 						break;
26247c478bd9Sstevel@tonic-gate 					}
262519397407SSherry Moore 				} while ((pfnhi != PFNNULL &&
262619397407SSherry Moore 				    pp->p_pagenum >= pfnhi) ||
262719397407SSherry Moore 				    (pfnlo != PFNNULL &&
262819397407SSherry Moore 				    pp->p_pagenum < pfnlo));
262919397407SSherry Moore 
263019397407SSherry Moore 				if (pfnhi != PFNNULL && pp != NULL)
263119397407SSherry Moore 					ASSERT(pp->p_pagenum < pfnhi);
263219397407SSherry Moore 
263319397407SSherry Moore 				if (pfnlo != PFNNULL && pp != NULL)
263419397407SSherry Moore 					ASSERT(pp->p_pagenum >= pfnlo);
26357c478bd9Sstevel@tonic-gate 			}
26367c478bd9Sstevel@tonic-gate 			if (pp) {
26375d07b933Sdp 				uint_t ccolor = page_correct_color(szc, nszc,
26385d07b933Sdp 				    color, bin, plw->plw_ceq_mask[szc]);
26395d07b933Sdp 
26407c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == nszc);
26415d07b933Sdp 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
26427c478bd9Sstevel@tonic-gate 				ret_pp = page_demote(mnode, pp->p_pagenum,
264319397407SSherry Moore 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
26447c478bd9Sstevel@tonic-gate 				if (ret_pp) {
26457c478bd9Sstevel@tonic-gate 					page_freelist_unlock(mnode);
26467c478bd9Sstevel@tonic-gate #if defined(__sparc)
26477c478bd9Sstevel@tonic-gate 					if (PP_ISNORELOC(ret_pp)) {
26487c478bd9Sstevel@tonic-gate 						pgcnt_t npgs;
26497c478bd9Sstevel@tonic-gate 
26507c478bd9Sstevel@tonic-gate 						npgs = page_get_pagecnt(
26517c478bd9Sstevel@tonic-gate 						    ret_pp->p_szc);
26527c478bd9Sstevel@tonic-gate 						kcage_freemem_sub(npgs);
26537c478bd9Sstevel@tonic-gate 					}
26547c478bd9Sstevel@tonic-gate #endif
26557c478bd9Sstevel@tonic-gate 					return (ret_pp);
26567c478bd9Sstevel@tonic-gate 				}
26577c478bd9Sstevel@tonic-gate 			}
26587c478bd9Sstevel@tonic-gate 			page_freelist_unlock(mnode);
26597c478bd9Sstevel@tonic-gate 		}
26607c478bd9Sstevel@tonic-gate 
26615d07b933Sdp 		/* loop through next size bins */
26625d07b933Sdp 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
26635d07b933Sdp 		plw->plw_bins[nszc]--;
26645d07b933Sdp 
26655d07b933Sdp 		if (bin == sbin) {
26665d07b933Sdp 			uchar_t nnszc = nszc + 1;
26675d07b933Sdp 
26685d07b933Sdp 			/* we are done with this page size - check next */
26695d07b933Sdp 			if (plw->plw_bins[nnszc] == 0)
26705d07b933Sdp 				/* we have already checked next size bins */
26715d07b933Sdp 				break;
26725d07b933Sdp 
26735d07b933Sdp 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
26745d07b933Sdp 			if (bin_prev != INVALID_COLOR) {
26755d07b933Sdp 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
26765d07b933Sdp 				if (!((bin ^ bin_prev) &
26775d07b933Sdp 				    plw->plw_ceq_mask[nnszc]))
26785d07b933Sdp 					break;
26795d07b933Sdp 			}
26805d07b933Sdp 			ASSERT(nnszc < mmu_page_sizes);
26815d07b933Sdp 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
26825d07b933Sdp 			nszc = nnszc;
26835d07b933Sdp 			ASSERT(nszc < mmu_page_sizes);
26845d07b933Sdp 		}
26857c478bd9Sstevel@tonic-gate 	}
26867c478bd9Sstevel@tonic-gate 
26877c478bd9Sstevel@tonic-gate 	return (ret_pp);
26887c478bd9Sstevel@tonic-gate }
26897c478bd9Sstevel@tonic-gate 
26907c478bd9Sstevel@tonic-gate /*
26917c478bd9Sstevel@tonic-gate  * Helper routine used only by the freelist code to lock
26927c478bd9Sstevel@tonic-gate  * a page. If the page is a large page then it succeeds in
26937c478bd9Sstevel@tonic-gate  * locking all the constituent pages or none at all.
26947c478bd9Sstevel@tonic-gate  * Returns 1 on sucess, 0 on failure.
26957c478bd9Sstevel@tonic-gate  */
26967c478bd9Sstevel@tonic-gate static int
26977c478bd9Sstevel@tonic-gate page_trylock_cons(page_t *pp, se_t se)
26987c478bd9Sstevel@tonic-gate {
26997c478bd9Sstevel@tonic-gate 	page_t	*tpp, *first_pp = pp;
27007c478bd9Sstevel@tonic-gate 
27017c478bd9Sstevel@tonic-gate 	/*
27027c478bd9Sstevel@tonic-gate 	 * Fail if can't lock first or only page.
27037c478bd9Sstevel@tonic-gate 	 */
27047c478bd9Sstevel@tonic-gate 	if (!page_trylock(pp, se)) {
27057c478bd9Sstevel@tonic-gate 		return (0);
27067c478bd9Sstevel@tonic-gate 	}
27077c478bd9Sstevel@tonic-gate 
27087c478bd9Sstevel@tonic-gate 	/*
27097c478bd9Sstevel@tonic-gate 	 * PAGESIZE: common case.
27107c478bd9Sstevel@tonic-gate 	 */
27117c478bd9Sstevel@tonic-gate 	if (pp->p_szc == 0) {
27127c478bd9Sstevel@tonic-gate 		return (1);
27137c478bd9Sstevel@tonic-gate 	}
27147c478bd9Sstevel@tonic-gate 
27157c478bd9Sstevel@tonic-gate 	/*
27167c478bd9Sstevel@tonic-gate 	 * Large page case.
27177c478bd9Sstevel@tonic-gate 	 */
27187c478bd9Sstevel@tonic-gate 	tpp = pp->p_next;
27197c478bd9Sstevel@tonic-gate 	while (tpp != pp) {
27207c478bd9Sstevel@tonic-gate 		if (!page_trylock(tpp, se)) {
27217c478bd9Sstevel@tonic-gate 			/*
27228b464eb8Smec 			 * On failure unlock what we have locked so far.
27238b464eb8Smec 			 * We want to avoid attempting to capture these
27248b464eb8Smec 			 * pages as the pcm mutex may be held which could
27258b464eb8Smec 			 * lead to a recursive mutex panic.
27267c478bd9Sstevel@tonic-gate 			 */
27277c478bd9Sstevel@tonic-gate 			while (first_pp != tpp) {
27288b464eb8Smec 				page_unlock_nocapture(first_pp);
27297c478bd9Sstevel@tonic-gate 				first_pp = first_pp->p_next;
27307c478bd9Sstevel@tonic-gate 			}
27317c478bd9Sstevel@tonic-gate 			return (0);
27327c478bd9Sstevel@tonic-gate 		}
27337c478bd9Sstevel@tonic-gate 		tpp = tpp->p_next;
27347c478bd9Sstevel@tonic-gate 	}
27357c478bd9Sstevel@tonic-gate 	return (1);
27367c478bd9Sstevel@tonic-gate }
27377c478bd9Sstevel@tonic-gate 
27385d07b933Sdp /*
27395d07b933Sdp  * init context for walking page lists
27405d07b933Sdp  * Called when a page of the given szc in unavailable. Sets markers
27415d07b933Sdp  * for the beginning of the search to detect when search has
27425d07b933Sdp  * completed a full cycle. Sets flags for splitting larger pages
27435d07b933Sdp  * and coalescing smaller pages. Page walking procedes until a page
27445d07b933Sdp  * of the desired equivalent color is found.
27455d07b933Sdp  */
27465d07b933Sdp void
27475d07b933Sdp page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
27485d07b933Sdp     int use_ceq, page_list_walker_t *plw)
27497c478bd9Sstevel@tonic-gate {
27505d07b933Sdp 	uint_t  nszc, ceq_mask, colors;
27515d07b933Sdp 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
27527c478bd9Sstevel@tonic-gate 
27537c478bd9Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
27545d07b933Sdp 	colors = PAGE_GET_PAGECOLORS(szc);
27557c478bd9Sstevel@tonic-gate 
27565d07b933Sdp 	plw->plw_colors = colors;
27575d07b933Sdp 	plw->plw_color_mask = colors - 1;
27585d07b933Sdp 	plw->plw_bin_marker = plw->plw_bin0 = bin;
27595d07b933Sdp 	plw->plw_bin_split_prev = bin;
27605d07b933Sdp 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
27617c478bd9Sstevel@tonic-gate 
27625d07b933Sdp 	/*
27635d07b933Sdp 	 * if vac aliasing is possible make sure lower order color
27645d07b933Sdp 	 * bits are never ignored
27655d07b933Sdp 	 */
27665d07b933Sdp 	if (vac_colors > 1)
27675d07b933Sdp 		ceq &= 0xf0;
27687c478bd9Sstevel@tonic-gate 
27697c478bd9Sstevel@tonic-gate 	/*
27705d07b933Sdp 	 * calculate the number of non-equivalent colors and
27715d07b933Sdp 	 * color equivalency mask
27727c478bd9Sstevel@tonic-gate 	 */
27735d07b933Sdp 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
27745d07b933Sdp 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
27755d07b933Sdp 	ASSERT(plw->plw_ceq_dif > 0);
27765d07b933Sdp 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
27777c478bd9Sstevel@tonic-gate 
27785d07b933Sdp 	if (flags & PG_MATCH_COLOR) {
27795d07b933Sdp 		if (cpu_page_colors <  0) {
27805d07b933Sdp 			/*
27815d07b933Sdp 			 * this is a heterogeneous machine with different CPUs
27825d07b933Sdp 			 * having different size e$ (not supported for ni2/rock
27835d07b933Sdp 			 */
27845d07b933Sdp 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
27855d07b933Sdp 			cpucolors = MAX(cpucolors, 1);
27865d07b933Sdp 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
27875d07b933Sdp 			plw->plw_ceq_mask[szc] =
27885d07b933Sdp 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
27895d07b933Sdp 		}
27905d07b933Sdp 		plw->plw_ceq_dif = 1;
27915d07b933Sdp 	}
27927c478bd9Sstevel@tonic-gate 
27935d07b933Sdp 	/* we can split pages in the freelist, but not the cachelist */
27945d07b933Sdp 	if (can_split) {
2795ce8eb11aSdp 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
27967c478bd9Sstevel@tonic-gate 
2797ce8eb11aSdp 		/* set next szc color masks and number of free list bins */
2798ce8eb11aSdp 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2799ce8eb11aSdp 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2800ce8eb11aSdp 			    plw->plw_ceq_mask[szc]);
2801ce8eb11aSdp 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2802ce8eb11aSdp 		}
2803ce8eb11aSdp 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2804ce8eb11aSdp 		plw->plw_bins[nszc] = 0;
28057c478bd9Sstevel@tonic-gate 
28065d07b933Sdp 	} else {
2807ce8eb11aSdp 		ASSERT(szc == 0);
2808ce8eb11aSdp 		plw->plw_do_split = 0;
2809ce8eb11aSdp 		plw->plw_bins[1] = 0;
2810ce8eb11aSdp 		plw->plw_ceq_mask[1] = INVALID_MASK;
28117c478bd9Sstevel@tonic-gate 	}
28125d07b933Sdp }
28137c478bd9Sstevel@tonic-gate 
28145d07b933Sdp /*
28155d07b933Sdp  * set mark to flag where next split should occur
28165d07b933Sdp  */
28175d07b933Sdp #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
28185d07b933Sdp 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
28195d07b933Sdp 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
28205d07b933Sdp 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
28215d07b933Sdp 	plw->plw_split_next =						     \
28225d07b933Sdp 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
28235d07b933Sdp 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
28245d07b933Sdp 		plw->plw_split_next =					     \
28255d07b933Sdp 		INC_MASKED(plw->plw_split_next,				     \
28265d07b933Sdp 		    neq_mask, plw->plw_color_mask);			     \
28275d07b933Sdp 	}								     \
28285d07b933Sdp }
28297c478bd9Sstevel@tonic-gate 
28305d07b933Sdp uint_t
28315d07b933Sdp page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
28325d07b933Sdp {
28335d07b933Sdp 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
28345d07b933Sdp 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
28355d07b933Sdp 	uchar_t nszc = szc + 1;
28365d07b933Sdp 
28375d07b933Sdp 	nbin = ADD_MASKED(bin,
28385d07b933Sdp 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
28395d07b933Sdp 
28405d07b933Sdp 	if (plw->plw_do_split) {
28415d07b933Sdp 		plw->plw_bin_split_prev = bin;
28425d07b933Sdp 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
28435d07b933Sdp 		plw->plw_do_split = 0;
28445d07b933Sdp 	}
28455d07b933Sdp 
28465d07b933Sdp 	if (szc == 0) {
28475d07b933Sdp 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
28485d07b933Sdp 			if (nbin == plw->plw_bin0 &&
28495d07b933Sdp 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
28505d07b933Sdp 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
28515d07b933Sdp 				    neq_mask, plw->plw_color_mask);
28525d07b933Sdp 				plw->plw_bin_split_prev = plw->plw_bin0;
28535d07b933Sdp 			}
28545d07b933Sdp 
28555d07b933Sdp 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
28565d07b933Sdp 				plw->plw_bin_marker =
28575d07b933Sdp 				    nbin = INC_MASKED(nbin, neq_mask,
2858ce8eb11aSdp 				    plw->plw_color_mask);
28595d07b933Sdp 				plw->plw_bin_split_prev = plw->plw_bin0;
28605d07b933Sdp 				/*
28615d07b933Sdp 				 * large pages all have the same vac color
28625d07b933Sdp 				 * so by now we should be done with next
28635d07b933Sdp 				 * size page splitting process
28645d07b933Sdp 				 */
28655d07b933Sdp 				ASSERT(plw->plw_bins[1] == 0);
28665d07b933Sdp 				plw->plw_do_split = 0;
28675d07b933Sdp 				return (nbin);
28685d07b933Sdp 			}
28695d07b933Sdp 
28705d07b933Sdp 		} else {
28715d07b933Sdp 			uint_t bin_jump = (vac_colors == 1) ?
28725d07b933Sdp 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
28735d07b933Sdp 
28745d07b933Sdp 			bin_jump &= ~(vac_colors - 1);
28755d07b933Sdp 
28765d07b933Sdp 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
28775d07b933Sdp 			    plw->plw_color_mask);
28785d07b933Sdp 
28795d07b933Sdp 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
28805d07b933Sdp 
28815d07b933Sdp 				plw->plw_bin_marker = nbin = nbin0;
28825d07b933Sdp 
28835d07b933Sdp 				if (plw->plw_bins[nszc] != 0) {
28845d07b933Sdp 					/*
28855d07b933Sdp 					 * check if next page size bin is the
28865d07b933Sdp 					 * same as the next page size bin for
28875d07b933Sdp 					 * bin0
28885d07b933Sdp 					 */
28895d07b933Sdp 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
28905d07b933Sdp 					    nbin);
28915d07b933Sdp 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
28925d07b933Sdp 					    plw->plw_bin0);
28935d07b933Sdp 
28945d07b933Sdp 					if ((bin0_nsz ^ nbin_nsz) &
28955d07b933Sdp 					    plw->plw_ceq_mask[nszc])
28965d07b933Sdp 						plw->plw_do_split = 1;
28975d07b933Sdp 				}
28985d07b933Sdp 				return (nbin);
28995d07b933Sdp 			}
29005d07b933Sdp 		}
29015d07b933Sdp 	}
29025d07b933Sdp 
29035d07b933Sdp 	if (plw->plw_bins[nszc] != 0) {
2904ce8eb11aSdp 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2905ce8eb11aSdp 		if (!((plw->plw_split_next ^ nbin_nsz) &
2906ce8eb11aSdp 		    plw->plw_ceq_mask[nszc]))
2907ce8eb11aSdp 			plw->plw_do_split = 1;
29085d07b933Sdp 	}
29095d07b933Sdp 
29105d07b933Sdp 	return (nbin);
29115d07b933Sdp }
29125d07b933Sdp 
29135d07b933Sdp page_t *
29145d07b933Sdp page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
29155d07b933Sdp     uint_t flags)
29165d07b933Sdp {
29175d07b933Sdp 	kmutex_t		*pcm;
29185d07b933Sdp 	page_t			*pp, *first_pp;
29195d07b933Sdp 	uint_t			sbin;
29205d07b933Sdp 	int			plw_initialized;
29215d07b933Sdp 	page_list_walker_t	plw;
29225d07b933Sdp 
29235d07b933Sdp 	ASSERT(szc < mmu_page_sizes);
29245d07b933Sdp 
29255d07b933Sdp 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
29265d07b933Sdp 
29275d07b933Sdp 	MTYPE_START(mnode, mtype, flags);
29285d07b933Sdp 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
29295d07b933Sdp 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
29305d07b933Sdp 		return (NULL);
29315d07b933Sdp 	}
29325d07b933Sdp try_again:
29335d07b933Sdp 
29345d07b933Sdp 	plw_initialized = 0;
29355d07b933Sdp 	plw.plw_ceq_dif = 1;
29367c478bd9Sstevel@tonic-gate 
29377c478bd9Sstevel@tonic-gate 	/*
29387c478bd9Sstevel@tonic-gate 	 * Only hold one freelist lock at a time, that way we
29397c478bd9Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
29407c478bd9Sstevel@tonic-gate 	 * ordering.
29417c478bd9Sstevel@tonic-gate 	 */
29425d07b933Sdp 	for (plw.plw_count = 0;
29435d07b933Sdp 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
29445d07b933Sdp 		sbin = bin;
29455d07b933Sdp 		do {
29465d07b933Sdp 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
29475d07b933Sdp 				goto bin_empty_1;
29485d07b933Sdp 
29497c478bd9Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
29507c478bd9Sstevel@tonic-gate 			mutex_enter(pcm);
29517c478bd9Sstevel@tonic-gate 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
29525d07b933Sdp 			if (pp == NULL)
29535d07b933Sdp 				goto bin_empty_0;
29545d07b933Sdp 
29555d07b933Sdp 			/*
29565d07b933Sdp 			 * These were set before the page
29575d07b933Sdp 			 * was put on the free list,
29585d07b933Sdp 			 * they must still be set.
29595d07b933Sdp 			 */
29605d07b933Sdp 			ASSERT(PP_ISFREE(pp));
29615d07b933Sdp 			ASSERT(PP_ISAGED(pp));
29625d07b933Sdp 			ASSERT(pp->p_vnode == NULL);
29635d07b933Sdp 			ASSERT(pp->p_hash == NULL);
29645d07b933Sdp 			ASSERT(pp->p_offset == (u_offset_t)-1);
29655d07b933Sdp 			ASSERT(pp->p_szc == szc);
29665d07b933Sdp 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
29675d07b933Sdp 
29685d07b933Sdp 			/*
29695d07b933Sdp 			 * Walk down the hash chain.
29705d07b933Sdp 			 * 8k pages are linked on p_next
29715d07b933Sdp 			 * and p_prev fields. Large pages
29725d07b933Sdp 			 * are a contiguous group of
29735d07b933Sdp 			 * constituent pages linked together
29745d07b933Sdp 			 * on their p_next and p_prev fields.
29755d07b933Sdp 			 * The large pages are linked together
29765d07b933Sdp 			 * on the hash chain using p_vpnext
29775d07b933Sdp 			 * p_vpprev of the base constituent
29785d07b933Sdp 			 * page of each large page.
29795d07b933Sdp 			 */
29805d07b933Sdp 			first_pp = pp;
2981ca3e8d88SDave Plauger 			while (!page_trylock_cons(pp, SE_EXCL) ||
2982ca3e8d88SDave Plauger 			    IS_DUMP_PAGE(pp)) {
29835d07b933Sdp 				if (szc == 0) {
29845d07b933Sdp 					pp = pp->p_next;
29855d07b933Sdp 				} else {
29865d07b933Sdp 					pp = pp->p_vpnext;
29875d07b933Sdp 				}
29885d07b933Sdp 
29897c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
29907c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
29917c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
29927c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
29937c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
29947c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
29957c478bd9Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
29967c478bd9Sstevel@tonic-gate 
29975d07b933Sdp 				if (pp == first_pp)
29985d07b933Sdp 					goto bin_empty_0;
29995d07b933Sdp 			}
30007c478bd9Sstevel@tonic-gate 
30015d07b933Sdp 			ASSERT(pp != NULL);
30025d07b933Sdp 			ASSERT(mtype == PP_2_MTYPE(pp));
30035d07b933Sdp 			ASSERT(pp->p_szc == szc);
30045d07b933Sdp 			if (szc == 0) {
30055d07b933Sdp 				page_sub(&PAGE_FREELISTS(mnode,
30065d07b933Sdp 				    szc, bin, mtype), pp);
30075d07b933Sdp 			} else {
30085d07b933Sdp 				page_vpsub(&PAGE_FREELISTS(mnode,
30095d07b933Sdp 				    szc, bin, mtype), pp);
30105d07b933Sdp 				CHK_LPG(pp, szc);
30115d07b933Sdp 			}
30125d07b933Sdp 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
30137c478bd9Sstevel@tonic-gate 
30145d07b933Sdp 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
30155d07b933Sdp 				panic("free page is not. pp %p", (void *)pp);
30165d07b933Sdp 			mutex_exit(pcm);
30177c478bd9Sstevel@tonic-gate 
30187c478bd9Sstevel@tonic-gate #if defined(__sparc)
30195d07b933Sdp 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
30205d07b933Sdp 			    (flags & PG_NORELOC) == 0);
30217c478bd9Sstevel@tonic-gate 
30225d07b933Sdp 			if (PP_ISNORELOC(pp))
30235d07b933Sdp 				kcage_freemem_sub(page_get_pagecnt(szc));
30247c478bd9Sstevel@tonic-gate #endif
30255d07b933Sdp 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
30265d07b933Sdp 			return (pp);
30277c478bd9Sstevel@tonic-gate 
30285d07b933Sdp bin_empty_0:
30295d07b933Sdp 			mutex_exit(pcm);
30305d07b933Sdp bin_empty_1:
30315d07b933Sdp 			if (plw_initialized == 0) {
30325d07b933Sdp 				page_list_walk_init(szc, flags, bin, 1, 1,
30335d07b933Sdp 				    &plw);
30345d07b933Sdp 				plw_initialized = 1;
30355d07b933Sdp 				ASSERT(plw.plw_colors <=
30365d07b933Sdp 				    PAGE_GET_PAGECOLORS(szc));
30375d07b933Sdp 				ASSERT(plw.plw_colors > 0);
30385d07b933Sdp 				ASSERT((plw.plw_colors &
30395d07b933Sdp 				    (plw.plw_colors - 1)) == 0);
30405d07b933Sdp 				ASSERT(bin < plw.plw_colors);
30415d07b933Sdp 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
30427c478bd9Sstevel@tonic-gate 			}
30435d07b933Sdp 			/* calculate the next bin with equivalent color */
30445d07b933Sdp 			bin = ADD_MASKED(bin, plw.plw_bin_step,
30455d07b933Sdp 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
30465d07b933Sdp 		} while (sbin != bin);
30477c478bd9Sstevel@tonic-gate 
30487c478bd9Sstevel@tonic-gate 		/*
30495d07b933Sdp 		 * color bins are all empty if color match. Try and
30505d07b933Sdp 		 * satisfy the request by breaking up or coalescing
30515d07b933Sdp 		 * pages from a different size freelist of the correct
30525d07b933Sdp 		 * color that satisfies the ORIGINAL color requested.
30535d07b933Sdp 		 * If that fails then try pages of the same size but
30545d07b933Sdp 		 * different colors assuming we are not called with
30557c478bd9Sstevel@tonic-gate 		 * PG_MATCH_COLOR.
30567c478bd9Sstevel@tonic-gate 		 */
30575d07b933Sdp 		if (plw.plw_do_split &&
30585d07b933Sdp 		    (pp = page_freelist_split(szc, bin, mnode,
305919397407SSherry Moore 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3060ce8eb11aSdp 			return (pp);
30617c478bd9Sstevel@tonic-gate 
30625d07b933Sdp 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
30635d07b933Sdp 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
30645d07b933Sdp 			return (pp);
30657c478bd9Sstevel@tonic-gate 
30665d07b933Sdp 		if (plw.plw_ceq_dif > 1)
30675d07b933Sdp 			bin = page_list_walk_next_bin(szc, bin, &plw);
30687c478bd9Sstevel@tonic-gate 	}
30697c478bd9Sstevel@tonic-gate 
3070affbd3ccSkchow 	/* if allowed, cycle through additional mtypes */
3071affbd3ccSkchow 	MTYPE_NEXT(mnode, mtype, flags);
3072affbd3ccSkchow 	if (mtype >= 0)
30735d07b933Sdp 		goto try_again;
3074affbd3ccSkchow 
30757c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
30767c478bd9Sstevel@tonic-gate 
30777c478bd9Sstevel@tonic-gate 	return (NULL);
30787c478bd9Sstevel@tonic-gate }
30797c478bd9Sstevel@tonic-gate 
30807c478bd9Sstevel@tonic-gate /*
30817c478bd9Sstevel@tonic-gate  * Returns the count of free pages for 'pp' with size code 'szc'.
30827c478bd9Sstevel@tonic-gate  * Note: This function does not return an exact value as the page freelist
30837c478bd9Sstevel@tonic-gate  * locks are not held and thus the values in the page_counters may be
30847c478bd9Sstevel@tonic-gate  * changing as we walk through the data.
30857c478bd9Sstevel@tonic-gate  */
30867c478bd9Sstevel@tonic-gate static int
30877c478bd9Sstevel@tonic-gate page_freecnt(int mnode, page_t *pp, uchar_t szc)
30887c478bd9Sstevel@tonic-gate {
30897c478bd9Sstevel@tonic-gate 	pgcnt_t	pgfree;
30907c478bd9Sstevel@tonic-gate 	pgcnt_t cnt;
30917c478bd9Sstevel@tonic-gate 	ssize_t	r = szc;	/* region size */
30927c478bd9Sstevel@tonic-gate 	ssize_t	idx;
30937c478bd9Sstevel@tonic-gate 	int	i;
30947c478bd9Sstevel@tonic-gate 	int	full, range;
30957c478bd9Sstevel@tonic-gate 
30967c478bd9Sstevel@tonic-gate 	/* Make sure pagenum passed in is aligned properly */
30977c478bd9Sstevel@tonic-gate 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
30987c478bd9Sstevel@tonic-gate 	ASSERT(szc > 0);
30997c478bd9Sstevel@tonic-gate 
31007c478bd9Sstevel@tonic-gate 	/* Prevent page_counters dynamic memory from being freed */
31017c478bd9Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
31027c478bd9Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
31037c478bd9Sstevel@tonic-gate 	cnt = PAGE_COUNTERS(mnode, r, idx);
31047c478bd9Sstevel@tonic-gate 	pgfree = cnt << PNUM_SHIFT(r - 1);
31057c478bd9Sstevel@tonic-gate 	range = FULL_REGION_CNT(szc);
31067c478bd9Sstevel@tonic-gate 
31077c478bd9Sstevel@tonic-gate 	/* Check for completely full region */
31087c478bd9Sstevel@tonic-gate 	if (cnt == range) {
31097c478bd9Sstevel@tonic-gate 		rw_exit(&page_ctrs_rwlock[mnode]);
31107c478bd9Sstevel@tonic-gate 		return (pgfree);
31117c478bd9Sstevel@tonic-gate 	}
31127c478bd9Sstevel@tonic-gate 
31137c478bd9Sstevel@tonic-gate 	while (--r > 0) {
31147c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
31157c478bd9Sstevel@tonic-gate 		full = FULL_REGION_CNT(r);
31167c478bd9Sstevel@tonic-gate 		for (i = 0; i < range; i++, idx++) {
31177c478bd9Sstevel@tonic-gate 			cnt = PAGE_COUNTERS(mnode, r, idx);
31187c478bd9Sstevel@tonic-gate 			/*
31197c478bd9Sstevel@tonic-gate 			 * If cnt here is full, that means we have already
31207c478bd9Sstevel@tonic-gate 			 * accounted for these pages earlier.
31217c478bd9Sstevel@tonic-gate 			 */
31227c478bd9Sstevel@tonic-gate 			if (cnt != full) {
31237c478bd9Sstevel@tonic-gate 				pgfree += (cnt << PNUM_SHIFT(r - 1));
31247c478bd9Sstevel@tonic-gate 			}
31257c478bd9Sstevel@tonic-gate 		}
31267c478bd9Sstevel@tonic-gate 		range *= full;
31277c478bd9Sstevel@tonic-gate 	}
31287c478bd9Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
31297c478bd9Sstevel@tonic-gate 	return (pgfree);
31307c478bd9Sstevel@tonic-gate }
31317c478bd9Sstevel@tonic-gate 
31327c478bd9Sstevel@tonic-gate /*
31337c478bd9Sstevel@tonic-gate  * Called from page_geti_contig_pages to exclusively lock constituent pages
31347c478bd9Sstevel@tonic-gate  * starting from 'spp' for page size code 'szc'.
31357c478bd9Sstevel@tonic-gate  *
31367c478bd9Sstevel@tonic-gate  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
31377c478bd9Sstevel@tonic-gate  * region needs to be greater than or equal to the threshold.
31387c478bd9Sstevel@tonic-gate  */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);	/* constituent pages to lock */
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);


	/* high priority requests skip the free-page threshold check */
	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* attempt to trylock if there are sufficient already free pages */
	if (pgfree < pgcnt/ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	/* exclusively lock each constituent page; back out on any failure */
	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			/* drop the locks acquired so far: pages [0, i) */
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		/*
		 * An in-use page at (or above) the target size cannot be
		 * claimed; per the ASSERT below this is only expected for
		 * the first constituent page.
		 */
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock_nocapture(pp);
			return (0);
		}
		/* kernel cage (no-relocate) pages cannot be claimed */
		if (PP_ISNORELOC(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			/* drop the locks acquired so far: pages [0, i] */
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				i--;
			}
			return (0);
		}
	}
	/* success: all pgcnt constituent pages are held SE_EXCL */
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}
31987c478bd9Sstevel@tonic-gate 
31997c478bd9Sstevel@tonic-gate /*
32007c478bd9Sstevel@tonic-gate  * Claim large page pointed to by 'pp'. 'pp' is the starting set
32017c478bd9Sstevel@tonic-gate  * of 'szc' constituent pages that had been locked exclusively previously.
32027c478bd9Sstevel@tonic-gate  * Will attempt to relocate constituent pages in use.
32037c478bd9Sstevel@tonic-gate  */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;	/* replacement pages for in-use targets */
	page_t *pplist = NULL;	/* accumulated list of claimed pages */

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimzation
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					/* the whole large page was free */
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				/* claim the smaller free page's pieces */
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/* free cachelist page: hash it out, then claim it */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		/* page is in use; try to relocate its contents elsewhere */
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_nocapture(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Relocation succeeded: mark each target page free and
		 * claim it for the large page, and unlock the replacement
		 * pages handed back by do_page_relocate.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_nocapture(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}
33407c478bd9Sstevel@tonic-gate 
33417c478bd9Sstevel@tonic-gate /*
33427c478bd9Sstevel@tonic-gate  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
33437c478bd9Sstevel@tonic-gate  * of 0 means nothing left after trim.
33447c478bd9Sstevel@tonic-gate  */
33457c478bd9Sstevel@tonic-gate int
33467c478bd9Sstevel@tonic-gate trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
33477c478bd9Sstevel@tonic-gate {
33487c478bd9Sstevel@tonic-gate 	pfn_t	kcagepfn;
33497c478bd9Sstevel@tonic-gate 	int	decr;
33507c478bd9Sstevel@tonic-gate 	int	rc = 0;
33517c478bd9Sstevel@tonic-gate 
33527c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(mseg->pages)) {
33537c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
33547c478bd9Sstevel@tonic-gate 
33557c478bd9Sstevel@tonic-gate 			/* lower part of this mseg inside kernel cage */
33567c478bd9Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
33577c478bd9Sstevel@tonic-gate 
33587c478bd9Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
33597c478bd9Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
33607c478bd9Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
33617c478bd9Sstevel@tonic-gate 				ASSERT(decr == 0);
336278b03d3aSkchow 				*lo = MAX(kcagepfn, pfnlo);
336378b03d3aSkchow 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
33647c478bd9Sstevel@tonic-gate 				rc = 1;
33657c478bd9Sstevel@tonic-gate 			}
33667c478bd9Sstevel@tonic-gate 		}
33677c478bd9Sstevel@tonic-gate 		/* else entire mseg in the cage */
33687c478bd9Sstevel@tonic-gate 	} else {
33697c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1)) {
33707c478bd9Sstevel@tonic-gate 
33717c478bd9Sstevel@tonic-gate 			/* upper part of this mseg inside kernel cage */
33727c478bd9Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
33737c478bd9Sstevel@tonic-gate 
33747c478bd9Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
33757c478bd9Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
33767c478bd9Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
33777c478bd9Sstevel@tonic-gate 				ASSERT(decr);
337878b03d3aSkchow 				*hi = MIN(kcagepfn, pfnhi);
33797c478bd9Sstevel@tonic-gate 				*lo = MAX(pfnlo, mseg->pages_base);
33807c478bd9Sstevel@tonic-gate 				rc = 1;
33817c478bd9Sstevel@tonic-gate 			}
33827c478bd9Sstevel@tonic-gate 		} else {
33837c478bd9Sstevel@tonic-gate 			/* entire mseg outside of kernel cage */
33847c478bd9Sstevel@tonic-gate 			*lo = MAX(pfnlo, mseg->pages_base);
33857c478bd9Sstevel@tonic-gate 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
33867c478bd9Sstevel@tonic-gate 			rc = 1;
33877c478bd9Sstevel@tonic-gate 		}
33887c478bd9Sstevel@tonic-gate 	}
33897c478bd9Sstevel@tonic-gate 	return (rc);
33907c478bd9Sstevel@tonic-gate }
33917c478bd9Sstevel@tonic-gate 
33927c478bd9Sstevel@tonic-gate /*
33935d07b933Sdp  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
33947c478bd9Sstevel@tonic-gate  * page with size code 'szc'. Claiming such a page requires acquiring
33957c478bd9Sstevel@tonic-gate  * exclusive locks on all constituent pages (page_trylock_contig_pages),
33967c478bd9Sstevel@tonic-gate  * relocating pages in use and concatenating these constituent pages into a
33977c478bd9Sstevel@tonic-gate  * large page.
33987c478bd9Sstevel@tonic-gate  *
33995d07b933Sdp  * The page lists do not have such a large page and page_freelist_split has
34007c478bd9Sstevel@tonic-gate  * already failed to demote larger pages and/or coalesce smaller free pages.
34017c478bd9Sstevel@tonic-gate  *
34027c478bd9Sstevel@tonic-gate  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
34037c478bd9Sstevel@tonic-gate  * pages with the same color as 'bin'.
34047c478bd9Sstevel@tonic-gate  *
34057c478bd9Sstevel@tonic-gate  * 'pfnflag' specifies the subset of the pfn range to search.
34067c478bd9Sstevel@tonic-gate  */
34077c478bd9Sstevel@tonic-gate 
static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);	/* pages per szc page */
	pgcnt_t szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;		/* randomized search start pfn */
	page_t *pp, *randpp, *endpp;
	uint_t colors, ceq_mask;
	/* LINTED : set but not used in function */
	uint_t color_mask;
	pfn_t hi, lo;			/* per-mseg trimmed pfn range */
	uint_t skip;
	MEM_NODE_ITERATOR_DECL(it);

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);

	/* range too small to contain even one szc-sized page */
	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = PAGE_GET_PAGECOLORS(szc);
	color_mask = colors - 1;
	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
		uchar_t ceq = colorequivszc[szc];
		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));

		ASSERT(ceq_dif > 0);
		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
	} else {
		ceq_mask = 0;
	}

	ASSERT(bin < colors);

	/* clear "non-significant" color bits */
	bin &= ceq_mask;

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean the second slot of
	 * the pfn range that has been divided into 8 slots.
	 */
	if (pfnflag > 1) {
		int	slots = 1 << (highbit(pfnflag) - 1);
		int	slotid = pfnflag & (slots - 1);
		pgcnt_t	szcpages;
		int	slotlen;

		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
		slotlen = howmany(szcpages, slots);
		/* skip if 'slotid' slot is empty */
		if (slotid * slotlen >= szcpages)
			return (NULL);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
	}

	/*
	 * This routine can be called recursively so we shouldn't
	 * acquire a reader lock if a write request is pending. This
	 * could lead to a deadlock with the DR thread.
	 *
	 * Returning NULL informs the caller that we could not get
	 * a contig page with the required characteristics.
	 */

	if (!memsegs_trylock(0))
		return (NULL);

	/*
	 * loop through memsegs to look for contig page candidates
	 */

	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/*
		 * trim off kernel cage pages from pfn range and check for
		 * a trimmed pfn range returned that does not span the
		 * desired large page size.
		 */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);

		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		skip = szcpgcnt;
		if (ceq_mask > 0 || interleaved_mnodes) {
			/* set lo to point at appropriate color */
			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
			    (interleaved_mnodes &&
			    PFN_2_MEM_NODE(lo) != mnode)) {
				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
				    color_mask, &it);
			}
			if (hi <= lo)
				/* mseg cannot satisfy color request */
				continue;
		}

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
			/* advance the random start to a matching color */
			if (randpfn != (pfn_t)-1) {
				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
				    ceq_mask, color_mask, &it);
			}
			if (randpfn >= hi) {
				/* wrapped past the end; restart at lo */
				randpfn = lo;
				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
				    &it);
			}
		}
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;

		ASSERT(randpp + szcpgcnt <= endpp);

		/* circular walk of candidates until we return to randpp */
		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);

			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			if (ceq_mask == 0 && !interleaved_mnodes) {
				/* no color constraint: step a whole szc page */
				pp += skip;
			} else {
				pfn_t pfn = pp->p_pagenum;

				/* step to the next pfn of a matching color */
				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
				    ceq_mask, color_mask, &it);
				if (pfn == (pfn_t)-1) {
					pp = endpp;
				} else {
					pp = mseg->pages +
					    (pfn - mseg->pages_base);
				}
			}
			if (pp >= endpp) {
				/* start from the beginning */
				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}
36087c478bd9Sstevel@tonic-gate 
36097c478bd9Sstevel@tonic-gate 
36107c478bd9Sstevel@tonic-gate /*
36117c478bd9Sstevel@tonic-gate  * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters when one could not
 * be found on the page free lists.
36147c478bd9Sstevel@tonic-gate  *
36157c478bd9Sstevel@tonic-gate  * calls page_geti_contig_pages with an initial pfn range from the mnode
36167c478bd9Sstevel@tonic-gate  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
36177c478bd9Sstevel@tonic-gate  * that overlaps with the kernel cage or does not match the requested page
36187c478bd9Sstevel@tonic-gate  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
36197c478bd9Sstevel@tonic-gate  * page_geti_contig_pages may further limit the search range based on
36207c478bd9Sstevel@tonic-gate  * previous failure counts (pgcpfailcnt[]).
36217c478bd9Sstevel@tonic-gate  *
36227c478bd9Sstevel@tonic-gate  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
36237c478bd9Sstevel@tonic-gate  * pagesize page that satisfies mtype.
36247c478bd9Sstevel@tonic-gate  */
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
	page_t		*pp;
	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */

	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);

	/* no allocations from cage */
	flags |= PGI_NOCAGE;

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	/* do not limit search and ignore color if hi pri */

	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
		pfnflag = pgcpfailcnt[szc];

	/* remove color match to improve chances */

	if (flags & PGI_PGCPHIPRI || pfnflag)
		flags &= ~PG_MATCH_COLOR;

	/* try each mtype in turn until one yields a large page */
	do {
		/* get pfn range based on mnode and mtype */
		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);

		ASSERT(pfnhi >= pfnlo);

		pp = page_geti_contig_pages(mnode, bin, szc, flags,
		    pfnlo, pfnhi, pfnflag);

		if (pp != NULL) {
			/*
			 * On success, decay the failure count so the
			 * next search covers a larger pfn range.
			 */
			pfnflag = pgcpfailcnt[szc];
			if (pfnflag) {
				/* double the search size */
				pgcpfailcnt[szc] = pfnflag >> 1;
			}
			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
			return (pp);
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
	return (NULL);
}
36817c478bd9Sstevel@tonic-gate 
368278b03d3aSkchow #if defined(__i386) || defined(__amd64)
368378b03d3aSkchow /*
368478b03d3aSkchow  * Determine the likelihood of finding/coalescing a szc page.
368578b03d3aSkchow  * Return 0 if the likelihood is small otherwise return 1.
368678b03d3aSkchow  *
368778b03d3aSkchow  * For now, be conservative and check only 1g pages and return 0
368878b03d3aSkchow  * if there had been previous coalescing failures and the szc pages
368978b03d3aSkchow  * needed to satisfy request would exhaust most of freemem.
369078b03d3aSkchow  */
369178b03d3aSkchow int
369278b03d3aSkchow page_chk_freelist(uint_t szc)
369378b03d3aSkchow {
369478b03d3aSkchow 	pgcnt_t		pgcnt;
369578b03d3aSkchow 
369678b03d3aSkchow 	if (szc <= 1)
369778b03d3aSkchow 		return (1);
369878b03d3aSkchow 
369978b03d3aSkchow 	pgcnt = page_get_pagecnt(szc);
370078b03d3aSkchow 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
370178b03d3aSkchow 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
370278b03d3aSkchow 		return (0);
370378b03d3aSkchow 	}
370478b03d3aSkchow 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
370578b03d3aSkchow 	return (1);
370678b03d3aSkchow }
370778b03d3aSkchow #endif
37087c478bd9Sstevel@tonic-gate 
37097c478bd9Sstevel@tonic-gate /*
37107c478bd9Sstevel@tonic-gate  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
37117c478bd9Sstevel@tonic-gate  *
37127c478bd9Sstevel@tonic-gate  * Does its own locking and accounting.
37137c478bd9Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
37147c478bd9Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
37157c478bd9Sstevel@tonic-gate  *
37167c478bd9Sstevel@tonic-gate  * Finds a page, removes it, THEN locks it.
37177c478bd9Sstevel@tonic-gate  */
37187c478bd9Sstevel@tonic-gate 
37197c478bd9Sstevel@tonic-gate /*ARGSUSED*/
37207c478bd9Sstevel@tonic-gate page_t *
37217c478bd9Sstevel@tonic-gate page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
37227c478bd9Sstevel@tonic-gate 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
37237c478bd9Sstevel@tonic-gate {
37247c478bd9Sstevel@tonic-gate 	struct as	*as = seg->s_as;
37257c478bd9Sstevel@tonic-gate 	page_t		*pp = NULL;
37267c478bd9Sstevel@tonic-gate 	ulong_t		bin;
37277c478bd9Sstevel@tonic-gate 	uchar_t		szc;
37287c478bd9Sstevel@tonic-gate 	int		mnode;
37297c478bd9Sstevel@tonic-gate 	int		mtype;
37307c478bd9Sstevel@tonic-gate 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
37317c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
37327c478bd9Sstevel@tonic-gate 
37337c478bd9Sstevel@tonic-gate 	page_get_func = page_get_mnode_freelist;
37347c478bd9Sstevel@tonic-gate 
37357c478bd9Sstevel@tonic-gate 	/*
37367c478bd9Sstevel@tonic-gate 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
37377c478bd9Sstevel@tonic-gate 	 * assume we wish to allocate near to the current thread's home.
37387c478bd9Sstevel@tonic-gate 	 */
37397c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
37407c478bd9Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
37417c478bd9Sstevel@tonic-gate 
37427c478bd9Sstevel@tonic-gate 	if (kcage_on) {
37437c478bd9Sstevel@tonic-gate 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
37447c478bd9Sstevel@tonic-gate 		    kcage_freemem < kcage_throttlefree + btop(size) &&
37457c478bd9Sstevel@tonic-gate 		    curthread != kcage_cageout_thread) {
37467c478bd9Sstevel@tonic-gate 			/*
37477c478bd9Sstevel@tonic-gate 			 * Set a "reserve" of kcage_throttlefree pages for
37487c478bd9Sstevel@tonic-gate 			 * PG_PANIC and cageout thread allocations.
37497c478bd9Sstevel@tonic-gate 			 *
37507c478bd9Sstevel@tonic-gate 			 * Everybody else has to serialize in
37517c478bd9Sstevel@tonic-gate 			 * page_create_get_something() to get a cage page, so
37527c478bd9Sstevel@tonic-gate 			 * that we don't deadlock cageout!
37537c478bd9Sstevel@tonic-gate 			 */
37547c478bd9Sstevel@tonic-gate 			return (NULL);
37557c478bd9Sstevel@tonic-gate 		}
37567c478bd9Sstevel@tonic-gate 	} else {
37577c478bd9Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
37587c478bd9Sstevel@tonic-gate 		flags |= PGI_NOCAGE;
37597c478bd9Sstevel@tonic-gate 	}
37607c478bd9Sstevel@tonic-gate 
37617c478bd9Sstevel@tonic-gate 	/* LINTED */
376207ad560dSkchow 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
37637c478bd9Sstevel@tonic-gate 
37647c478bd9Sstevel@tonic-gate 	/*
37657c478bd9Sstevel@tonic-gate 	 * Convert size to page size code.
37667c478bd9Sstevel@tonic-gate 	 */
37677c478bd9Sstevel@tonic-gate 	if ((szc = page_szc(size)) == (uchar_t)-1)
37687c478bd9Sstevel@tonic-gate 		panic("page_get_freelist: illegal page size request");
37697c478bd9Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
37707c478bd9Sstevel@tonic-gate 
37717c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
37727c478bd9Sstevel@tonic-gate 
37737c478bd9Sstevel@tonic-gate 	/* LINTED */
37745d07b933Sdp 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
37757c478bd9Sstevel@tonic-gate 
37765d07b933Sdp 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
37777c478bd9Sstevel@tonic-gate 
37787c478bd9Sstevel@tonic-gate 	/*
37797c478bd9Sstevel@tonic-gate 	 * Try to get a local page first, but try remote if we can't
37807c478bd9Sstevel@tonic-gate 	 * get a page of the right color.
37817c478bd9Sstevel@tonic-gate 	 */
37827c478bd9Sstevel@tonic-gate pgretry:
37837c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
37847c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
37857c478bd9Sstevel@tonic-gate 		pp = page_get_func(mnode, bin, mtype, szc, flags);
37867c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
37877c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
37887c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
37897c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
37907c478bd9Sstevel@tonic-gate 			    int, mnode,
37917c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
37927c478bd9Sstevel@tonic-gate 			    uint_t, flags);
37937c478bd9Sstevel@tonic-gate 			return (pp);
37947c478bd9Sstevel@tonic-gate 		}
37957c478bd9Sstevel@tonic-gate 	}
37967c478bd9Sstevel@tonic-gate 	ASSERT(pp == NULL);
37977c478bd9Sstevel@tonic-gate 
37987c478bd9Sstevel@tonic-gate 	/*
37997c478bd9Sstevel@tonic-gate 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
38007c478bd9Sstevel@tonic-gate 	 * remote free lists.  Caller expected to call page_get_cachelist which
38017c478bd9Sstevel@tonic-gate 	 * will check local cache lists and remote free lists.
38027c478bd9Sstevel@tonic-gate 	 */
38037c478bd9Sstevel@tonic-gate 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
38047c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
38057c478bd9Sstevel@tonic-gate 		return (NULL);
38067c478bd9Sstevel@tonic-gate 	}
38077c478bd9Sstevel@tonic-gate 
38087c478bd9Sstevel@tonic-gate 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
38097c478bd9Sstevel@tonic-gate 
38107c478bd9Sstevel@tonic-gate 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
38117c478bd9Sstevel@tonic-gate 
38122cb27123Saguzovsk 	if (!(flags & PG_LOCAL)) {
38132cb27123Saguzovsk 		/*
38142cb27123Saguzovsk 		 * Try to get a non-local freelist page.
38152cb27123Saguzovsk 		 */
38162cb27123Saguzovsk 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
38172cb27123Saguzovsk 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
38182cb27123Saguzovsk 			pp = page_get_func(mnode, bin, mtype, szc, flags);
38192cb27123Saguzovsk 			if (pp != NULL) {
38202cb27123Saguzovsk 				DTRACE_PROBE4(page__get,
38212cb27123Saguzovsk 				    lgrp_t *, lgrp,
38222cb27123Saguzovsk 				    int, mnode,
38232cb27123Saguzovsk 				    ulong_t, bin,
38242cb27123Saguzovsk 				    uint_t, flags);
38252cb27123Saguzovsk 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
38262cb27123Saguzovsk 				return (pp);
38272cb27123Saguzovsk 			}
38287c478bd9Sstevel@tonic-gate 		}
38292cb27123Saguzovsk 		ASSERT(pp == NULL);
38307c478bd9Sstevel@tonic-gate 	}
38317c478bd9Sstevel@tonic-gate 
38327c478bd9Sstevel@tonic-gate 	/*
38337c478bd9Sstevel@tonic-gate 	 * when the cage is off chances are page_get_contig_pages() will fail
38347c478bd9Sstevel@tonic-gate 	 * to lock a large page chunk therefore when the cage is off it's not
38357c478bd9Sstevel@tonic-gate 	 * called by default.  this can be changed via /etc/system.
38367c478bd9Sstevel@tonic-gate 	 *
38377c478bd9Sstevel@tonic-gate 	 * page_get_contig_pages() also called to acquire a base pagesize page
38387c478bd9Sstevel@tonic-gate 	 * for page_create_get_something().
38397c478bd9Sstevel@tonic-gate 	 */
38407c478bd9Sstevel@tonic-gate 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
38417c478bd9Sstevel@tonic-gate 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
38427c478bd9Sstevel@tonic-gate 	    (page_get_func != page_get_contig_pages)) {
38437c478bd9Sstevel@tonic-gate 
38447c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
38457c478bd9Sstevel@tonic-gate 		page_get_func = page_get_contig_pages;
38467c478bd9Sstevel@tonic-gate 		goto pgretry;
38477c478bd9Sstevel@tonic-gate 	}
38487c478bd9Sstevel@tonic-gate 
38492cb27123Saguzovsk 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
38502cb27123Saguzovsk 	    page_get_func == page_get_contig_pages)
385183f9b804Skchow 		SETPGCPFAILCNT(szc);
38527c478bd9Sstevel@tonic-gate 
38537c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
38547c478bd9Sstevel@tonic-gate 	return (NULL);
38557c478bd9Sstevel@tonic-gate }
38567c478bd9Sstevel@tonic-gate 
38577c478bd9Sstevel@tonic-gate /*
38587c478bd9Sstevel@tonic-gate  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
38597c478bd9Sstevel@tonic-gate  *
38607c478bd9Sstevel@tonic-gate  * Does its own locking.
38617c478bd9Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
38627c478bd9Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
38637c478bd9Sstevel@tonic-gate  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
38647c478bd9Sstevel@tonic-gate  * try to lock one of them.  If no page can be locked, try the
38657c478bd9Sstevel@tonic-gate  * next bin.  Return NULL if a page can not be found and locked.
38667c478bd9Sstevel@tonic-gate  *
38677c478bd9Sstevel@tonic-gate  * Finds a pages, trys to lock it, then removes it.
38687c478bd9Sstevel@tonic-gate  */
38697c478bd9Sstevel@tonic-gate 
38707c478bd9Sstevel@tonic-gate /*ARGSUSED*/
38717c478bd9Sstevel@tonic-gate page_t *
38727c478bd9Sstevel@tonic-gate page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
38737c478bd9Sstevel@tonic-gate     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
38747c478bd9Sstevel@tonic-gate {
38757c478bd9Sstevel@tonic-gate 	page_t		*pp;
38767c478bd9Sstevel@tonic-gate 	struct as	*as = seg->s_as;
38777c478bd9Sstevel@tonic-gate 	ulong_t		bin;
38787c478bd9Sstevel@tonic-gate 	/*LINTED*/
38797c478bd9Sstevel@tonic-gate 	int		mnode;
38807c478bd9Sstevel@tonic-gate 	int		mtype;
38817c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
38827c478bd9Sstevel@tonic-gate 
38837c478bd9Sstevel@tonic-gate 	/*
38847c478bd9Sstevel@tonic-gate 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
38857c478bd9Sstevel@tonic-gate 	 * assume we wish to allocate near to the current thread's home.
38867c478bd9Sstevel@tonic-gate 	 */
38877c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
38887c478bd9Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
38897c478bd9Sstevel@tonic-gate 
38907c478bd9Sstevel@tonic-gate 	if (!kcage_on) {
38917c478bd9Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
38927c478bd9Sstevel@tonic-gate 		flags |= PGI_NOCAGE;
38937c478bd9Sstevel@tonic-gate 	}
38947c478bd9Sstevel@tonic-gate 
38957c478bd9Sstevel@tonic-gate 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
38967c478bd9Sstevel@tonic-gate 	    kcage_freemem <= kcage_throttlefree) {
38977c478bd9Sstevel@tonic-gate 		/*
38987c478bd9Sstevel@tonic-gate 		 * Reserve kcage_throttlefree pages for critical kernel
38997c478bd9Sstevel@tonic-gate 		 * threads.
39007c478bd9Sstevel@tonic-gate 		 *
39017c478bd9Sstevel@tonic-gate 		 * Everybody else has to go to page_create_get_something()
39027c478bd9Sstevel@tonic-gate 		 * to get a cage page, so we don't deadlock cageout.
39037c478bd9Sstevel@tonic-gate 		 */
39047c478bd9Sstevel@tonic-gate 		return (NULL);
39057c478bd9Sstevel@tonic-gate 	}
39067c478bd9Sstevel@tonic-gate 
39077c478bd9Sstevel@tonic-gate 	/* LINTED */
39085d07b933Sdp 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
39097c478bd9Sstevel@tonic-gate 
39105d07b933Sdp 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
39117c478bd9Sstevel@tonic-gate 
39127c478bd9Sstevel@tonic-gate 	/* LINTED */
391307ad560dSkchow 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
39147c478bd9Sstevel@tonic-gate 
39157c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
39167c478bd9Sstevel@tonic-gate 
39177c478bd9Sstevel@tonic-gate 	/*
39187c478bd9Sstevel@tonic-gate 	 * Try local cachelists first
39197c478bd9Sstevel@tonic-gate 	 */
39207c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
39217c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
39227c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
39237c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
39247c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
39257c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39267c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39277c478bd9Sstevel@tonic-gate 			    int, mnode,
39287c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
39297c478bd9Sstevel@tonic-gate 			    uint_t, flags);
39307c478bd9Sstevel@tonic-gate 			return (pp);
39317c478bd9Sstevel@tonic-gate 		}
39327c478bd9Sstevel@tonic-gate 	}
39337c478bd9Sstevel@tonic-gate 
39347c478bd9Sstevel@tonic-gate 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
39357c478bd9Sstevel@tonic-gate 
39367c478bd9Sstevel@tonic-gate 	/*
39377c478bd9Sstevel@tonic-gate 	 * Try freelists/cachelists that are farther away
39387c478bd9Sstevel@tonic-gate 	 * This is our only chance to allocate remote pages for PAGESIZE
39397c478bd9Sstevel@tonic-gate 	 * requests.
39407c478bd9Sstevel@tonic-gate 	 */
39417c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
39427c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
39437c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_freelist(mnode, bin, mtype,
39447c478bd9Sstevel@tonic-gate 		    0, flags);
39457c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
39467c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
39477c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39487c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39497c478bd9Sstevel@tonic-gate 			    int, mnode,
39507c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
39517c478bd9Sstevel@tonic-gate 			    uint_t, flags);
39527c478bd9Sstevel@tonic-gate 			return (pp);
39537c478bd9Sstevel@tonic-gate 		}
39547c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
39557c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
39567c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
39577c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39587c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39597c478bd9Sstevel@tonic-gate 			    int, mnode,
39607c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
39617c478bd9Sstevel@tonic-gate 			    uint_t, flags);
39627c478bd9Sstevel@tonic-gate 			return (pp);
39637c478bd9Sstevel@tonic-gate 		}
39647c478bd9Sstevel@tonic-gate 	}
39657c478bd9Sstevel@tonic-gate 
39667c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
39677c478bd9Sstevel@tonic-gate 	return (NULL);
39687c478bd9Sstevel@tonic-gate }
39697c478bd9Sstevel@tonic-gate 
39707c478bd9Sstevel@tonic-gate page_t *
39717c478bd9Sstevel@tonic-gate page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
39727c478bd9Sstevel@tonic-gate {
39735d07b933Sdp 	kmutex_t		*pcm;
39745d07b933Sdp 	page_t			*pp, *first_pp;
39755d07b933Sdp 	uint_t			sbin;
39765d07b933Sdp 	int			plw_initialized;
39775d07b933Sdp 	page_list_walker_t	plw;
39787c478bd9Sstevel@tonic-gate 
39797c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
39807c478bd9Sstevel@tonic-gate 
39817c478bd9Sstevel@tonic-gate 	/* LINTED */
39827c478bd9Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
39837c478bd9Sstevel@tonic-gate 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
39847c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
39857c478bd9Sstevel@tonic-gate 		return (NULL);
39867c478bd9Sstevel@tonic-gate 	}
39877c478bd9Sstevel@tonic-gate 
39885d07b933Sdp try_again:
39897c478bd9Sstevel@tonic-gate 
39905d07b933Sdp 	plw_initialized = 0;
39915d07b933Sdp 	plw.plw_ceq_dif = 1;
39927c478bd9Sstevel@tonic-gate 
39937c478bd9Sstevel@tonic-gate 	/*
39947c478bd9Sstevel@tonic-gate 	 * Only hold one cachelist lock at a time, that way we
39957c478bd9Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
39967c478bd9Sstevel@tonic-gate 	 * ordering.
39977c478bd9Sstevel@tonic-gate 	 */
39987c478bd9Sstevel@tonic-gate 
39995d07b933Sdp 	for (plw.plw_count = 0;
40005d07b933Sdp 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
40015d07b933Sdp 		sbin = bin;
40025d07b933Sdp 		do {
40035d07b933Sdp 
40045d07b933Sdp 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
40055d07b933Sdp 				goto bin_empty_1;
40067c478bd9Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
40077c478bd9Sstevel@tonic-gate 			mutex_enter(pcm);
40087c478bd9Sstevel@tonic-gate 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
40095d07b933Sdp 			if (pp == NULL)
40105d07b933Sdp 				goto bin_empty_0;
40117c478bd9Sstevel@tonic-gate 
40125d07b933Sdp 			first_pp = pp;
40135d07b933Sdp 			ASSERT(pp->p_vnode);
40145d07b933Sdp 			ASSERT(PP_ISAGED(pp) == 0);
40155d07b933Sdp 			ASSERT(pp->p_szc == 0);
40165d07b933Sdp 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
40175d07b933Sdp 			while (!page_trylock(pp, SE_EXCL)) {
40185d07b933Sdp 				pp = pp->p_next;
40195d07b933Sdp 				ASSERT(pp->p_szc == 0);
40205d07b933Sdp 				if (pp == first_pp) {
40217c478bd9Sstevel@tonic-gate 					/*
40225d07b933Sdp 					 * We have searched the complete list!
40235d07b933Sdp 					 * And all of them (might only be one)
40245d07b933Sdp 					 * are locked. This can happen since
40255d07b933Sdp 					 * these pages can also be found via
40265d07b933Sdp 					 * the hash list. When found via the
40275d07b933Sdp 					 * hash list, they are locked first,
40285d07b933Sdp 					 * then removed. We give up to let the
40295d07b933Sdp 					 * other thread run.
40307c478bd9Sstevel@tonic-gate 					 */
40315d07b933Sdp 					pp = NULL;
40325d07b933Sdp 					break;
40337c478bd9Sstevel@tonic-gate 				}
40345d07b933Sdp 				ASSERT(pp->p_vnode);
40355d07b933Sdp 				ASSERT(PP_ISFREE(pp));
40365d07b933Sdp 				ASSERT(PP_ISAGED(pp) == 0);
40375d07b933Sdp 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
40385d07b933Sdp 				    mnode);
40397c478bd9Sstevel@tonic-gate 			}
40407c478bd9Sstevel@tonic-gate 
40415d07b933Sdp 			if (pp) {
40425d07b933Sdp 				page_t	**ppp;
40435d07b933Sdp 				/*
40445d07b933Sdp 				 * Found and locked a page.
40455d07b933Sdp 				 * Pull it off the list.
40465d07b933Sdp 				 */
40475d07b933Sdp 				ASSERT(mtype == PP_2_MTYPE(pp));
40485d07b933Sdp 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
40495d07b933Sdp 				page_sub(ppp, pp);
40505d07b933Sdp 				/*
40515d07b933Sdp 				 * Subtract counters before releasing pcm mutex
40525d07b933Sdp 				 * to avoid a race with page_freelist_coalesce
40535d07b933Sdp 				 * and page_freelist_split.
40545d07b933Sdp 				 */
40555d07b933Sdp 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
40565d07b933Sdp 				mutex_exit(pcm);
40575d07b933Sdp 				ASSERT(pp->p_vnode);
40585d07b933Sdp 				ASSERT(PP_ISAGED(pp) == 0);
40595d07b933Sdp #if defined(__sparc)
40605d07b933Sdp 				ASSERT(!kcage_on ||
40615d07b933Sdp 				    (flags & PG_NORELOC) == 0 ||
40625d07b933Sdp 				    PP_ISNORELOC(pp));
40635d07b933Sdp 				if (PP_ISNORELOC(pp)) {
40645d07b933Sdp 					kcage_freemem_sub(1);
40657c478bd9Sstevel@tonic-gate 				}
40665d07b933Sdp #endif
40675d07b933Sdp 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
40685d07b933Sdp 				return (pp);
40697c478bd9Sstevel@tonic-gate 			}
40705d07b933Sdp bin_empty_0:
40715d07b933Sdp 			mutex_exit(pcm);
40725d07b933Sdp bin_empty_1:
40735d07b933Sdp 			if (plw_initialized == 0) {
40745d07b933Sdp 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
40755d07b933Sdp 				plw_initialized = 1;
40767c478bd9Sstevel@tonic-gate 			}
40775d07b933Sdp 			/* calculate the next bin with equivalent color */
40785d07b933Sdp 			bin = ADD_MASKED(bin, plw.plw_bin_step,
40795d07b933Sdp 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
40805d07b933Sdp 		} while (sbin != bin);
40817c478bd9Sstevel@tonic-gate 
40825d07b933Sdp 		if (plw.plw_ceq_dif > 1)
40835d07b933Sdp 			bin = page_list_walk_next_bin(0, bin, &plw);
40847c478bd9Sstevel@tonic-gate 	}
40857c478bd9Sstevel@tonic-gate 
4086affbd3ccSkchow 	MTYPE_NEXT(mnode, mtype, flags);
4087affbd3ccSkchow 	if (mtype >= 0)
40885d07b933Sdp 		goto try_again;
4089affbd3ccSkchow 
40907c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
40917c478bd9Sstevel@tonic-gate 	return (NULL);
40927c478bd9Sstevel@tonic-gate }
40937c478bd9Sstevel@tonic-gate 
40947c478bd9Sstevel@tonic-gate #ifdef DEBUG
40957c478bd9Sstevel@tonic-gate #define	REPL_PAGE_STATS
40967c478bd9Sstevel@tonic-gate #endif /* DEBUG */
40977c478bd9Sstevel@tonic-gate 
40987c478bd9Sstevel@tonic-gate #ifdef REPL_PAGE_STATS
40997c478bd9Sstevel@tonic-gate struct repl_page_stats {
41007c478bd9Sstevel@tonic-gate 	uint_t	ngets;
41017c478bd9Sstevel@tonic-gate 	uint_t	ngets_noreloc;
41027c478bd9Sstevel@tonic-gate 	uint_t	npgr_noreloc;
41037c478bd9Sstevel@tonic-gate 	uint_t	nnopage_first;
41047c478bd9Sstevel@tonic-gate 	uint_t	nnopage;
41057c478bd9Sstevel@tonic-gate 	uint_t	nhashout;
41067c478bd9Sstevel@tonic-gate 	uint_t	nnofree;
41077c478bd9Sstevel@tonic-gate 	uint_t	nnext_pp;
41087c478bd9Sstevel@tonic-gate } repl_page_stats;
41097c478bd9Sstevel@tonic-gate #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
41107c478bd9Sstevel@tonic-gate #else /* REPL_PAGE_STATS */
41117c478bd9Sstevel@tonic-gate #define	REPL_STAT_INCR(v)
41127c478bd9Sstevel@tonic-gate #endif /* REPL_PAGE_STATS */
41137c478bd9Sstevel@tonic-gate 
41147c478bd9Sstevel@tonic-gate int	pgrppgcp;
41157c478bd9Sstevel@tonic-gate 
41167c478bd9Sstevel@tonic-gate /*
41177c478bd9Sstevel@tonic-gate  * The freemem accounting must be done by the caller.
41187c478bd9Sstevel@tonic-gate  * First we try to get a replacement page of the same size as like_pp,
41197c478bd9Sstevel@tonic-gate  * if that is not possible, then we just get a set of discontiguous
41207c478bd9Sstevel@tonic-gate  * PAGESIZE pages.
41217c478bd9Sstevel@tonic-gate  */
41227c478bd9Sstevel@tonic-gate page_t *
41232dae3fb5Sjjc page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
41247c478bd9Sstevel@tonic-gate     uint_t pgrflags)
41257c478bd9Sstevel@tonic-gate {
41267c478bd9Sstevel@tonic-gate 	page_t		*like_pp;
41277c478bd9Sstevel@tonic-gate 	page_t		*pp, *pplist;
41287c478bd9Sstevel@tonic-gate 	page_t		*pl = NULL;
41297c478bd9Sstevel@tonic-gate 	ulong_t		bin;
41307c478bd9Sstevel@tonic-gate 	int		mnode, page_mnode;
41317c478bd9Sstevel@tonic-gate 	int		szc;
41327c478bd9Sstevel@tonic-gate 	spgcnt_t	npgs, pg_cnt;
41337c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
41347c478bd9Sstevel@tonic-gate 	int		mtype;
41357c478bd9Sstevel@tonic-gate 	int		flags = 0;
41367c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
41372dae3fb5Sjjc 	lgrp_t		*lgrp;
41387c478bd9Sstevel@tonic-gate 
41397c478bd9Sstevel@tonic-gate 	REPL_STAT_INCR(ngets);
41407c478bd9Sstevel@tonic-gate 	like_pp = orig_like_pp;
41417c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(like_pp));
41427c478bd9Sstevel@tonic-gate 
41437c478bd9Sstevel@tonic-gate 	szc = like_pp->p_szc;
41447c478bd9Sstevel@tonic-gate 	npgs = page_get_pagecnt(szc);
41457c478bd9Sstevel@tonic-gate 	/*
41467c478bd9Sstevel@tonic-gate 	 * Now we reset like_pp to the base page_t.
41477c478bd9Sstevel@tonic-gate 	 * That way, we won't walk past the end of this 'szc' page.
41487c478bd9Sstevel@tonic-gate 	 */
41497c478bd9Sstevel@tonic-gate 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
41507c478bd9Sstevel@tonic-gate 	like_pp = page_numtopp_nolock(pfnum);
41517c478bd9Sstevel@tonic-gate 	ASSERT(like_pp->p_szc == szc);
41527c478bd9Sstevel@tonic-gate 
41537c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(like_pp)) {
41547c478bd9Sstevel@tonic-gate 		ASSERT(kcage_on);
41557c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(ngets_noreloc);
41567c478bd9Sstevel@tonic-gate 		flags = PGI_RELOCONLY;
41577c478bd9Sstevel@tonic-gate 	} else if (pgrflags & PGR_NORELOC) {
41587c478bd9Sstevel@tonic-gate 		ASSERT(kcage_on);
41597c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(npgr_noreloc);
41607c478bd9Sstevel@tonic-gate 		flags = PG_NORELOC;
41617c478bd9Sstevel@tonic-gate 	}
41627c478bd9Sstevel@tonic-gate 
41637c478bd9Sstevel@tonic-gate 	/*
41647c478bd9Sstevel@tonic-gate 	 * Kernel pages must always be replaced with the same size
41657c478bd9Sstevel@tonic-gate 	 * pages, since we cannot properly handle demotion of kernel
41667c478bd9Sstevel@tonic-gate 	 * pages.
41677c478bd9Sstevel@tonic-gate 	 */
4168ad23a2dbSjohansen 	if (PP_ISKAS(like_pp))
41697c478bd9Sstevel@tonic-gate 		pgrflags |= PGR_SAMESZC;
41707c478bd9Sstevel@tonic-gate 
41717c478bd9Sstevel@tonic-gate 	/* LINTED */
417207ad560dSkchow 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
41737c478bd9Sstevel@tonic-gate 
41747c478bd9Sstevel@tonic-gate 	while (npgs) {
41757c478bd9Sstevel@tonic-gate 		pplist = NULL;
41767c478bd9Sstevel@tonic-gate 		for (;;) {
41777c478bd9Sstevel@tonic-gate 			pg_cnt = page_get_pagecnt(szc);
41787c478bd9Sstevel@tonic-gate 			bin = PP_2_BIN(like_pp);
41797c478bd9Sstevel@tonic-gate 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
41807c478bd9Sstevel@tonic-gate 			ASSERT(pg_cnt <= npgs);
41817c478bd9Sstevel@tonic-gate 
41827c478bd9Sstevel@tonic-gate 			/*
41837c478bd9Sstevel@tonic-gate 			 * If an lgroup was specified, try to get the
41847c478bd9Sstevel@tonic-gate 			 * page from that lgroup.
41852dae3fb5Sjjc 			 * NOTE: Must be careful with code below because
41862dae3fb5Sjjc 			 *	 lgroup may disappear and reappear since there
41872dae3fb5Sjjc 			 *	 is no locking for lgroup here.
41887c478bd9Sstevel@tonic-gate 			 */
41892dae3fb5Sjjc 			if (LGRP_EXISTS(lgrp_target)) {
41902dae3fb5Sjjc 				/*
41912dae3fb5Sjjc 				 * Keep local variable for lgroup separate
41922dae3fb5Sjjc 				 * from lgroup argument since this code should
41932dae3fb5Sjjc 				 * only be exercised when lgroup argument
41942dae3fb5Sjjc 				 * exists....
41952dae3fb5Sjjc 				 */
41962dae3fb5Sjjc 				lgrp = lgrp_target;
41972dae3fb5Sjjc 
41987c478bd9Sstevel@tonic-gate 				/* Try the lgroup's freelists first */
41997c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42007c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
42017c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
42027c478bd9Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42037c478bd9Sstevel@tonic-gate 				    != -1) {
4204ce8eb11aSdp 					pplist =
4205ce8eb11aSdp 					    page_get_mnode_freelist(mnode, bin,
4206ce8eb11aSdp 					    mtype, szc, flags);
42077c478bd9Sstevel@tonic-gate 				}
42087c478bd9Sstevel@tonic-gate 
42097c478bd9Sstevel@tonic-gate 				/*
42107c478bd9Sstevel@tonic-gate 				 * Now try it's cachelists if this is a
42117c478bd9Sstevel@tonic-gate 				 * small page. Don't need to do it for
42127c478bd9Sstevel@tonic-gate 				 * larger ones since page_freelist_coalesce()
42137c478bd9Sstevel@tonic-gate 				 * already failed.
42147c478bd9Sstevel@tonic-gate 				 */
42157c478bd9Sstevel@tonic-gate 				if (pplist != NULL || szc != 0)
42167c478bd9Sstevel@tonic-gate 					break;
42177c478bd9Sstevel@tonic-gate 
42187c478bd9Sstevel@tonic-gate 				/* Now try it's cachelists */
42197c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42207c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
42217c478bd9Sstevel@tonic-gate 
42227c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
42237c478bd9Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42247c478bd9Sstevel@tonic-gate 				    != -1) {
4225ce8eb11aSdp 					pplist =
4226ce8eb11aSdp 					    page_get_mnode_cachelist(bin, flags,
4227ce8eb11aSdp 					    mnode, mtype);
42287c478bd9Sstevel@tonic-gate 				}
42297c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
42307c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
42317c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
42327c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
42337c478bd9Sstevel@tonic-gate 					break;
42347c478bd9Sstevel@tonic-gate 				}
42357c478bd9Sstevel@tonic-gate 				/* Done looking in this lgroup. Bail out. */
42367c478bd9Sstevel@tonic-gate 				break;
42377c478bd9Sstevel@tonic-gate 			}
42387c478bd9Sstevel@tonic-gate 
42397c478bd9Sstevel@tonic-gate 			/*
42402dae3fb5Sjjc 			 * No lgroup was specified (or lgroup was removed by
42412dae3fb5Sjjc 			 * DR, so just try to get the page as close to
42422dae3fb5Sjjc 			 * like_pp's mnode as possible.
42437c478bd9Sstevel@tonic-gate 			 * First try the local freelist...
42447c478bd9Sstevel@tonic-gate 			 */
42457c478bd9Sstevel@tonic-gate 			mnode = PP_2_MEM_NODE(like_pp);
42467c478bd9Sstevel@tonic-gate 			pplist = page_get_mnode_freelist(mnode, bin,
42477c478bd9Sstevel@tonic-gate 			    mtype, szc, flags);
42487c478bd9Sstevel@tonic-gate 			if (pplist != NULL)
42497c478bd9Sstevel@tonic-gate 				break;
42507c478bd9Sstevel@tonic-gate 
42517c478bd9Sstevel@tonic-gate 			REPL_STAT_INCR(nnofree);
42527c478bd9Sstevel@tonic-gate 
42537c478bd9Sstevel@tonic-gate 			/*
42547c478bd9Sstevel@tonic-gate 			 * ...then the local cachelist. Don't need to do it for
42557c478bd9Sstevel@tonic-gate 			 * larger pages cause page_freelist_coalesce() already
42567c478bd9Sstevel@tonic-gate 			 * failed there anyway.
42577c478bd9Sstevel@tonic-gate 			 */
42587c478bd9Sstevel@tonic-gate 			if (szc == 0) {
42597c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin, flags,
42607c478bd9Sstevel@tonic-gate 				    mnode, mtype);
42617c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
42627c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
42637c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
42647c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
42657c478bd9Sstevel@tonic-gate 					break;
42667c478bd9Sstevel@tonic-gate 				}
42677c478bd9Sstevel@tonic-gate 			}
42687c478bd9Sstevel@tonic-gate 
42697c478bd9Sstevel@tonic-gate 			/* Now try remote freelists */
42707c478bd9Sstevel@tonic-gate 			page_mnode = mnode;
42717c478bd9Sstevel@tonic-gate 			lgrp =
42727c478bd9Sstevel@tonic-gate 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
42737c478bd9Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42747c478bd9Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
42757c478bd9Sstevel@tonic-gate 			while (pplist == NULL &&
42767c478bd9Sstevel@tonic-gate 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42777c478bd9Sstevel@tonic-gate 			    != -1) {
42787c478bd9Sstevel@tonic-gate 				/*
42797c478bd9Sstevel@tonic-gate 				 * Skip local mnode.
42807c478bd9Sstevel@tonic-gate 				 */
42817c478bd9Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
42827c478bd9Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
42837c478bd9Sstevel@tonic-gate 					continue;
42847c478bd9Sstevel@tonic-gate 
42857c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_freelist(mnode,
42867c478bd9Sstevel@tonic-gate 				    bin, mtype, szc, flags);
42877c478bd9Sstevel@tonic-gate 			}
42887c478bd9Sstevel@tonic-gate 
42897c478bd9Sstevel@tonic-gate 			if (pplist != NULL)
42907c478bd9Sstevel@tonic-gate 				break;
42917c478bd9Sstevel@tonic-gate 
42927c478bd9Sstevel@tonic-gate 
42937c478bd9Sstevel@tonic-gate 			/* Now try remote cachelists */
42947c478bd9Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42957c478bd9Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
42967c478bd9Sstevel@tonic-gate 			while (pplist == NULL && szc == 0) {
42977c478bd9Sstevel@tonic-gate 				mnode = lgrp_memnode_choose(&lgrp_cookie);
42987c478bd9Sstevel@tonic-gate 				if (mnode == -1)
42997c478bd9Sstevel@tonic-gate 					break;
43007c478bd9Sstevel@tonic-gate 				/*
43017c478bd9Sstevel@tonic-gate 				 * Skip local mnode.
43027c478bd9Sstevel@tonic-gate 				 */
43037c478bd9Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
43047c478bd9Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
43057c478bd9Sstevel@tonic-gate 					continue;
43067c478bd9Sstevel@tonic-gate 
43077c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin,
43087c478bd9Sstevel@tonic-gate 				    flags, mnode, mtype);
43097c478bd9Sstevel@tonic-gate 
43107c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
43117c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
43127c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
43137c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
43147c478bd9Sstevel@tonic-gate 					break;
43157c478bd9Sstevel@tonic-gate 				}
43167c478bd9Sstevel@tonic-gate 			}
43177c478bd9Sstevel@tonic-gate 
43187c478bd9Sstevel@tonic-gate 			/*
43197c478bd9Sstevel@tonic-gate 			 * Break out of while loop under the following cases:
43207c478bd9Sstevel@tonic-gate 			 * - If we successfully got a page.
43217c478bd9Sstevel@tonic-gate 			 * - If pgrflags specified only returning a specific
43227c478bd9Sstevel@tonic-gate 			 *   page size and we could not find that page size.
43237c478bd9Sstevel@tonic-gate 			 * - If we could not satisfy the request with PAGESIZE
43247c478bd9Sstevel@tonic-gate 			 *   or larger pages.
43257c478bd9Sstevel@tonic-gate 			 */
43267c478bd9Sstevel@tonic-gate 			if (pplist != NULL || szc == 0)
43277c478bd9Sstevel@tonic-gate 				break;
43287c478bd9Sstevel@tonic-gate 
43297c478bd9Sstevel@tonic-gate 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
43307c478bd9Sstevel@tonic-gate 				/* try to find contig page */
43317c478bd9Sstevel@tonic-gate 
43327c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
43337c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_HIER);
43347c478bd9Sstevel@tonic-gate 
43357c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
43367c478bd9Sstevel@tonic-gate 				    (mnode =
4337ce8eb11aSdp 				    lgrp_memnode_choose(&lgrp_cookie))
43387c478bd9Sstevel@tonic-gate 				    != -1) {
43397c478bd9Sstevel@tonic-gate 					pplist = page_get_contig_pages(
4340ce8eb11aSdp 					    mnode, bin, mtype, szc,
4341ce8eb11aSdp 					    flags | PGI_PGCPHIPRI);
43427c478bd9Sstevel@tonic-gate 				}
43437c478bd9Sstevel@tonic-gate 				break;
43447c478bd9Sstevel@tonic-gate 			}
43457c478bd9Sstevel@tonic-gate 
43467c478bd9Sstevel@tonic-gate 			/*
43477c478bd9Sstevel@tonic-gate 			 * The correct thing to do here is try the next
43487c478bd9Sstevel@tonic-gate 			 * page size down using szc--. Due to a bug
43497c478bd9Sstevel@tonic-gate 			 * with the processing of HAT_RELOAD_SHARE
43507c478bd9Sstevel@tonic-gate 			 * where the sfmmu_ttecnt arrays of all
43517c478bd9Sstevel@tonic-gate 			 * hats sharing an ISM segment don't get updated,
43527c478bd9Sstevel@tonic-gate 			 * using intermediate size pages for relocation
43537c478bd9Sstevel@tonic-gate 			 * can lead to continuous page faults.
43547c478bd9Sstevel@tonic-gate 			 */
43557c478bd9Sstevel@tonic-gate 			szc = 0;
43567c478bd9Sstevel@tonic-gate 		}
43577c478bd9Sstevel@tonic-gate 
43587c478bd9Sstevel@tonic-gate 		if (pplist != NULL) {
43597c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
43607c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
43617c478bd9Sstevel@tonic-gate 			    int, mnode,
43627c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
43637c478bd9Sstevel@tonic-gate 			    uint_t, flags);
43647c478bd9Sstevel@tonic-gate 
43657c478bd9Sstevel@tonic-gate 			while (pplist != NULL && pg_cnt--) {
43667c478bd9Sstevel@tonic-gate 				ASSERT(pplist != NULL);
43677c478bd9Sstevel@tonic-gate 				pp = pplist;
43687c478bd9Sstevel@tonic-gate 				page_sub(&pplist, pp);
43697c478bd9Sstevel@tonic-gate 				PP_CLRFREE(pp);
43707c478bd9Sstevel@tonic-gate 				PP_CLRAGED(pp);
43717c478bd9Sstevel@tonic-gate 				page_list_concat(&pl, &pp);
43727c478bd9Sstevel@tonic-gate 				npgs--;
43737c478bd9Sstevel@tonic-gate 				like_pp = like_pp + 1;
43747c478bd9Sstevel@tonic-gate 				REPL_STAT_INCR(nnext_pp);
43757c478bd9Sstevel@tonic-gate 			}
43767c478bd9Sstevel@tonic-gate 			ASSERT(pg_cnt == 0);
43777c478bd9Sstevel@tonic-gate 		} else {
43787c478bd9Sstevel@tonic-gate 			break;
43797c478bd9Sstevel@tonic-gate 		}
43807c478bd9Sstevel@tonic-gate 	}
43817c478bd9Sstevel@tonic-gate 
43827c478bd9Sstevel@tonic-gate 	if (npgs) {
43837c478bd9Sstevel@tonic-gate 		/*
43847c478bd9Sstevel@tonic-gate 		 * We were unable to allocate the necessary number
43857c478bd9Sstevel@tonic-gate 		 * of pages.
43867c478bd9Sstevel@tonic-gate 		 * We need to free up any pl.
43877c478bd9Sstevel@tonic-gate 		 */
43887c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(nnopage);
43897c478bd9Sstevel@tonic-gate 		page_free_replacement_page(pl);
43907c478bd9Sstevel@tonic-gate 		return (NULL);
43917c478bd9Sstevel@tonic-gate 	} else {
43927c478bd9Sstevel@tonic-gate 		return (pl);
43937c478bd9Sstevel@tonic-gate 	}
43947c478bd9Sstevel@tonic-gate }
43957c478bd9Sstevel@tonic-gate 
43967c478bd9Sstevel@tonic-gate /*
43977c478bd9Sstevel@tonic-gate  * demote a free large page to it's constituent pages
43987c478bd9Sstevel@tonic-gate  */
43997c478bd9Sstevel@tonic-gate void
44007c478bd9Sstevel@tonic-gate page_demote_free_pages(page_t *pp)
44017c478bd9Sstevel@tonic-gate {
44027c478bd9Sstevel@tonic-gate 
44037c478bd9Sstevel@tonic-gate 	int mnode;
44047c478bd9Sstevel@tonic-gate 
44057c478bd9Sstevel@tonic-gate 	ASSERT(pp != NULL);
44067c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
44077c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
44087c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
44097c478bd9Sstevel@tonic-gate 
44107c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
44117c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
44127c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0) {
44137c478bd9Sstevel@tonic-gate 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
441419397407SSherry Moore 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
44157c478bd9Sstevel@tonic-gate 	}
44167c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
44177c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
44187c478bd9Sstevel@tonic-gate }
4419932dc8e5Sdp 
4420932dc8e5Sdp /*
4421932dc8e5Sdp  * Factor in colorequiv to check additional 'equivalent' bins.
4422932dc8e5Sdp  * colorequiv may be set in /etc/system
4423932dc8e5Sdp  */
4424932dc8e5Sdp void
4425932dc8e5Sdp page_set_colorequiv_arr(void)
4426932dc8e5Sdp {
4427932dc8e5Sdp 	if (colorequiv > 1) {
4428932dc8e5Sdp 		int i;
4429fe70c9cfSdp 		uint_t sv_a = lowbit(colorequiv) - 1;
4430932dc8e5Sdp 
4431fe70c9cfSdp 		if (sv_a > 15)
4432fe70c9cfSdp 			sv_a = 15;
4433932dc8e5Sdp 
4434932dc8e5Sdp 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4435fe70c9cfSdp 			uint_t colors;
4436fe70c9cfSdp 			uint_t a = sv_a;
4437932dc8e5Sdp 
4438932dc8e5Sdp 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4439932dc8e5Sdp 				continue;
4440932dc8e5Sdp 			}
4441932dc8e5Sdp 			while ((colors >> a) == 0)
4442932dc8e5Sdp 				a--;
4443932dc8e5Sdp 			if ((a << 4) > colorequivszc[i]) {
4444932dc8e5Sdp 				colorequivszc[i] = (a << 4);
4445932dc8e5Sdp 			}
4446932dc8e5Sdp 		}
4447932dc8e5Sdp 	}
4448932dc8e5Sdp }
4449