xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 0b5aa17b)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0b5aa17bSmec  * Common Development and Distribution License (the "License").
6*0b5aa17bSmec  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22e21bae1bSkchow  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
277c478bd9Sstevel@tonic-gate /*	All Rights Reserved   */
287c478bd9Sstevel@tonic-gate 
297c478bd9Sstevel@tonic-gate /*
307c478bd9Sstevel@tonic-gate  * Portions of this source code were derived from Berkeley 4.3 BSD
317c478bd9Sstevel@tonic-gate  * under license from the Regents of the University of California.
327c478bd9Sstevel@tonic-gate  */
337c478bd9Sstevel@tonic-gate 
347c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
357c478bd9Sstevel@tonic-gate 
367c478bd9Sstevel@tonic-gate /*
377c478bd9Sstevel@tonic-gate  * This file contains common functions to access and manage the page lists.
387c478bd9Sstevel@tonic-gate  * Many of these routines originated from platform dependent modules
397c478bd9Sstevel@tonic-gate  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
407c478bd9Sstevel@tonic-gate  * a platform independent manner.
417c478bd9Sstevel@tonic-gate  *
427c478bd9Sstevel@tonic-gate  * vm/vm_dep.h provides for platform specific support.
437c478bd9Sstevel@tonic-gate  */
447c478bd9Sstevel@tonic-gate 
457c478bd9Sstevel@tonic-gate #include <sys/types.h>
467c478bd9Sstevel@tonic-gate #include <sys/debug.h>
477c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
487c478bd9Sstevel@tonic-gate #include <sys/systm.h>
497c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
507c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
517c478bd9Sstevel@tonic-gate #include <vm/as.h>
527c478bd9Sstevel@tonic-gate #include <vm/page.h>
537c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
547c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
557c478bd9Sstevel@tonic-gate #include <sys/memnode.h>
567c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h>
577c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
587c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
597c478bd9Sstevel@tonic-gate #include <sys/callb.h>
607c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
617c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
627c478bd9Sstevel@tonic-gate 
637c478bd9Sstevel@tonic-gate extern uint_t	vac_colors;
647c478bd9Sstevel@tonic-gate 
656061ce8aSkchow #define	MAX_PRAGMA_ALIGN	128
666061ce8aSkchow 
676061ce8aSkchow /* vm_cpu_data0 for the boot cpu before kmem is initialized */
686061ce8aSkchow 
696061ce8aSkchow #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
70affbd3ccSkchow #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
716061ce8aSkchow #else
726061ce8aSkchow #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
736061ce8aSkchow #endif
74affbd3ccSkchow char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75affbd3ccSkchow 
767c478bd9Sstevel@tonic-gate /*
777c478bd9Sstevel@tonic-gate  * number of page colors equivalent to requested color in page_get routines.
787c478bd9Sstevel@tonic-gate  * If set, keeps large pages intact longer and keeps MPO allocation
797c478bd9Sstevel@tonic-gate  * from the local mnode in favor of acquiring the 'correct' page color from
807c478bd9Sstevel@tonic-gate  * a demoted large page or from a remote mnode.
817c478bd9Sstevel@tonic-gate  */
827c478bd9Sstevel@tonic-gate int	colorequiv;
837c478bd9Sstevel@tonic-gate 
847c478bd9Sstevel@tonic-gate /*
857c478bd9Sstevel@tonic-gate  * if set, specifies the percentage of large pages that are free from within
867c478bd9Sstevel@tonic-gate  * a large page region before attempting to lock those pages for
877c478bd9Sstevel@tonic-gate  * page_get_contig_pages processing.
887c478bd9Sstevel@tonic-gate  *
897c478bd9Sstevel@tonic-gate  * Should be turned on when kpr is available when page_trylock_contig_pages
907c478bd9Sstevel@tonic-gate  * can be more selective.
917c478bd9Sstevel@tonic-gate  */
927c478bd9Sstevel@tonic-gate 
937c478bd9Sstevel@tonic-gate int	ptcpthreshold;
947c478bd9Sstevel@tonic-gate 
957c478bd9Sstevel@tonic-gate /*
967c478bd9Sstevel@tonic-gate  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
9783f9b804Skchow  * Enabled by default via pgcplimitsearch.
9883f9b804Skchow  *
9983f9b804Skchow  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
10083f9b804Skchow  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
10183f9b804Skchow  * bound. This upper bound range guarantees:
10283f9b804Skchow  *    - all large page 'slots' will be searched over time
10383f9b804Skchow  *    - the minimum (1) large page candidates considered on each pgcp call
10483f9b804Skchow  *    - count doesn't wrap around to 0
1057c478bd9Sstevel@tonic-gate  */
10683f9b804Skchow pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
1077c478bd9Sstevel@tonic-gate int	pgcplimitsearch = 1;
1087c478bd9Sstevel@tonic-gate 
10983f9b804Skchow #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
11083f9b804Skchow #define	SETPGCPFAILCNT(szc)						\
11183f9b804Skchow 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
11283f9b804Skchow 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
11383f9b804Skchow 
1147c478bd9Sstevel@tonic-gate #ifdef VM_STATS
1157c478bd9Sstevel@tonic-gate struct vmm_vmstats_str  vmm_vmstats;
1167c478bd9Sstevel@tonic-gate 
1177c478bd9Sstevel@tonic-gate #endif /* VM_STATS */
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate #if defined(__sparc)
1207c478bd9Sstevel@tonic-gate #define	LPGCREATE	0
1217c478bd9Sstevel@tonic-gate #else
1227c478bd9Sstevel@tonic-gate /* enable page_get_contig_pages */
1237c478bd9Sstevel@tonic-gate #define	LPGCREATE	1
1247c478bd9Sstevel@tonic-gate #endif
1257c478bd9Sstevel@tonic-gate 
1267c478bd9Sstevel@tonic-gate int pg_contig_disable;
1277c478bd9Sstevel@tonic-gate int pg_lpgcreate_nocage = LPGCREATE;
1287c478bd9Sstevel@tonic-gate 
1297c478bd9Sstevel@tonic-gate /*
1307c478bd9Sstevel@tonic-gate  * page_freelist_fill pfn flag to signify no hi pfn requirement.
1317c478bd9Sstevel@tonic-gate  */
1327c478bd9Sstevel@tonic-gate #define	PFNNULL		0
1337c478bd9Sstevel@tonic-gate 
1347c478bd9Sstevel@tonic-gate /* Flags involved in promotion and demotion routines */
1357c478bd9Sstevel@tonic-gate #define	PC_FREE		0x1	/* put page on freelist */
1367c478bd9Sstevel@tonic-gate #define	PC_ALLOC	0x2	/* return page for allocation */
1377c478bd9Sstevel@tonic-gate 
1387c478bd9Sstevel@tonic-gate /*
1397c478bd9Sstevel@tonic-gate  * Flag for page_demote to be used with PC_FREE to denote that we don't care
1407c478bd9Sstevel@tonic-gate  * what the color is as the color parameter to the function is ignored.
1417c478bd9Sstevel@tonic-gate  */
1427c478bd9Sstevel@tonic-gate #define	PC_NO_COLOR	(-1)
1437c478bd9Sstevel@tonic-gate 
1447c478bd9Sstevel@tonic-gate /*
1457c478bd9Sstevel@tonic-gate  * page counters candidates info
1467c478bd9Sstevel@tonic-gate  * See page_ctrs_cands comment below for more details.
1477c478bd9Sstevel@tonic-gate  * fields are as follows:
1487c478bd9Sstevel@tonic-gate  *	pcc_pages_free:		# pages which freelist coalesce can create
1497c478bd9Sstevel@tonic-gate  *	pcc_color_free_len:	number of elements in pcc_color_free array
1507c478bd9Sstevel@tonic-gate  *	pcc_color_free:		pointer to page free counts per color
1517c478bd9Sstevel@tonic-gate  */
1527c478bd9Sstevel@tonic-gate typedef struct pcc_info {
1537c478bd9Sstevel@tonic-gate 	pgcnt_t	pcc_pages_free;
1547c478bd9Sstevel@tonic-gate 	int	pcc_color_free_len;
1557c478bd9Sstevel@tonic-gate 	pgcnt_t	*pcc_color_free;
1567c478bd9Sstevel@tonic-gate } pcc_info_t;
1577c478bd9Sstevel@tonic-gate 
1587c478bd9Sstevel@tonic-gate /*
1597c478bd9Sstevel@tonic-gate  * On big machines it can take a long time to check page_counters
1607c478bd9Sstevel@tonic-gate  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
1617c478bd9Sstevel@tonic-gate  * updated sum of all elements of the corresponding page_counters arrays.
1627c478bd9Sstevel@tonic-gate  * page_freelist_coalesce() searches page_counters only if an appropriate
1637c478bd9Sstevel@tonic-gate  * element of page_ctrs_cands array is greater than 0.
1647c478bd9Sstevel@tonic-gate  *
1657c478bd9Sstevel@tonic-gate  * An extra dimension is used for page_ctrs_cands to spread the elements
1667c478bd9Sstevel@tonic-gate  * over a few e$ cache lines to avoid serialization during the array
1677c478bd9Sstevel@tonic-gate  * updates.
1687c478bd9Sstevel@tonic-gate  */
1697c478bd9Sstevel@tonic-gate #pragma	align 64(page_ctrs_cands)
1707c478bd9Sstevel@tonic-gate 
1717c478bd9Sstevel@tonic-gate static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
1727c478bd9Sstevel@tonic-gate 
1737c478bd9Sstevel@tonic-gate /*
1747c478bd9Sstevel@tonic-gate  * Return in val the total number of free pages which can be created
1757c478bd9Sstevel@tonic-gate  * for the given mnode (m) and region size (r)
1767c478bd9Sstevel@tonic-gate  */
1777c478bd9Sstevel@tonic-gate #define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
1787c478bd9Sstevel@tonic-gate 	int i;								\
1797c478bd9Sstevel@tonic-gate 	val = 0;							\
1807c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {				\
1817c478bd9Sstevel@tonic-gate 	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
1827c478bd9Sstevel@tonic-gate 	}								\
1837c478bd9Sstevel@tonic-gate }
1847c478bd9Sstevel@tonic-gate 
1857c478bd9Sstevel@tonic-gate /*
1867c478bd9Sstevel@tonic-gate  * Return in val the total number of free pages which can be created
1877c478bd9Sstevel@tonic-gate  * for the given mnode (m), region size (r), and color (c)
1887c478bd9Sstevel@tonic-gate  */
1897c478bd9Sstevel@tonic-gate #define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
1907c478bd9Sstevel@tonic-gate 	int i;								\
1917c478bd9Sstevel@tonic-gate 	val = 0;							\
1927c478bd9Sstevel@tonic-gate 	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
1937c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {				\
1947c478bd9Sstevel@tonic-gate 	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
1957c478bd9Sstevel@tonic-gate 	}								\
1967c478bd9Sstevel@tonic-gate }
1977c478bd9Sstevel@tonic-gate 
1987c478bd9Sstevel@tonic-gate /*
1997c478bd9Sstevel@tonic-gate  * We can only allow a single thread to update a counter within the physical
2007c478bd9Sstevel@tonic-gate  * range of the largest supported page size. That is the finest granularity
2017c478bd9Sstevel@tonic-gate  * possible since the counter values are dependent on each other
2027c478bd9Sstevel@tonic-gate  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
2037c478bd9Sstevel@tonic-gate  * ctr_mutex lock index for a particular physical range.
2047c478bd9Sstevel@tonic-gate  */
2057c478bd9Sstevel@tonic-gate static kmutex_t	*ctr_mutex[NPC_MUTEX];
2067c478bd9Sstevel@tonic-gate 
2077c478bd9Sstevel@tonic-gate #define	PP_CTR_LOCK_INDX(pp)						\
2087c478bd9Sstevel@tonic-gate 	(((pp)->p_pagenum >>					\
2097c478bd9Sstevel@tonic-gate 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
2107c478bd9Sstevel@tonic-gate 
2117c478bd9Sstevel@tonic-gate /*
2127c478bd9Sstevel@tonic-gate  * Local functions prototypes.
2137c478bd9Sstevel@tonic-gate  */
2147c478bd9Sstevel@tonic-gate 
215affbd3ccSkchow void page_ctr_add(int, int, page_t *, int);
216affbd3ccSkchow void page_ctr_add_internal(int, int, page_t *, int);
217affbd3ccSkchow void page_ctr_sub(int, int, page_t *, int);
2187c478bd9Sstevel@tonic-gate uint_t  page_convert_color(uchar_t, uchar_t, uint_t);
2197c478bd9Sstevel@tonic-gate void page_freelist_lock(int);
2207c478bd9Sstevel@tonic-gate void page_freelist_unlock(int);
2217c478bd9Sstevel@tonic-gate page_t *page_promote(int, pfn_t, uchar_t, int);
2227c478bd9Sstevel@tonic-gate page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
2237c478bd9Sstevel@tonic-gate page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
2247c478bd9Sstevel@tonic-gate page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
2257c478bd9Sstevel@tonic-gate static int page_trylock_cons(page_t *pp, se_t se);
2267c478bd9Sstevel@tonic-gate 
2277c478bd9Sstevel@tonic-gate #define	PNUM_SIZE(szc)							\
2287c478bd9Sstevel@tonic-gate 	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
2297c478bd9Sstevel@tonic-gate #define	PNUM_SHIFT(szc)							\
2307c478bd9Sstevel@tonic-gate 	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
2317c478bd9Sstevel@tonic-gate 
2327c478bd9Sstevel@tonic-gate /*
2337c478bd9Sstevel@tonic-gate  * The page_counters array below is used to keep track of free contiguous
2347c478bd9Sstevel@tonic-gate  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
2357c478bd9Sstevel@tonic-gate  * This contains an array of counters, the size of the array, a shift value
2367c478bd9Sstevel@tonic-gate  * used to convert a pagenum into a counter array index or vice versa, as
2377c478bd9Sstevel@tonic-gate  * well as a cache of the last successful index to be promoted to a larger
2387c478bd9Sstevel@tonic-gate  * page size.  As an optimization, we keep track of the last successful index
2397c478bd9Sstevel@tonic-gate  * to be promoted per page color for the given size region, and this is
2407c478bd9Sstevel@tonic-gate  * allocated dynamically based upon the number of colors for a given
2417c478bd9Sstevel@tonic-gate  * region size.
2427c478bd9Sstevel@tonic-gate  *
2437c478bd9Sstevel@tonic-gate  * Conceptually, the page counters are represented as:
2447c478bd9Sstevel@tonic-gate  *
2457c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode]
2467c478bd9Sstevel@tonic-gate  *
2477c478bd9Sstevel@tonic-gate  *	region_size:	size code of a candidate larger page made up
2487c478bd9Sstevel@tonic-gate  *			of contiguous free smaller pages.
2497c478bd9Sstevel@tonic-gate  *
2507c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode].hpm_counters[index]:
2517c478bd9Sstevel@tonic-gate  *		represents how many (region_size - 1) pages either
2527c478bd9Sstevel@tonic-gate  *		exist or can be created within the given index range.
2537c478bd9Sstevel@tonic-gate  *
2547c478bd9Sstevel@tonic-gate  * Let's look at a sparc example:
2557c478bd9Sstevel@tonic-gate  *	If we want to create a free 512k page, we look at region_size 2
2567c478bd9Sstevel@tonic-gate  *	for the mnode we want.  We calculate the index and look at a specific
2577c478bd9Sstevel@tonic-gate  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
2587c478bd9Sstevel@tonic-gate  *	this location, it means that 8 64k pages either exist or can be created
2597c478bd9Sstevel@tonic-gate  *	from 8K pages in order to make a single free 512k page at the given
2607c478bd9Sstevel@tonic-gate  *	index.  Note that when a region is full, it will contribute to the
2617c478bd9Sstevel@tonic-gate  *	counts in the region above it.  Thus we will not know what page
2627c478bd9Sstevel@tonic-gate  *	size the free pages will be which can be promoted to this new free
2637c478bd9Sstevel@tonic-gate  *	page unless we look at all regions below the current region.
2647c478bd9Sstevel@tonic-gate  */
2657c478bd9Sstevel@tonic-gate 
2667c478bd9Sstevel@tonic-gate /*
2677c478bd9Sstevel@tonic-gate  * Note: hpmctr_t is defined in platform vm_dep.h
2687c478bd9Sstevel@tonic-gate  * hw_page_map_t contains all the information needed for the page_counters
2697c478bd9Sstevel@tonic-gate  * logic. The fields are as follows:
2707c478bd9Sstevel@tonic-gate  *
2717c478bd9Sstevel@tonic-gate  *	hpm_counters:	dynamically allocated array to hold counter data
2727c478bd9Sstevel@tonic-gate  *	hpm_entries:	entries in hpm_counters
2737c478bd9Sstevel@tonic-gate  *	hpm_shift:	shift for pnum/array index conv
2747c478bd9Sstevel@tonic-gate  *	hpm_base:	PFN mapped to counter index 0
2757c478bd9Sstevel@tonic-gate  *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
2767c478bd9Sstevel@tonic-gate  *	hpm_color_current:	last index in counter array for this color at
2777c478bd9Sstevel@tonic-gate  *				which we successfully created a large page
2787c478bd9Sstevel@tonic-gate  */
2797c478bd9Sstevel@tonic-gate typedef struct hw_page_map {
2807c478bd9Sstevel@tonic-gate 	hpmctr_t	*hpm_counters;
2817c478bd9Sstevel@tonic-gate 	size_t		hpm_entries;
2827c478bd9Sstevel@tonic-gate 	int		hpm_shift;
2837c478bd9Sstevel@tonic-gate 	pfn_t		hpm_base;
2847c478bd9Sstevel@tonic-gate 	size_t		hpm_color_current_len;
2857c478bd9Sstevel@tonic-gate 	size_t 		*hpm_color_current;
2867c478bd9Sstevel@tonic-gate } hw_page_map_t;
2877c478bd9Sstevel@tonic-gate 
2887c478bd9Sstevel@tonic-gate /*
2897c478bd9Sstevel@tonic-gate  * Element zero is not used, but is allocated for convenience.
2907c478bd9Sstevel@tonic-gate  */
2917c478bd9Sstevel@tonic-gate static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
2927c478bd9Sstevel@tonic-gate 
2937c478bd9Sstevel@tonic-gate /*
2947c478bd9Sstevel@tonic-gate  * The following macros are convenient ways to get access to the individual
2957c478bd9Sstevel@tonic-gate  * elements of the page_counters arrays.  They can be used on both
2967c478bd9Sstevel@tonic-gate  * the left side and right side of equations.
2977c478bd9Sstevel@tonic-gate  */
2987c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
2997c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
3007c478bd9Sstevel@tonic-gate 
3017c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
3027c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
3037c478bd9Sstevel@tonic-gate 
3047c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
3057c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
3067c478bd9Sstevel@tonic-gate 
3077c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
3087c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
3097c478bd9Sstevel@tonic-gate 
3107c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
3117c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_base)
3127c478bd9Sstevel@tonic-gate 
3137c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
3147c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
3157c478bd9Sstevel@tonic-gate 
3167c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
3177c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_color_current)
3187c478bd9Sstevel@tonic-gate 
3197c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
3207c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
3217c478bd9Sstevel@tonic-gate 
3227c478bd9Sstevel@tonic-gate #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
3237c478bd9Sstevel@tonic-gate 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
3247c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
3257c478bd9Sstevel@tonic-gate 
3267c478bd9Sstevel@tonic-gate #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
3277c478bd9Sstevel@tonic-gate 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
3287c478bd9Sstevel@tonic-gate 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
3297c478bd9Sstevel@tonic-gate 
3307c478bd9Sstevel@tonic-gate /*
3317c478bd9Sstevel@tonic-gate  * Protects the hpm_counters and hpm_color_current memory from changing while
3327c478bd9Sstevel@tonic-gate  * looking at page counters information.
3337c478bd9Sstevel@tonic-gate  * Grab the write lock to modify what these fields point at.
3347c478bd9Sstevel@tonic-gate  * Grab the read lock to prevent any pointers from changing.
3357c478bd9Sstevel@tonic-gate  * The write lock can not be held during memory allocation due to a possible
3367c478bd9Sstevel@tonic-gate  * recursion deadlock with trying to grab the read lock while the
3377c478bd9Sstevel@tonic-gate  * write lock is already held.
3387c478bd9Sstevel@tonic-gate  */
3397c478bd9Sstevel@tonic-gate krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
3407c478bd9Sstevel@tonic-gate 
341affbd3ccSkchow 
342affbd3ccSkchow /*
343affbd3ccSkchow  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
344affbd3ccSkchow  */
345affbd3ccSkchow void
346affbd3ccSkchow cpu_vm_data_init(struct cpu *cp)
347affbd3ccSkchow {
348affbd3ccSkchow 	if (cp == CPU0) {
349affbd3ccSkchow 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
350affbd3ccSkchow 	} else {
351affbd3ccSkchow 		void	*kmptr;
3526061ce8aSkchow 		int	align;
3536061ce8aSkchow 		size_t	sz;
354affbd3ccSkchow 
3556061ce8aSkchow 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
3566061ce8aSkchow 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
3576061ce8aSkchow 		kmptr = kmem_zalloc(sz, KM_SLEEP);
358affbd3ccSkchow 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
359affbd3ccSkchow 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
3606061ce8aSkchow 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
361affbd3ccSkchow 	}
362affbd3ccSkchow }
363affbd3ccSkchow 
364affbd3ccSkchow /*
365affbd3ccSkchow  * free cpu_vm_data
366affbd3ccSkchow  */
367affbd3ccSkchow void
368affbd3ccSkchow cpu_vm_data_destroy(struct cpu *cp)
369affbd3ccSkchow {
370affbd3ccSkchow 	if (cp->cpu_seqid && cp->cpu_vm_data) {
371affbd3ccSkchow 		ASSERT(cp != CPU0);
372affbd3ccSkchow 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
3736061ce8aSkchow 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
374affbd3ccSkchow 	}
375affbd3ccSkchow 	cp->cpu_vm_data = NULL;
376affbd3ccSkchow }
377affbd3ccSkchow 
378affbd3ccSkchow 
3797c478bd9Sstevel@tonic-gate /*
3807c478bd9Sstevel@tonic-gate  * page size to page size code
3817c478bd9Sstevel@tonic-gate  */
3827c478bd9Sstevel@tonic-gate int
3837c478bd9Sstevel@tonic-gate page_szc(size_t pagesize)
3847c478bd9Sstevel@tonic-gate {
3857c478bd9Sstevel@tonic-gate 	int	i = 0;
3867c478bd9Sstevel@tonic-gate 
3877c478bd9Sstevel@tonic-gate 	while (hw_page_array[i].hp_size) {
3887c478bd9Sstevel@tonic-gate 		if (pagesize == hw_page_array[i].hp_size)
3897c478bd9Sstevel@tonic-gate 			return (i);
3907c478bd9Sstevel@tonic-gate 		i++;
3917c478bd9Sstevel@tonic-gate 	}
3927c478bd9Sstevel@tonic-gate 	return (-1);
3937c478bd9Sstevel@tonic-gate }
3947c478bd9Sstevel@tonic-gate 
3957c478bd9Sstevel@tonic-gate /*
3964abce959Smec  * page size to page size code with the restriction that it be a supported
3974abce959Smec  * user page size.  If it's not a supported user page size, -1 will be returned.
3987c478bd9Sstevel@tonic-gate  */
3997c478bd9Sstevel@tonic-gate int
4004abce959Smec page_szc_user_filtered(size_t pagesize)
4017c478bd9Sstevel@tonic-gate {
4027c478bd9Sstevel@tonic-gate 	int szc = page_szc(pagesize);
4034abce959Smec 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
4044abce959Smec 		return (szc);
4054abce959Smec 	}
4067c478bd9Sstevel@tonic-gate 	return (-1);
4077c478bd9Sstevel@tonic-gate }
4087c478bd9Sstevel@tonic-gate 
4097c478bd9Sstevel@tonic-gate /*
4107c478bd9Sstevel@tonic-gate  * Return how many page sizes are available for the user to use.  This is
4117c478bd9Sstevel@tonic-gate  * what the hardware supports and not based upon how the OS implements the
4127c478bd9Sstevel@tonic-gate  * support of different page sizes.
4137c478bd9Sstevel@tonic-gate  */
4147c478bd9Sstevel@tonic-gate uint_t
4157c478bd9Sstevel@tonic-gate page_num_user_pagesizes(void)
4167c478bd9Sstevel@tonic-gate {
4177c478bd9Sstevel@tonic-gate 	return (mmu_exported_page_sizes);
4187c478bd9Sstevel@tonic-gate }
4197c478bd9Sstevel@tonic-gate 
4207c478bd9Sstevel@tonic-gate uint_t
4217c478bd9Sstevel@tonic-gate page_num_pagesizes(void)
4227c478bd9Sstevel@tonic-gate {
4237c478bd9Sstevel@tonic-gate 	return (mmu_page_sizes);
4247c478bd9Sstevel@tonic-gate }
4257c478bd9Sstevel@tonic-gate 
4267c478bd9Sstevel@tonic-gate /*
4277c478bd9Sstevel@tonic-gate  * returns the count of the number of base pagesize pages associated with szc
4287c478bd9Sstevel@tonic-gate  */
4297c478bd9Sstevel@tonic-gate pgcnt_t
4307c478bd9Sstevel@tonic-gate page_get_pagecnt(uint_t szc)
4317c478bd9Sstevel@tonic-gate {
4327c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4337c478bd9Sstevel@tonic-gate 		panic("page_get_pagecnt: out of range %d", szc);
4347c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_pgcnt);
4357c478bd9Sstevel@tonic-gate }
4367c478bd9Sstevel@tonic-gate 
4377c478bd9Sstevel@tonic-gate size_t
4387c478bd9Sstevel@tonic-gate page_get_pagesize(uint_t szc)
4397c478bd9Sstevel@tonic-gate {
4407c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4417c478bd9Sstevel@tonic-gate 		panic("page_get_pagesize: out of range %d", szc);
4427c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4437c478bd9Sstevel@tonic-gate }
4447c478bd9Sstevel@tonic-gate 
4457c478bd9Sstevel@tonic-gate /*
4467c478bd9Sstevel@tonic-gate  * Return the size of a page based upon the index passed in.  An index of
4477c478bd9Sstevel@tonic-gate  * zero refers to the smallest page size in the system, and as index increases
4487c478bd9Sstevel@tonic-gate  * it refers to the next larger supported page size in the system.
4497c478bd9Sstevel@tonic-gate  * Note that szc and userszc may not be the same due to unsupported szc's on
4507c478bd9Sstevel@tonic-gate  * some systems.
4517c478bd9Sstevel@tonic-gate  */
4527c478bd9Sstevel@tonic-gate size_t
4537c478bd9Sstevel@tonic-gate page_get_user_pagesize(uint_t userszc)
4547c478bd9Sstevel@tonic-gate {
4557c478bd9Sstevel@tonic-gate 	uint_t szc = USERSZC_2_SZC(userszc);
4567c478bd9Sstevel@tonic-gate 
4577c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4587c478bd9Sstevel@tonic-gate 		panic("page_get_user_pagesize: out of range %d", szc);
4597c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4607c478bd9Sstevel@tonic-gate }
4617c478bd9Sstevel@tonic-gate 
4627c478bd9Sstevel@tonic-gate uint_t
4637c478bd9Sstevel@tonic-gate page_get_shift(uint_t szc)
4647c478bd9Sstevel@tonic-gate {
4657c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4667c478bd9Sstevel@tonic-gate 		panic("page_get_shift: out of range %d", szc);
4677c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_shift);
4687c478bd9Sstevel@tonic-gate }
4697c478bd9Sstevel@tonic-gate 
4707c478bd9Sstevel@tonic-gate uint_t
4717c478bd9Sstevel@tonic-gate page_get_pagecolors(uint_t szc)
4727c478bd9Sstevel@tonic-gate {
4737c478bd9Sstevel@tonic-gate 	ASSERT(page_colors != 0);
4747c478bd9Sstevel@tonic-gate 	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
4757c478bd9Sstevel@tonic-gate }
4767c478bd9Sstevel@tonic-gate 
4777c478bd9Sstevel@tonic-gate /*
4787c478bd9Sstevel@tonic-gate  * Called by startup().
4797c478bd9Sstevel@tonic-gate  * Size up the per page size free list counters based on physmax
4807c478bd9Sstevel@tonic-gate  * of each node and max_mem_nodes.
4817c478bd9Sstevel@tonic-gate  */
4827c478bd9Sstevel@tonic-gate size_t
4837c478bd9Sstevel@tonic-gate page_ctrs_sz(void)
4847c478bd9Sstevel@tonic-gate {
4857c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
4867c478bd9Sstevel@tonic-gate 	int	mnode;
4877c478bd9Sstevel@tonic-gate 	uint_t	ctrs_sz = 0;
4887c478bd9Sstevel@tonic-gate 	int 	i;
4897c478bd9Sstevel@tonic-gate 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
4907c478bd9Sstevel@tonic-gate 
4917c478bd9Sstevel@tonic-gate 	/*
4927c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
4937c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
4947c478bd9Sstevel@tonic-gate 	 * arrays.
4957c478bd9Sstevel@tonic-gate 	 */
4967c478bd9Sstevel@tonic-gate 	colors_per_szc[0] = page_colors;
4977c478bd9Sstevel@tonic-gate 	for (i = 1; i < mmu_page_sizes; i++) {
4987c478bd9Sstevel@tonic-gate 		colors_per_szc[i] =
4997c478bd9Sstevel@tonic-gate 		    page_convert_color(0, i, page_colors - 1) + 1;
5007c478bd9Sstevel@tonic-gate 	}
5017c478bd9Sstevel@tonic-gate 
5027c478bd9Sstevel@tonic-gate 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
5037c478bd9Sstevel@tonic-gate 
5047c478bd9Sstevel@tonic-gate 		pgcnt_t r_pgcnt;
5057c478bd9Sstevel@tonic-gate 		pfn_t   r_base;
5067c478bd9Sstevel@tonic-gate 		pgcnt_t r_align;
5077c478bd9Sstevel@tonic-gate 
5087c478bd9Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
5097c478bd9Sstevel@tonic-gate 			continue;
5107c478bd9Sstevel@tonic-gate 
5117c478bd9Sstevel@tonic-gate 		/*
5127c478bd9Sstevel@tonic-gate 		 * determine size needed for page counter arrays with
5137c478bd9Sstevel@tonic-gate 		 * base aligned to large page size.
5147c478bd9Sstevel@tonic-gate 		 */
5157c478bd9Sstevel@tonic-gate 		for (r = 1; r < mmu_page_sizes; r++) {
5167c478bd9Sstevel@tonic-gate 			/* add in space for hpm_counters */
5177c478bd9Sstevel@tonic-gate 			r_align = page_get_pagecnt(r);
5187c478bd9Sstevel@tonic-gate 			r_base = mem_node_config[mnode].physbase;
5197c478bd9Sstevel@tonic-gate 			r_base &= ~(r_align - 1);
5207c478bd9Sstevel@tonic-gate 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
5216bb54764Skchow 			    r_base + 1, r_align);
5227c478bd9Sstevel@tonic-gate 			/*
5237c478bd9Sstevel@tonic-gate 			 * Round up to always allocate on pointer sized
5247c478bd9Sstevel@tonic-gate 			 * boundaries.
5257c478bd9Sstevel@tonic-gate 			 */
5267c478bd9Sstevel@tonic-gate 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
5277c478bd9Sstevel@tonic-gate 			    sizeof (hpmctr_t *));
5287c478bd9Sstevel@tonic-gate 
5297c478bd9Sstevel@tonic-gate 			/* add in space for hpm_color_current */
5307c478bd9Sstevel@tonic-gate 			ctrs_sz += (colors_per_szc[r] *
5317c478bd9Sstevel@tonic-gate 			    sizeof (size_t));
5327c478bd9Sstevel@tonic-gate 		}
5337c478bd9Sstevel@tonic-gate 	}
5347c478bd9Sstevel@tonic-gate 
5357c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
5367c478bd9Sstevel@tonic-gate 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
5377c478bd9Sstevel@tonic-gate 
5387c478bd9Sstevel@tonic-gate 		/* add in space for page_ctrs_cands */
5397c478bd9Sstevel@tonic-gate 		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
5407c478bd9Sstevel@tonic-gate 		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
5417c478bd9Sstevel@tonic-gate 		    sizeof (pgcnt_t);
5427c478bd9Sstevel@tonic-gate 	}
5437c478bd9Sstevel@tonic-gate 
5447c478bd9Sstevel@tonic-gate 	/* ctr_mutex */
5457c478bd9Sstevel@tonic-gate 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
5467c478bd9Sstevel@tonic-gate 
5477c478bd9Sstevel@tonic-gate 	/* size for page list counts */
5487c478bd9Sstevel@tonic-gate 	PLCNT_SZ(ctrs_sz);
5497c478bd9Sstevel@tonic-gate 
5507c478bd9Sstevel@tonic-gate 	/*
5517c478bd9Sstevel@tonic-gate 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
5527c478bd9Sstevel@tonic-gate 	 * address of the counters to ecache_alignsize boundary for every
5537c478bd9Sstevel@tonic-gate 	 * memory node.
5547c478bd9Sstevel@tonic-gate 	 */
5557c478bd9Sstevel@tonic-gate 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
5567c478bd9Sstevel@tonic-gate }
5577c478bd9Sstevel@tonic-gate 
5587c478bd9Sstevel@tonic-gate caddr_t
5597c478bd9Sstevel@tonic-gate page_ctrs_alloc(caddr_t alloc_base)
5607c478bd9Sstevel@tonic-gate {
5617c478bd9Sstevel@tonic-gate 	int	mnode;
5627c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
5637c478bd9Sstevel@tonic-gate 	int	i;
5647c478bd9Sstevel@tonic-gate 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
5657c478bd9Sstevel@tonic-gate 
5667c478bd9Sstevel@tonic-gate 	/*
5677c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
5687c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
5697c478bd9Sstevel@tonic-gate 	 * arrays.
5707c478bd9Sstevel@tonic-gate 	 */
5717c478bd9Sstevel@tonic-gate 	colors_per_szc[0] = page_colors;
5727c478bd9Sstevel@tonic-gate 	for (i = 1; i < mmu_page_sizes; i++) {
5737c478bd9Sstevel@tonic-gate 		colors_per_szc[i] =
5747c478bd9Sstevel@tonic-gate 		    page_convert_color(0, i, page_colors - 1) + 1;
5757c478bd9Sstevel@tonic-gate 	}
5767c478bd9Sstevel@tonic-gate 
5777c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
5787c478bd9Sstevel@tonic-gate 		page_counters[r] = (hw_page_map_t *)alloc_base;
5797c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
5807c478bd9Sstevel@tonic-gate 	}
5817c478bd9Sstevel@tonic-gate 
5827c478bd9Sstevel@tonic-gate 	/* page_ctrs_cands */
5837c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
5847c478bd9Sstevel@tonic-gate 		for (i = 0; i < NPC_MUTEX; i++) {
5857c478bd9Sstevel@tonic-gate 			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
5867c478bd9Sstevel@tonic-gate 			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
5877c478bd9Sstevel@tonic-gate 
5887c478bd9Sstevel@tonic-gate 		}
5897c478bd9Sstevel@tonic-gate 	}
5907c478bd9Sstevel@tonic-gate 
5917c478bd9Sstevel@tonic-gate 	/* page_ctrs_cands pcc_color_free array */
5927c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
5937c478bd9Sstevel@tonic-gate 		for (i = 0; i < NPC_MUTEX; i++) {
5947c478bd9Sstevel@tonic-gate 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
5957c478bd9Sstevel@tonic-gate 				page_ctrs_cands[i][r][mnode].pcc_color_free_len
5967c478bd9Sstevel@tonic-gate 				    = colors_per_szc[r];
5977c478bd9Sstevel@tonic-gate 				page_ctrs_cands[i][r][mnode].pcc_color_free =
5987c478bd9Sstevel@tonic-gate 				    (pgcnt_t *)alloc_base;
5997c478bd9Sstevel@tonic-gate 				alloc_base += colors_per_szc[r] *
6007c478bd9Sstevel@tonic-gate 				    sizeof (pgcnt_t);
6017c478bd9Sstevel@tonic-gate 			}
6027c478bd9Sstevel@tonic-gate 		}
6037c478bd9Sstevel@tonic-gate 	}
6047c478bd9Sstevel@tonic-gate 
6057c478bd9Sstevel@tonic-gate 	/* ctr_mutex */
6067c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
6077c478bd9Sstevel@tonic-gate 		ctr_mutex[i] = (kmutex_t *)alloc_base;
6087c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
6097c478bd9Sstevel@tonic-gate 	}
6107c478bd9Sstevel@tonic-gate 
6117c478bd9Sstevel@tonic-gate 	/* initialize page list counts */
6127c478bd9Sstevel@tonic-gate 	PLCNT_INIT(alloc_base);
6137c478bd9Sstevel@tonic-gate 
6147c478bd9Sstevel@tonic-gate 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
6157c478bd9Sstevel@tonic-gate 
6167c478bd9Sstevel@tonic-gate 		pgcnt_t r_pgcnt;
6177c478bd9Sstevel@tonic-gate 		pfn_t	r_base;
6187c478bd9Sstevel@tonic-gate 		pgcnt_t r_align;
6197c478bd9Sstevel@tonic-gate 		int	r_shift;
6207c478bd9Sstevel@tonic-gate 
6217c478bd9Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
6227c478bd9Sstevel@tonic-gate 			continue;
6237c478bd9Sstevel@tonic-gate 
6247c478bd9Sstevel@tonic-gate 		for (r = 1; r < mmu_page_sizes; r++) {
6257c478bd9Sstevel@tonic-gate 			/*
6267c478bd9Sstevel@tonic-gate 			 * the page_counters base has to be aligned to the
6277c478bd9Sstevel@tonic-gate 			 * page count of page size code r otherwise the counts
6287c478bd9Sstevel@tonic-gate 			 * will cross large page boundaries.
6297c478bd9Sstevel@tonic-gate 			 */
6307c478bd9Sstevel@tonic-gate 			r_align = page_get_pagecnt(r);
6317c478bd9Sstevel@tonic-gate 			r_base = mem_node_config[mnode].physbase;
6327c478bd9Sstevel@tonic-gate 			/* base needs to be aligned - lower to aligned value */
6337c478bd9Sstevel@tonic-gate 			r_base &= ~(r_align - 1);
6347c478bd9Sstevel@tonic-gate 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
6356bb54764Skchow 			    r_base + 1, r_align);
6367c478bd9Sstevel@tonic-gate 			r_shift = PAGE_BSZS_SHIFT(r);
6377c478bd9Sstevel@tonic-gate 
6387c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
6397c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
6407c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
6417c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
6427c478bd9Sstevel@tonic-gate 			    colors_per_szc[r];
6437c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
6447c478bd9Sstevel@tonic-gate 			    (size_t *)alloc_base;
6457c478bd9Sstevel@tonic-gate 			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
6467c478bd9Sstevel@tonic-gate 			for (i = 0; i < colors_per_szc[r]; i++) {
6477c478bd9Sstevel@tonic-gate 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
6487c478bd9Sstevel@tonic-gate 			}
6497c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_COUNTERS(mnode, r) =
6507c478bd9Sstevel@tonic-gate 			    (hpmctr_t *)alloc_base;
6517c478bd9Sstevel@tonic-gate 			/*
6527c478bd9Sstevel@tonic-gate 			 * Round up to make alloc_base always be aligned on
6537c478bd9Sstevel@tonic-gate 			 * a pointer boundary.
6547c478bd9Sstevel@tonic-gate 			 */
6557c478bd9Sstevel@tonic-gate 			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
6567c478bd9Sstevel@tonic-gate 			    sizeof (hpmctr_t *));
6577c478bd9Sstevel@tonic-gate 
6587c478bd9Sstevel@tonic-gate 			/*
6597c478bd9Sstevel@tonic-gate 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
6607c478bd9Sstevel@tonic-gate 			 * satisfy the identity requirement.
6617c478bd9Sstevel@tonic-gate 			 * We should be able to go from one to the other
6627c478bd9Sstevel@tonic-gate 			 * and get consistent values.
6637c478bd9Sstevel@tonic-gate 			 */
6647c478bd9Sstevel@tonic-gate 			ASSERT(PNUM_TO_IDX(mnode, r,
6657c478bd9Sstevel@tonic-gate 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
6667c478bd9Sstevel@tonic-gate 			ASSERT(IDX_TO_PNUM(mnode, r,
6677c478bd9Sstevel@tonic-gate 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
6687c478bd9Sstevel@tonic-gate 		}
6697c478bd9Sstevel@tonic-gate 		/*
6707c478bd9Sstevel@tonic-gate 		 * Roundup the start address of the page_counters to
6717c478bd9Sstevel@tonic-gate 		 * cache aligned boundary for every memory node.
6727c478bd9Sstevel@tonic-gate 		 * page_ctrs_sz() has added some slop for these roundups.
6737c478bd9Sstevel@tonic-gate 		 */
6747c478bd9Sstevel@tonic-gate 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
6757c478bd9Sstevel@tonic-gate 			L2CACHE_ALIGN);
6767c478bd9Sstevel@tonic-gate 	}
6777c478bd9Sstevel@tonic-gate 
6787c478bd9Sstevel@tonic-gate 	/* Initialize other page counter specific data structures. */
6797c478bd9Sstevel@tonic-gate 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
6807c478bd9Sstevel@tonic-gate 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
6817c478bd9Sstevel@tonic-gate 	}
6827c478bd9Sstevel@tonic-gate 
6837c478bd9Sstevel@tonic-gate 	return (alloc_base);
6847c478bd9Sstevel@tonic-gate }
6857c478bd9Sstevel@tonic-gate 
6867c478bd9Sstevel@tonic-gate /*
6877c478bd9Sstevel@tonic-gate  * Functions to adjust region counters for each size free list.
6887c478bd9Sstevel@tonic-gate  * Caller is responsible to acquire the ctr_mutex lock if necessary and
6897c478bd9Sstevel@tonic-gate  * thus can be called during startup without locks.
6907c478bd9Sstevel@tonic-gate  */
6917c478bd9Sstevel@tonic-gate /* ARGSUSED */
6927c478bd9Sstevel@tonic-gate void
693affbd3ccSkchow page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
6947c478bd9Sstevel@tonic-gate {
6957c478bd9Sstevel@tonic-gate 	ssize_t		r;	/* region size */
6967c478bd9Sstevel@tonic-gate 	ssize_t		idx;
6977c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
6987c478bd9Sstevel@tonic-gate 	int		lckidx;
6997c478bd9Sstevel@tonic-gate 
700affbd3ccSkchow 	ASSERT(mnode == PP_2_MEM_NODE(pp));
701affbd3ccSkchow 	ASSERT(mtype == PP_2_MTYPE(pp));
702affbd3ccSkchow 
7037c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc < mmu_page_sizes);
7047c478bd9Sstevel@tonic-gate 
705affbd3ccSkchow 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
7067c478bd9Sstevel@tonic-gate 
7077c478bd9Sstevel@tonic-gate 	/* no counter update needed for largest page size */
7087c478bd9Sstevel@tonic-gate 	if (pp->p_szc >= mmu_page_sizes - 1) {
7097c478bd9Sstevel@tonic-gate 		return;
7107c478bd9Sstevel@tonic-gate 	}
7117c478bd9Sstevel@tonic-gate 
7127c478bd9Sstevel@tonic-gate 	r = pp->p_szc + 1;
7137c478bd9Sstevel@tonic-gate 	pfnum = pp->p_pagenum;
7147c478bd9Sstevel@tonic-gate 	lckidx = PP_CTR_LOCK_INDX(pp);
7157c478bd9Sstevel@tonic-gate 
7167c478bd9Sstevel@tonic-gate 	/*
7177c478bd9Sstevel@tonic-gate 	 * Increment the count of free pages for the current
7187c478bd9Sstevel@tonic-gate 	 * region. Continue looping up in region size incrementing
7197c478bd9Sstevel@tonic-gate 	 * count if the preceeding region is full.
7207c478bd9Sstevel@tonic-gate 	 */
7217c478bd9Sstevel@tonic-gate 	while (r < mmu_page_sizes) {
7227c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pfnum);
7237c478bd9Sstevel@tonic-gate 
7247c478bd9Sstevel@tonic-gate 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
7257c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
7267c478bd9Sstevel@tonic-gate 
7277c478bd9Sstevel@tonic-gate 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
7287c478bd9Sstevel@tonic-gate 			break;
7297c478bd9Sstevel@tonic-gate 
7307c478bd9Sstevel@tonic-gate 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
7317c478bd9Sstevel@tonic-gate 		page_ctrs_cands[lckidx][r][mnode].
7327c478bd9Sstevel@tonic-gate 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
7337c478bd9Sstevel@tonic-gate 		r++;
7347c478bd9Sstevel@tonic-gate 	}
7357c478bd9Sstevel@tonic-gate }
7367c478bd9Sstevel@tonic-gate 
7377c478bd9Sstevel@tonic-gate void
738affbd3ccSkchow page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
7397c478bd9Sstevel@tonic-gate {
7407c478bd9Sstevel@tonic-gate 	int		lckidx = PP_CTR_LOCK_INDX(pp);
7417c478bd9Sstevel@tonic-gate 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
7427c478bd9Sstevel@tonic-gate 
7437c478bd9Sstevel@tonic-gate 	mutex_enter(lock);
744affbd3ccSkchow 	page_ctr_add_internal(mnode, mtype, pp, flags);
7457c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
7467c478bd9Sstevel@tonic-gate }
7477c478bd9Sstevel@tonic-gate 
7487c478bd9Sstevel@tonic-gate void
749affbd3ccSkchow page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
7507c478bd9Sstevel@tonic-gate {
7517c478bd9Sstevel@tonic-gate 	int		lckidx;
7527c478bd9Sstevel@tonic-gate 	kmutex_t	*lock;
7537c478bd9Sstevel@tonic-gate 	ssize_t		r;	/* region size */
7547c478bd9Sstevel@tonic-gate 	ssize_t		idx;
7557c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
7567c478bd9Sstevel@tonic-gate 
757affbd3ccSkchow 	ASSERT(mnode == PP_2_MEM_NODE(pp));
758affbd3ccSkchow 	ASSERT(mtype == PP_2_MTYPE(pp));
759affbd3ccSkchow 
7607c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc < mmu_page_sizes);
7617c478bd9Sstevel@tonic-gate 
762affbd3ccSkchow 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
7637c478bd9Sstevel@tonic-gate 
7647c478bd9Sstevel@tonic-gate 	/* no counter update needed for largest page size */
7657c478bd9Sstevel@tonic-gate 	if (pp->p_szc >= mmu_page_sizes - 1) {
7667c478bd9Sstevel@tonic-gate 		return;
7677c478bd9Sstevel@tonic-gate 	}
7687c478bd9Sstevel@tonic-gate 
7697c478bd9Sstevel@tonic-gate 	r = pp->p_szc + 1;
7707c478bd9Sstevel@tonic-gate 	pfnum = pp->p_pagenum;
7717c478bd9Sstevel@tonic-gate 	lckidx = PP_CTR_LOCK_INDX(pp);
7727c478bd9Sstevel@tonic-gate 	lock = &ctr_mutex[lckidx][mnode];
7737c478bd9Sstevel@tonic-gate 
7747c478bd9Sstevel@tonic-gate 	/*
7757c478bd9Sstevel@tonic-gate 	 * Decrement the count of free pages for the current
7767c478bd9Sstevel@tonic-gate 	 * region. Continue looping up in region size decrementing
7777c478bd9Sstevel@tonic-gate 	 * count if the preceeding region was full.
7787c478bd9Sstevel@tonic-gate 	 */
7797c478bd9Sstevel@tonic-gate 	mutex_enter(lock);
7807c478bd9Sstevel@tonic-gate 	while (r < mmu_page_sizes) {
7817c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pfnum);
7827c478bd9Sstevel@tonic-gate 
7837c478bd9Sstevel@tonic-gate 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
7847c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
7857c478bd9Sstevel@tonic-gate 
7867c478bd9Sstevel@tonic-gate 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
7877c478bd9Sstevel@tonic-gate 			break;
7887c478bd9Sstevel@tonic-gate 		}
7897c478bd9Sstevel@tonic-gate 		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
7907c478bd9Sstevel@tonic-gate 		ASSERT(page_ctrs_cands[lckidx][r][mnode].
7917c478bd9Sstevel@tonic-gate 		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
7927c478bd9Sstevel@tonic-gate 
7937c478bd9Sstevel@tonic-gate 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
7947c478bd9Sstevel@tonic-gate 		page_ctrs_cands[lckidx][r][mnode].
7957c478bd9Sstevel@tonic-gate 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
7967c478bd9Sstevel@tonic-gate 		r++;
7977c478bd9Sstevel@tonic-gate 	}
7987c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
7997c478bd9Sstevel@tonic-gate }
8007c478bd9Sstevel@tonic-gate 
8017c478bd9Sstevel@tonic-gate /*
8027c478bd9Sstevel@tonic-gate  * Adjust page counters following a memory attach, since typically the
8037c478bd9Sstevel@tonic-gate  * size of the array needs to change, and the PFN to counter index
8047c478bd9Sstevel@tonic-gate  * mapping needs to change.
8057c478bd9Sstevel@tonic-gate  */
8067c478bd9Sstevel@tonic-gate uint_t
8077c478bd9Sstevel@tonic-gate page_ctrs_adjust(int mnode)
8087c478bd9Sstevel@tonic-gate {
8097c478bd9Sstevel@tonic-gate 	pgcnt_t npgs;
8107c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
8117c478bd9Sstevel@tonic-gate 	int	i;
8127c478bd9Sstevel@tonic-gate 	size_t	pcsz, old_csz;
8137c478bd9Sstevel@tonic-gate 	hpmctr_t *new_ctr, *old_ctr;
8147c478bd9Sstevel@tonic-gate 	pfn_t	oldbase, newbase;
8157c478bd9Sstevel@tonic-gate 	size_t	old_npgs;
8167c478bd9Sstevel@tonic-gate 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
8177c478bd9Sstevel@tonic-gate 	size_t	size_cache[MMU_PAGE_SIZES];
8187c478bd9Sstevel@tonic-gate 	size_t	*color_cache[MMU_PAGE_SIZES];
8197c478bd9Sstevel@tonic-gate 	size_t	*old_color_array;
8207c478bd9Sstevel@tonic-gate 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
8217c478bd9Sstevel@tonic-gate 
8227c478bd9Sstevel@tonic-gate 	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
8237c478bd9Sstevel@tonic-gate 	npgs = roundup(mem_node_config[mnode].physmax,
8247c478bd9Sstevel@tonic-gate 	    PC_BASE_ALIGN) - newbase;
8257c478bd9Sstevel@tonic-gate 
8267c478bd9Sstevel@tonic-gate 	/*
8277c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
8287c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
8297c478bd9Sstevel@tonic-gate 	 * arrays.
8307c478bd9Sstevel@tonic-gate 	 */
8317c478bd9Sstevel@tonic-gate 	colors_per_szc[0] = page_colors;
8327c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
8337c478bd9Sstevel@tonic-gate 		colors_per_szc[r] =
8347c478bd9Sstevel@tonic-gate 		    page_convert_color(0, r, page_colors - 1) + 1;
8357c478bd9Sstevel@tonic-gate 	}
8367c478bd9Sstevel@tonic-gate 
8377c478bd9Sstevel@tonic-gate 	/*
8387c478bd9Sstevel@tonic-gate 	 * Preallocate all of the new hpm_counters arrays as we can't
8397c478bd9Sstevel@tonic-gate 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
8407c478bd9Sstevel@tonic-gate 	 * If we can't allocate all of the arrays, undo our work so far
8417c478bd9Sstevel@tonic-gate 	 * and return failure.
8427c478bd9Sstevel@tonic-gate 	 */
8437c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
8447c478bd9Sstevel@tonic-gate 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
8457c478bd9Sstevel@tonic-gate 
8467c478bd9Sstevel@tonic-gate 		ctr_cache[r] = kmem_zalloc(pcsz *
8477c478bd9Sstevel@tonic-gate 		    sizeof (hpmctr_t), KM_NOSLEEP);
8487c478bd9Sstevel@tonic-gate 		if (ctr_cache[r] == NULL) {
8497c478bd9Sstevel@tonic-gate 			while (--r >= 1) {
8507c478bd9Sstevel@tonic-gate 				kmem_free(ctr_cache[r],
8517c478bd9Sstevel@tonic-gate 				    size_cache[r] * sizeof (hpmctr_t));
8527c478bd9Sstevel@tonic-gate 			}
8537c478bd9Sstevel@tonic-gate 			return (ENOMEM);
8547c478bd9Sstevel@tonic-gate 		}
8557c478bd9Sstevel@tonic-gate 		size_cache[r] = pcsz;
8567c478bd9Sstevel@tonic-gate 	}
8577c478bd9Sstevel@tonic-gate 	/*
8587c478bd9Sstevel@tonic-gate 	 * Preallocate all of the new color current arrays as we can't
8597c478bd9Sstevel@tonic-gate 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
8607c478bd9Sstevel@tonic-gate 	 * If we can't allocate all of the arrays, undo our work so far
8617c478bd9Sstevel@tonic-gate 	 * and return failure.
8627c478bd9Sstevel@tonic-gate 	 */
8637c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
8647c478bd9Sstevel@tonic-gate 		color_cache[r] = kmem_zalloc(sizeof (size_t) *
8657c478bd9Sstevel@tonic-gate 		    colors_per_szc[r], KM_NOSLEEP);
8667c478bd9Sstevel@tonic-gate 		if (color_cache[r] == NULL) {
8677c478bd9Sstevel@tonic-gate 			while (--r >= 1) {
8687c478bd9Sstevel@tonic-gate 				kmem_free(color_cache[r],
8697c478bd9Sstevel@tonic-gate 				    colors_per_szc[r] * sizeof (size_t));
8707c478bd9Sstevel@tonic-gate 			}
8717c478bd9Sstevel@tonic-gate 			for (r = 1; r < mmu_page_sizes; r++) {
8727c478bd9Sstevel@tonic-gate 				kmem_free(ctr_cache[r],
8737c478bd9Sstevel@tonic-gate 				    size_cache[r] * sizeof (hpmctr_t));
8747c478bd9Sstevel@tonic-gate 			}
8757c478bd9Sstevel@tonic-gate 			return (ENOMEM);
8767c478bd9Sstevel@tonic-gate 		}
8777c478bd9Sstevel@tonic-gate 	}
8787c478bd9Sstevel@tonic-gate 
8797c478bd9Sstevel@tonic-gate 	/*
8807c478bd9Sstevel@tonic-gate 	 * Grab the write lock to prevent others from walking these arrays
8817c478bd9Sstevel@tonic-gate 	 * while we are modifying them.
8827c478bd9Sstevel@tonic-gate 	 */
8837c478bd9Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
8847c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
8857c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
8867c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
8877c478bd9Sstevel@tonic-gate 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
8887c478bd9Sstevel@tonic-gate 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
8897c478bd9Sstevel@tonic-gate 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
8907c478bd9Sstevel@tonic-gate 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
8917c478bd9Sstevel@tonic-gate 		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
8927c478bd9Sstevel@tonic-gate 
8937c478bd9Sstevel@tonic-gate 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
8947c478bd9Sstevel@tonic-gate 		new_ctr = ctr_cache[r];
8957c478bd9Sstevel@tonic-gate 		ctr_cache[r] = NULL;
8967c478bd9Sstevel@tonic-gate 		if (old_ctr != NULL &&
8977c478bd9Sstevel@tonic-gate 		    (oldbase + old_npgs > newbase) &&
8987c478bd9Sstevel@tonic-gate 		    (newbase + npgs > oldbase)) {
8997c478bd9Sstevel@tonic-gate 			/*
9007c478bd9Sstevel@tonic-gate 			 * Map the intersection of the old and new
9017c478bd9Sstevel@tonic-gate 			 * counters into the new array.
9027c478bd9Sstevel@tonic-gate 			 */
9037c478bd9Sstevel@tonic-gate 			size_t offset;
9047c478bd9Sstevel@tonic-gate 			if (newbase > oldbase) {
9057c478bd9Sstevel@tonic-gate 				offset = (newbase - oldbase) >>
9067c478bd9Sstevel@tonic-gate 				    PAGE_COUNTERS_SHIFT(mnode, r);
9077c478bd9Sstevel@tonic-gate 				bcopy(old_ctr + offset, new_ctr,
9087c478bd9Sstevel@tonic-gate 				    MIN(pcsz, (old_csz - offset)) *
9097c478bd9Sstevel@tonic-gate 				    sizeof (hpmctr_t));
9107c478bd9Sstevel@tonic-gate 			} else {
9117c478bd9Sstevel@tonic-gate 				offset = (oldbase - newbase) >>
9127c478bd9Sstevel@tonic-gate 				    PAGE_COUNTERS_SHIFT(mnode, r);
9137c478bd9Sstevel@tonic-gate 				bcopy(old_ctr, new_ctr + offset,
9147c478bd9Sstevel@tonic-gate 				    MIN(pcsz - offset, old_csz) *
9157c478bd9Sstevel@tonic-gate 				    sizeof (hpmctr_t));
9167c478bd9Sstevel@tonic-gate 			}
9177c478bd9Sstevel@tonic-gate 		}
9187c478bd9Sstevel@tonic-gate 
9197c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
9207c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
9217c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
9227c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
9237c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
9247c478bd9Sstevel@tonic-gate 		color_cache[r] = NULL;
9257c478bd9Sstevel@tonic-gate 		/*
9267c478bd9Sstevel@tonic-gate 		 * for now, just reset on these events as it's probably
9277c478bd9Sstevel@tonic-gate 		 * not worthwhile to try and optimize this.
9287c478bd9Sstevel@tonic-gate 		 */
9297c478bd9Sstevel@tonic-gate 		for (i = 0; i < colors_per_szc[r]; i++) {
9307c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
9317c478bd9Sstevel@tonic-gate 		}
9327c478bd9Sstevel@tonic-gate 
9337c478bd9Sstevel@tonic-gate 		/* cache info for freeing out of the critical path */
9347c478bd9Sstevel@tonic-gate 		if ((caddr_t)old_ctr >= kernelheap &&
9357c478bd9Sstevel@tonic-gate 		    (caddr_t)old_ctr < ekernelheap) {
9367c478bd9Sstevel@tonic-gate 			ctr_cache[r] = old_ctr;
9377c478bd9Sstevel@tonic-gate 			size_cache[r] = old_csz;
9387c478bd9Sstevel@tonic-gate 		}
9397c478bd9Sstevel@tonic-gate 		if ((caddr_t)old_color_array >= kernelheap &&
9407c478bd9Sstevel@tonic-gate 		    (caddr_t)old_color_array < ekernelheap) {
9417c478bd9Sstevel@tonic-gate 			color_cache[r] = old_color_array;
9427c478bd9Sstevel@tonic-gate 		}
9437c478bd9Sstevel@tonic-gate 		/*
9447c478bd9Sstevel@tonic-gate 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
9457c478bd9Sstevel@tonic-gate 		 * satisfy the identity requirement.
9467c478bd9Sstevel@tonic-gate 		 * We should be able to go from one to the other
9477c478bd9Sstevel@tonic-gate 		 * and get consistent values.
9487c478bd9Sstevel@tonic-gate 		 */
9497c478bd9Sstevel@tonic-gate 		ASSERT(PNUM_TO_IDX(mnode, r,
9507c478bd9Sstevel@tonic-gate 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
9517c478bd9Sstevel@tonic-gate 		ASSERT(IDX_TO_PNUM(mnode, r,
9527c478bd9Sstevel@tonic-gate 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
9537c478bd9Sstevel@tonic-gate 	}
9547c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
9557c478bd9Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
9567c478bd9Sstevel@tonic-gate 
9577c478bd9Sstevel@tonic-gate 	/*
9587c478bd9Sstevel@tonic-gate 	 * Now that we have dropped the write lock, it is safe to free all
9597c478bd9Sstevel@tonic-gate 	 * of the memory we have cached above.
9607c478bd9Sstevel@tonic-gate 	 */
9617c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
9627c478bd9Sstevel@tonic-gate 		if (ctr_cache[r] != NULL) {
9637c478bd9Sstevel@tonic-gate 			kmem_free(ctr_cache[r],
9647c478bd9Sstevel@tonic-gate 			    size_cache[r] * sizeof (hpmctr_t));
9657c478bd9Sstevel@tonic-gate 		}
9667c478bd9Sstevel@tonic-gate 		if (color_cache[r] != NULL) {
9677c478bd9Sstevel@tonic-gate 			kmem_free(color_cache[r],
9687c478bd9Sstevel@tonic-gate 			    colors_per_szc[r] * sizeof (size_t));
9697c478bd9Sstevel@tonic-gate 		}
9707c478bd9Sstevel@tonic-gate 	}
9717c478bd9Sstevel@tonic-gate 	return (0);
9727c478bd9Sstevel@tonic-gate }
9737c478bd9Sstevel@tonic-gate 
9747c478bd9Sstevel@tonic-gate /*
9757c478bd9Sstevel@tonic-gate  * color contains a valid color index or bin for cur_szc
9767c478bd9Sstevel@tonic-gate  */
9777c478bd9Sstevel@tonic-gate uint_t
9787c478bd9Sstevel@tonic-gate page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
9797c478bd9Sstevel@tonic-gate {
9807c478bd9Sstevel@tonic-gate 	uint_t shift;
9817c478bd9Sstevel@tonic-gate 
9827c478bd9Sstevel@tonic-gate 	if (cur_szc > new_szc) {
9837c478bd9Sstevel@tonic-gate 		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
9847c478bd9Sstevel@tonic-gate 		return (color << shift);
9857c478bd9Sstevel@tonic-gate 	} else if (cur_szc < new_szc) {
9867c478bd9Sstevel@tonic-gate 		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
9877c478bd9Sstevel@tonic-gate 		return (color >> shift);
9887c478bd9Sstevel@tonic-gate 	}
9897c478bd9Sstevel@tonic-gate 	return (color);
9907c478bd9Sstevel@tonic-gate }
9917c478bd9Sstevel@tonic-gate 
9927c478bd9Sstevel@tonic-gate #ifdef DEBUG
9937c478bd9Sstevel@tonic-gate 
9947c478bd9Sstevel@tonic-gate /*
9957c478bd9Sstevel@tonic-gate  * confirm pp is a large page corresponding to szc
9967c478bd9Sstevel@tonic-gate  */
9977c478bd9Sstevel@tonic-gate void
9987c478bd9Sstevel@tonic-gate chk_lpg(page_t *pp, uchar_t szc)
9997c478bd9Sstevel@tonic-gate {
10007c478bd9Sstevel@tonic-gate 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
10017c478bd9Sstevel@tonic-gate 	uint_t noreloc;
10027c478bd9Sstevel@tonic-gate 
10037c478bd9Sstevel@tonic-gate 	if (npgs == 1) {
10047c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
10057c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_next == pp);
10067c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_prev == pp);
10077c478bd9Sstevel@tonic-gate 		return;
10087c478bd9Sstevel@tonic-gate 	}
10097c478bd9Sstevel@tonic-gate 
10107c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
10117c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
10127c478bd9Sstevel@tonic-gate 
10137c478bd9Sstevel@tonic-gate 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
10147c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
10157c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
10167c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
10177c478bd9Sstevel@tonic-gate 
10187c478bd9Sstevel@tonic-gate 	/*
10197c478bd9Sstevel@tonic-gate 	 * Check list of pages.
10207c478bd9Sstevel@tonic-gate 	 */
10217c478bd9Sstevel@tonic-gate 	noreloc = PP_ISNORELOC(pp);
10227c478bd9Sstevel@tonic-gate 	while (npgs--) {
10237c478bd9Sstevel@tonic-gate 		if (npgs != 0) {
10247c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
10257c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_next == (pp + 1));
10267c478bd9Sstevel@tonic-gate 		}
10277c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == szc);
10287c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
10297c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
10307c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
10317c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
10327c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_vnode  == NULL);
10337c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISNORELOC(pp) == noreloc);
10347c478bd9Sstevel@tonic-gate 
10357c478bd9Sstevel@tonic-gate 		pp = pp->p_next;
10367c478bd9Sstevel@tonic-gate 	}
10377c478bd9Sstevel@tonic-gate }
10387c478bd9Sstevel@tonic-gate #endif /* DEBUG */
10397c478bd9Sstevel@tonic-gate 
10407c478bd9Sstevel@tonic-gate void
10417c478bd9Sstevel@tonic-gate page_freelist_lock(int mnode)
10427c478bd9Sstevel@tonic-gate {
10437c478bd9Sstevel@tonic-gate 	int i;
10447c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
10457c478bd9Sstevel@tonic-gate 		mutex_enter(FPC_MUTEX(mnode, i));
10467c478bd9Sstevel@tonic-gate 		mutex_enter(CPC_MUTEX(mnode, i));
10477c478bd9Sstevel@tonic-gate 	}
10487c478bd9Sstevel@tonic-gate }
10497c478bd9Sstevel@tonic-gate 
10507c478bd9Sstevel@tonic-gate void
10517c478bd9Sstevel@tonic-gate page_freelist_unlock(int mnode)
10527c478bd9Sstevel@tonic-gate {
10537c478bd9Sstevel@tonic-gate 	int i;
10547c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
10557c478bd9Sstevel@tonic-gate 		mutex_exit(FPC_MUTEX(mnode, i));
10567c478bd9Sstevel@tonic-gate 		mutex_exit(CPC_MUTEX(mnode, i));
10577c478bd9Sstevel@tonic-gate 	}
10587c478bd9Sstevel@tonic-gate }
10597c478bd9Sstevel@tonic-gate 
10607c478bd9Sstevel@tonic-gate /*
10617c478bd9Sstevel@tonic-gate  * add pp to the specified page list. Defaults to head of the page list
10627c478bd9Sstevel@tonic-gate  * unless PG_LIST_TAIL is specified.
10637c478bd9Sstevel@tonic-gate  */
10647c478bd9Sstevel@tonic-gate void
10657c478bd9Sstevel@tonic-gate page_list_add(page_t *pp, int flags)
10667c478bd9Sstevel@tonic-gate {
10677c478bd9Sstevel@tonic-gate 	page_t		**ppp;
10687c478bd9Sstevel@tonic-gate 	kmutex_t	*pcm;
10697c478bd9Sstevel@tonic-gate 	uint_t		bin, mtype;
10707c478bd9Sstevel@tonic-gate 	int		mnode;
10717c478bd9Sstevel@tonic-gate 
10727c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
10737c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
10747c478bd9Sstevel@tonic-gate 	ASSERT(!hat_page_is_mapped(pp));
10757c478bd9Sstevel@tonic-gate 	ASSERT(hat_page_getshare(pp) == 0);
10767c478bd9Sstevel@tonic-gate 
10777c478bd9Sstevel@tonic-gate 	/*
10787c478bd9Sstevel@tonic-gate 	 * Large pages should be freed via page_list_add_pages().
10797c478bd9Sstevel@tonic-gate 	 */
10807c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
10817c478bd9Sstevel@tonic-gate 
10827c478bd9Sstevel@tonic-gate 	/*
10837c478bd9Sstevel@tonic-gate 	 * Don't need to lock the freelist first here
10847c478bd9Sstevel@tonic-gate 	 * because the page isn't on the freelist yet.
10857c478bd9Sstevel@tonic-gate 	 * This means p_szc can't change on us.
10867c478bd9Sstevel@tonic-gate 	 */
10877c478bd9Sstevel@tonic-gate 
10887c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
10897c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
10907c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
10917c478bd9Sstevel@tonic-gate 
10927c478bd9Sstevel@tonic-gate 	if (flags & PG_LIST_ISINIT) {
10937c478bd9Sstevel@tonic-gate 		/*
10947c478bd9Sstevel@tonic-gate 		 * PG_LIST_ISINIT is set during system startup (ie. single
10957c478bd9Sstevel@tonic-gate 		 * threaded), add a page to the free list and add to the
10967c478bd9Sstevel@tonic-gate 		 * the free region counters w/o any locking
10977c478bd9Sstevel@tonic-gate 		 */
10987c478bd9Sstevel@tonic-gate 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
10997c478bd9Sstevel@tonic-gate 
11007c478bd9Sstevel@tonic-gate 		/* inline version of page_add() */
11017c478bd9Sstevel@tonic-gate 		if (*ppp != NULL) {
11027c478bd9Sstevel@tonic-gate 			pp->p_next = *ppp;
11037c478bd9Sstevel@tonic-gate 			pp->p_prev = (*ppp)->p_prev;
11047c478bd9Sstevel@tonic-gate 			(*ppp)->p_prev = pp;
11057c478bd9Sstevel@tonic-gate 			pp->p_prev->p_next = pp;
11067c478bd9Sstevel@tonic-gate 		} else
11077c478bd9Sstevel@tonic-gate 			*ppp = pp;
11087c478bd9Sstevel@tonic-gate 
1109affbd3ccSkchow 		page_ctr_add_internal(mnode, mtype, pp, flags);
1110affbd3ccSkchow 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
11117c478bd9Sstevel@tonic-gate 	} else {
11127c478bd9Sstevel@tonic-gate 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
11137c478bd9Sstevel@tonic-gate 
11147c478bd9Sstevel@tonic-gate 		if (flags & PG_FREE_LIST) {
1115affbd3ccSkchow 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
11167c478bd9Sstevel@tonic-gate 			ASSERT(PP_ISAGED(pp));
11177c478bd9Sstevel@tonic-gate 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
11187c478bd9Sstevel@tonic-gate 
11197c478bd9Sstevel@tonic-gate 		} else {
1120affbd3ccSkchow 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
11217c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_vnode);
11227c478bd9Sstevel@tonic-gate 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
11237c478bd9Sstevel@tonic-gate 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
11247c478bd9Sstevel@tonic-gate 		}
11257c478bd9Sstevel@tonic-gate 		mutex_enter(pcm);
11267c478bd9Sstevel@tonic-gate 		page_add(ppp, pp);
11277c478bd9Sstevel@tonic-gate 
11287c478bd9Sstevel@tonic-gate 		if (flags & PG_LIST_TAIL)
11297c478bd9Sstevel@tonic-gate 			*ppp = (*ppp)->p_next;
11307c478bd9Sstevel@tonic-gate 		/*
11317c478bd9Sstevel@tonic-gate 		 * Add counters before releasing pcm mutex to avoid a race with
11327c478bd9Sstevel@tonic-gate 		 * page_freelist_coalesce and page_freelist_fill.
11337c478bd9Sstevel@tonic-gate 		 */
1134affbd3ccSkchow 		page_ctr_add(mnode, mtype, pp, flags);
11357c478bd9Sstevel@tonic-gate 		mutex_exit(pcm);
11367c478bd9Sstevel@tonic-gate 	}
11377c478bd9Sstevel@tonic-gate 
11387c478bd9Sstevel@tonic-gate 
11397c478bd9Sstevel@tonic-gate #if defined(__sparc)
11407c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
11417c478bd9Sstevel@tonic-gate 		kcage_freemem_add(1);
11427c478bd9Sstevel@tonic-gate 	}
11437c478bd9Sstevel@tonic-gate #endif
11447c478bd9Sstevel@tonic-gate 	/*
11457c478bd9Sstevel@tonic-gate 	 * It is up to the caller to unlock the page!
11467c478bd9Sstevel@tonic-gate 	 */
11477c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
11487c478bd9Sstevel@tonic-gate }
11497c478bd9Sstevel@tonic-gate 
11507c478bd9Sstevel@tonic-gate 
11517c478bd9Sstevel@tonic-gate #ifdef __sparc
11527c478bd9Sstevel@tonic-gate /*
11537c478bd9Sstevel@tonic-gate  * This routine is only used by kcage_init during system startup.
11547c478bd9Sstevel@tonic-gate  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
11557c478bd9Sstevel@tonic-gate  * without the overhead of taking locks and updating counters.
11567c478bd9Sstevel@tonic-gate  */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 * (inline list removal; startup is single threaded, so no
	 * list locks are taken here)
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/* account for the page leaving the relocatable list */
	/* LINTED */
	PLCNT_DECR(pp, mnode, mtype, 0, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	/* setting P_NORELOC changes the page's mtype; recompute it */
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 * (inline insertion at the head of the circular list)
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* account for the page joining the no-relocate list */
	/* LINTED */
	PLCNT_INCR(pp, mnode, mtype, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
12457c478bd9Sstevel@tonic-gate #else	/* __sparc */
12467c478bd9Sstevel@tonic-gate 
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	/*
	 * The kernel cage exists only on sparc; reaching this stub on
	 * any other platform indicates a programming error.
	 */
	panic("page_list_noreloc_startup: should be here only for sparc");
}
12537c478bd9Sstevel@tonic-gate #endif
12547c478bd9Sstevel@tonic-gate 
/*
 * Add a free page (possibly a large page) to the head of the freelist.
 * PG_CACHE_LIST and PG_LIST_TAIL are not supported by this routine.
 * With PG_LIST_ISINIT (single-threaded startup) the page must be of the
 * maximum size and is added without locking; otherwise the bin mutex is
 * taken, the counters are updated under it, and each constituent page's
 * exclusive lock is released before returning.
 */
void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/* startup is single threaded; no locking required */
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		/* update counters while still holding pcm */
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		/* drop the exclusive lock on every constituent page */
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_noretire(pp);
	}
}
/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc().
 * pp must be a free, aged page of nonzero size; on return the whole
 * region has been broken into free base-size pages.
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/* demote the entire large-page region containing pp */
	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}
/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		/* bin changed under us (promotion/demotion); retry */
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 * The demotion may have changed pp's bin, so recompute it.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}
14427c478bd9Sstevel@tonic-gate 
/*
 * Remove the free (aged) page rooted at pp from the freelist,
 * demoting it first if its current size exceeds szc.  Counters and,
 * on sparc, the cage freemem count are updated to match.
 */
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		/* bin changed under us; retry with the new bin's mutex */
		mutex_exit(pcm);
		goto	try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		/* the demotion may have changed pp's bin */
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		/* large page: unlink through the vp list links */
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	/* pcm == NULL means we hold the whole-freelist lock instead */
	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}
15177c478bd9Sstevel@tonic-gate 
15187c478bd9Sstevel@tonic-gate /*
15197c478bd9Sstevel@tonic-gate  * Add the page to the front of a linked list of pages
15207c478bd9Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
15217c478bd9Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
15227c478bd9Sstevel@tonic-gate  */
15237c478bd9Sstevel@tonic-gate void
15247c478bd9Sstevel@tonic-gate mach_page_add(page_t **ppp, page_t *pp)
15257c478bd9Sstevel@tonic-gate {
15267c478bd9Sstevel@tonic-gate 	if (*ppp == NULL) {
15277c478bd9Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
15287c478bd9Sstevel@tonic-gate 	} else {
15297c478bd9Sstevel@tonic-gate 		pp->p_next = *ppp;
15307c478bd9Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
15317c478bd9Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
15327c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
15337c478bd9Sstevel@tonic-gate 	}
15347c478bd9Sstevel@tonic-gate 	*ppp = pp;
15357c478bd9Sstevel@tonic-gate }
15367c478bd9Sstevel@tonic-gate 
15377c478bd9Sstevel@tonic-gate /*
15387c478bd9Sstevel@tonic-gate  * Remove this page from a linked list of pages
15397c478bd9Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
15407c478bd9Sstevel@tonic-gate  *
15417c478bd9Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
15427c478bd9Sstevel@tonic-gate  */
15437c478bd9Sstevel@tonic-gate void
15447c478bd9Sstevel@tonic-gate mach_page_sub(page_t **ppp, page_t *pp)
15457c478bd9Sstevel@tonic-gate {
15467c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
15477c478bd9Sstevel@tonic-gate 
15487c478bd9Sstevel@tonic-gate 	if (*ppp == NULL || pp == NULL)
15497c478bd9Sstevel@tonic-gate 		panic("mach_page_sub");
15507c478bd9Sstevel@tonic-gate 
15517c478bd9Sstevel@tonic-gate 	if (*ppp == pp)
15527c478bd9Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
15537c478bd9Sstevel@tonic-gate 
15547c478bd9Sstevel@tonic-gate 	if (*ppp == pp)
15557c478bd9Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
15567c478bd9Sstevel@tonic-gate 	else {
15577c478bd9Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
15587c478bd9Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
15597c478bd9Sstevel@tonic-gate 	}
15607c478bd9Sstevel@tonic-gate 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
15617c478bd9Sstevel@tonic-gate }
15627c478bd9Sstevel@tonic-gate 
15637c478bd9Sstevel@tonic-gate /*
15647c478bd9Sstevel@tonic-gate  * Routine fsflush uses to gradually coalesce the free list into larger pages.
15657c478bd9Sstevel@tonic-gate  */
15667c478bd9Sstevel@tonic-gate void
15677c478bd9Sstevel@tonic-gate page_promote_size(page_t *pp, uint_t cur_szc)
15687c478bd9Sstevel@tonic-gate {
15697c478bd9Sstevel@tonic-gate 	pfn_t pfn;
15707c478bd9Sstevel@tonic-gate 	int mnode;
15717c478bd9Sstevel@tonic-gate 	int idx;
15727c478bd9Sstevel@tonic-gate 	int new_szc = cur_szc + 1;
15737c478bd9Sstevel@tonic-gate 	int full = FULL_REGION_CNT(new_szc);
15747c478bd9Sstevel@tonic-gate 
15757c478bd9Sstevel@tonic-gate 	pfn = page_pptonum(pp);
15767c478bd9Sstevel@tonic-gate 	mnode = PFN_2_MEM_NODE(pfn);
15777c478bd9Sstevel@tonic-gate 
15787c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
15797c478bd9Sstevel@tonic-gate 
15807c478bd9Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
15817c478bd9Sstevel@tonic-gate 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
15827c478bd9Sstevel@tonic-gate 		(void) page_promote(mnode, pfn, new_szc, PC_FREE);
15837c478bd9Sstevel@tonic-gate 
15847c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
15857c478bd9Sstevel@tonic-gate }
15867c478bd9Sstevel@tonic-gate 
/*
 * Failure counters for page_promote(): page_promote_noreloc_err counts
 * attempts that mixed P_NORELOC and relocatable constituent pages;
 * page_promote_err counts promotion failures overall.
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
15897c478bd9Sstevel@tonic-gate 
15907c478bd9Sstevel@tonic-gate /*
15917c478bd9Sstevel@tonic-gate  * Create a single larger page (of szc new_szc) from smaller contiguous pages
15927c478bd9Sstevel@tonic-gate  * for the given mnode starting at pfnum. Pages involved are on the freelist
15937c478bd9Sstevel@tonic-gate  * before the call and may be returned to the caller if requested, otherwise
15947c478bd9Sstevel@tonic-gate  * they will be placed back on the freelist.
15957c478bd9Sstevel@tonic-gate  * If flags is PC_ALLOC, then the large page will be returned to the user in
15967c478bd9Sstevel@tonic-gate  * a state which is consistent with a page being taken off the freelist.  If
15977c478bd9Sstevel@tonic-gate  * we failed to lock the new large page, then we will return NULL to the
15987c478bd9Sstevel@tonic-gate  * caller and put the large page on the freelist instead.
15997c478bd9Sstevel@tonic-gate  * If flags is PC_FREE, then the large page will be placed on the freelist,
16007c478bd9Sstevel@tonic-gate  * and NULL will be returned.
16017c478bd9Sstevel@tonic-gate  * The caller is responsible for locking the freelist as well as any other
16027c478bd9Sstevel@tonic-gate  * accounting which needs to be done for a returned page.
16037c478bd9Sstevel@tonic-gate  *
16047c478bd9Sstevel@tonic-gate  * RFE: For performance pass in pp instead of pfnum so
16057c478bd9Sstevel@tonic-gate  * 	we can avoid excessive calls to page_numtopp_nolock().
16067c478bd9Sstevel@tonic-gate  *	This would depend on an assumption that all contiguous
16077c478bd9Sstevel@tonic-gate  *	pages are in the same memseg so we can just add/dec
16087c478bd9Sstevel@tonic-gate  *	our pp.
16097c478bd9Sstevel@tonic-gate  *
16107c478bd9Sstevel@tonic-gate  * Lock ordering:
16117c478bd9Sstevel@tonic-gate  *
16127c478bd9Sstevel@tonic-gate  *	There is a potential but rare deadlock situation
16137c478bd9Sstevel@tonic-gate  *	for page promotion and demotion operations. The problem
16147c478bd9Sstevel@tonic-gate  *	is there are two paths into the freelist manager and
16157c478bd9Sstevel@tonic-gate  *	they have different lock orders:
16167c478bd9Sstevel@tonic-gate  *
16177c478bd9Sstevel@tonic-gate  *	page_create()
16187c478bd9Sstevel@tonic-gate  *		lock freelist
16197c478bd9Sstevel@tonic-gate  *		page_lock(EXCL)
16207c478bd9Sstevel@tonic-gate  *		unlock freelist
16217c478bd9Sstevel@tonic-gate  *		return
16227c478bd9Sstevel@tonic-gate  *		caller drops page_lock
16237c478bd9Sstevel@tonic-gate  *
16247c478bd9Sstevel@tonic-gate  *	page_free() and page_reclaim()
16257c478bd9Sstevel@tonic-gate  *		caller grabs page_lock(EXCL)
16267c478bd9Sstevel@tonic-gate  *
16277c478bd9Sstevel@tonic-gate  *		lock freelist
16287c478bd9Sstevel@tonic-gate  *		unlock freelist
16297c478bd9Sstevel@tonic-gate  *		drop page_lock
16307c478bd9Sstevel@tonic-gate  *
16317c478bd9Sstevel@tonic-gate  *	What prevents a thread in page_create() from deadlocking
16327c478bd9Sstevel@tonic-gate  *	with a thread freeing or reclaiming the same page is the
16337c478bd9Sstevel@tonic-gate  *	page_trylock() in page_get_freelist(). If the trylock fails
16347c478bd9Sstevel@tonic-gate  *	it skips the page.
16357c478bd9Sstevel@tonic-gate  *
16367c478bd9Sstevel@tonic-gate  *	The lock ordering for promotion and demotion is the same as
16377c478bd9Sstevel@tonic-gate  *	for page_create(). Since the same deadlock could occur during
16387c478bd9Sstevel@tonic-gate  *	page promotion and freeing or reclaiming of a page on the
16397c478bd9Sstevel@tonic-gate  *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
16417c478bd9Sstevel@tonic-gate  */
16427c478bd9Sstevel@tonic-gate page_t *
16437c478bd9Sstevel@tonic-gate page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
16447c478bd9Sstevel@tonic-gate {
16457c478bd9Sstevel@tonic-gate 	page_t		*pp, *pplist, *tpp, *start_pp;
16467c478bd9Sstevel@tonic-gate 	pgcnt_t		new_npgs, npgs;
16477c478bd9Sstevel@tonic-gate 	uint_t		bin;
16487c478bd9Sstevel@tonic-gate 	pgcnt_t		tmpnpgs, pages_left;
16497c478bd9Sstevel@tonic-gate 	uint_t		mtype;
16507c478bd9Sstevel@tonic-gate 	uint_t		noreloc;
16517c478bd9Sstevel@tonic-gate 	uint_t 		i;
16527c478bd9Sstevel@tonic-gate 	int 		which_list;
16537c478bd9Sstevel@tonic-gate 	ulong_t		index;
16547c478bd9Sstevel@tonic-gate 	kmutex_t	*phm;
16557c478bd9Sstevel@tonic-gate 
16567c478bd9Sstevel@tonic-gate 	/*
16577c478bd9Sstevel@tonic-gate 	 * General algorithm:
16587c478bd9Sstevel@tonic-gate 	 * Find the starting page
16597c478bd9Sstevel@tonic-gate 	 * Walk each page struct removing it from the freelist,
16607c478bd9Sstevel@tonic-gate 	 * and linking it to all the other pages removed.
16617c478bd9Sstevel@tonic-gate 	 * Once all pages are off the freelist,
16627c478bd9Sstevel@tonic-gate 	 * walk the list, modifying p_szc to new_szc and what
16637c478bd9Sstevel@tonic-gate 	 * ever other info needs to be done to create a large free page.
16647c478bd9Sstevel@tonic-gate 	 * According to the flags, either return the page or put it
16657c478bd9Sstevel@tonic-gate 	 * on the freelist.
16667c478bd9Sstevel@tonic-gate 	 */
16677c478bd9Sstevel@tonic-gate 
16687c478bd9Sstevel@tonic-gate 	start_pp = page_numtopp_nolock(pfnum);
16697c478bd9Sstevel@tonic-gate 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
16707c478bd9Sstevel@tonic-gate 	new_npgs = page_get_pagecnt(new_szc);
16717c478bd9Sstevel@tonic-gate 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
16727c478bd9Sstevel@tonic-gate 
16737c478bd9Sstevel@tonic-gate 	/*
16747c478bd9Sstevel@tonic-gate 	 * Loop through smaller pages to confirm that all pages
16757c478bd9Sstevel@tonic-gate 	 * give the same result for PP_ISNORELOC().
16767c478bd9Sstevel@tonic-gate 	 * We can check this reliably here as the protocol for setting
16777c478bd9Sstevel@tonic-gate 	 * P_NORELOC requires pages to be taken off the free list first.
16787c478bd9Sstevel@tonic-gate 	 */
16797c478bd9Sstevel@tonic-gate 	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
16807c478bd9Sstevel@tonic-gate 		if (pp == start_pp) {
16817c478bd9Sstevel@tonic-gate 			/* First page, set requirement. */
16827c478bd9Sstevel@tonic-gate 			noreloc = PP_ISNORELOC(pp);
16837c478bd9Sstevel@tonic-gate 		} else if (noreloc != PP_ISNORELOC(pp)) {
16847c478bd9Sstevel@tonic-gate 			page_promote_noreloc_err++;
16857c478bd9Sstevel@tonic-gate 			page_promote_err++;
16867c478bd9Sstevel@tonic-gate 			return (NULL);
16877c478bd9Sstevel@tonic-gate 		}
16887c478bd9Sstevel@tonic-gate 	}
16897c478bd9Sstevel@tonic-gate 
16907c478bd9Sstevel@tonic-gate 	pages_left = new_npgs;
16917c478bd9Sstevel@tonic-gate 	pplist = NULL;
16927c478bd9Sstevel@tonic-gate 	pp = start_pp;
16937c478bd9Sstevel@tonic-gate 
16947c478bd9Sstevel@tonic-gate 	/* Loop around coalescing the smaller pages into a big page. */
16957c478bd9Sstevel@tonic-gate 	while (pages_left) {
16967c478bd9Sstevel@tonic-gate 		/*
16977c478bd9Sstevel@tonic-gate 		 * Remove from the freelist.
16987c478bd9Sstevel@tonic-gate 		 */
16997c478bd9Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
17007c478bd9Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
17017c478bd9Sstevel@tonic-gate 		ASSERT(mnode == PP_2_MEM_NODE(pp));
17027c478bd9Sstevel@tonic-gate 		mtype = PP_2_MTYPE(pp);
17037c478bd9Sstevel@tonic-gate 		if (PP_ISAGED(pp)) {
17047c478bd9Sstevel@tonic-gate 
17057c478bd9Sstevel@tonic-gate 			/*
17067c478bd9Sstevel@tonic-gate 			 * PG_FREE_LIST
17077c478bd9Sstevel@tonic-gate 			 */
17087c478bd9Sstevel@tonic-gate 			if (pp->p_szc) {
17097c478bd9Sstevel@tonic-gate 				page_vpsub(&PAGE_FREELISTS(mnode,
17107c478bd9Sstevel@tonic-gate 				    pp->p_szc, bin, mtype), pp);
17117c478bd9Sstevel@tonic-gate 			} else {
17127c478bd9Sstevel@tonic-gate 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
17137c478bd9Sstevel@tonic-gate 				    bin, mtype), pp);
17147c478bd9Sstevel@tonic-gate 			}
17157c478bd9Sstevel@tonic-gate 			which_list = PG_FREE_LIST;
17167c478bd9Sstevel@tonic-gate 		} else {
17177c478bd9Sstevel@tonic-gate 			ASSERT(pp->p_szc == 0);
17187c478bd9Sstevel@tonic-gate 
17197c478bd9Sstevel@tonic-gate 			/*
17207c478bd9Sstevel@tonic-gate 			 * PG_CACHE_LIST
17217c478bd9Sstevel@tonic-gate 			 *
17227c478bd9Sstevel@tonic-gate 			 * Since this page comes from the
17237c478bd9Sstevel@tonic-gate 			 * cachelist, we must destroy the
17247c478bd9Sstevel@tonic-gate 			 * vnode association.
17257c478bd9Sstevel@tonic-gate 			 */
17267c478bd9Sstevel@tonic-gate 			if (!page_trylock(pp, SE_EXCL)) {
17277c478bd9Sstevel@tonic-gate 				goto fail_promote;
17287c478bd9Sstevel@tonic-gate 			}
17297c478bd9Sstevel@tonic-gate 
17307c478bd9Sstevel@tonic-gate 			/*
17317c478bd9Sstevel@tonic-gate 			 * We need to be careful not to deadlock
17327c478bd9Sstevel@tonic-gate 			 * with another thread in page_lookup().
17337c478bd9Sstevel@tonic-gate 			 * The page_lookup() thread could be holding
17347c478bd9Sstevel@tonic-gate 			 * the same phm that we need if the two
17357c478bd9Sstevel@tonic-gate 			 * pages happen to hash to the same phm lock.
17367c478bd9Sstevel@tonic-gate 			 * At this point we have locked the entire
17377c478bd9Sstevel@tonic-gate 			 * freelist and page_lookup() could be trying
17387c478bd9Sstevel@tonic-gate 			 * to grab a freelist lock.
17397c478bd9Sstevel@tonic-gate 			 */
17407c478bd9Sstevel@tonic-gate 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
17417c478bd9Sstevel@tonic-gate 			phm = PAGE_HASH_MUTEX(index);
17427c478bd9Sstevel@tonic-gate 			if (!mutex_tryenter(phm)) {
1743db874c57Selowe 				page_unlock_noretire(pp);
17447c478bd9Sstevel@tonic-gate 				goto fail_promote;
17457c478bd9Sstevel@tonic-gate 			}
17467c478bd9Sstevel@tonic-gate 
17477c478bd9Sstevel@tonic-gate 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
17487c478bd9Sstevel@tonic-gate 			page_hashout(pp, phm);
17497c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
17507c478bd9Sstevel@tonic-gate 			PP_SETAGED(pp);
1751db874c57Selowe 			page_unlock_noretire(pp);
17527c478bd9Sstevel@tonic-gate 			which_list = PG_CACHE_LIST;
17537c478bd9Sstevel@tonic-gate 		}
1754affbd3ccSkchow 		page_ctr_sub(mnode, mtype, pp, which_list);
17557c478bd9Sstevel@tonic-gate 
17567c478bd9Sstevel@tonic-gate 		/*
17577c478bd9Sstevel@tonic-gate 		 * Concatenate the smaller page(s) onto
17587c478bd9Sstevel@tonic-gate 		 * the large page list.
17597c478bd9Sstevel@tonic-gate 		 */
17607c478bd9Sstevel@tonic-gate 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
17617c478bd9Sstevel@tonic-gate 		pages_left -= npgs;
17627c478bd9Sstevel@tonic-gate 		tpp = pp;
17637c478bd9Sstevel@tonic-gate 		while (npgs--) {
17647c478bd9Sstevel@tonic-gate 			tpp->p_szc = new_szc;
17657c478bd9Sstevel@tonic-gate 			tpp = tpp->p_next;
17667c478bd9Sstevel@tonic-gate 		}
17677c478bd9Sstevel@tonic-gate 		page_list_concat(&pplist, &pp);
17687c478bd9Sstevel@tonic-gate 		pp += tmpnpgs;
17697c478bd9Sstevel@tonic-gate 	}
17707c478bd9Sstevel@tonic-gate 	CHK_LPG(pplist, new_szc);
17717c478bd9Sstevel@tonic-gate 
17727c478bd9Sstevel@tonic-gate 	/*
17737c478bd9Sstevel@tonic-gate 	 * return the page to the user if requested
17747c478bd9Sstevel@tonic-gate 	 * in the properly locked state.
17757c478bd9Sstevel@tonic-gate 	 */
17767c478bd9Sstevel@tonic-gate 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
17777c478bd9Sstevel@tonic-gate 		return (pplist);
17787c478bd9Sstevel@tonic-gate 	}
17797c478bd9Sstevel@tonic-gate 
17807c478bd9Sstevel@tonic-gate 	/*
17817c478bd9Sstevel@tonic-gate 	 * Otherwise place the new large page on the freelist
17827c478bd9Sstevel@tonic-gate 	 */
17837c478bd9Sstevel@tonic-gate 	bin = PP_2_BIN(pplist);
17847c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pplist);
17857c478bd9Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pplist);
17867c478bd9Sstevel@tonic-gate 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
17877c478bd9Sstevel@tonic-gate 
1788affbd3ccSkchow 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
17897c478bd9Sstevel@tonic-gate 	return (NULL);
17907c478bd9Sstevel@tonic-gate 
17917c478bd9Sstevel@tonic-gate fail_promote:
17927c478bd9Sstevel@tonic-gate 	/*
17937c478bd9Sstevel@tonic-gate 	 * A thread must have still been freeing or
17947c478bd9Sstevel@tonic-gate 	 * reclaiming the page on the cachelist.
17957c478bd9Sstevel@tonic-gate 	 * To prevent a deadlock undo what we have
17967c478bd9Sstevel@tonic-gate 	 * done sofar and return failure. This
17977c478bd9Sstevel@tonic-gate 	 * situation can only happen while promoting
17987c478bd9Sstevel@tonic-gate 	 * PAGESIZE pages.
17997c478bd9Sstevel@tonic-gate 	 */
18007c478bd9Sstevel@tonic-gate 	page_promote_err++;
18017c478bd9Sstevel@tonic-gate 	while (pplist) {
18027c478bd9Sstevel@tonic-gate 		pp = pplist;
18037c478bd9Sstevel@tonic-gate 		mach_page_sub(&pplist, pp);
18047c478bd9Sstevel@tonic-gate 		pp->p_szc = 0;
18057c478bd9Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
18067c478bd9Sstevel@tonic-gate 		mtype = PP_2_MTYPE(pp);
18077c478bd9Sstevel@tonic-gate 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
1808affbd3ccSkchow 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
18097c478bd9Sstevel@tonic-gate 	}
18107c478bd9Sstevel@tonic-gate 	return (NULL);
18117c478bd9Sstevel@tonic-gate 
18127c478bd9Sstevel@tonic-gate }
18137c478bd9Sstevel@tonic-gate 
/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 *
 *	mnode	 memory node of the large page (asserted, not derived)
 *	pfnum	 base pfn of the large page to demote
 *	cur_szc	 current size code; must be non-zero (a real large page)
 *	new_szc	 target size code; must be strictly smaller than cur_szc
 *	color	 desired color of a returned page (PC_ALLOC only)
 *	flags	 PC_ALLOC to try to hand back one EXCL-locked page of the
 *		 requested color; otherwise everything returns to freelists
 *
 * Returns the EXCL-locked page of the requested color if one was claimed,
 * otherwise NULL.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
    int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/*
	 * Unlink the large page from its current size/color freelist and
	 * drop the region counters for it.
	 */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Give the first page matching the requested color
			 * to the caller (EXCL-locked); all other pieces go
			 * back on the PAGESIZE freelist.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			/* Retag every constituent page with the new size. */
			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			CHK_LPG(pplist, new_szc);

			/*
			 * NOTE(review): the walk above presumably brings pp
			 * back around to pplist (circular sublist), so the
			 * ASSERT and trylock below operate on the list head.
			 */
			bin = PP_2_BIN(pplist);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}
19187c478bd9Sstevel@tonic-gate 
19197c478bd9Sstevel@tonic-gate int mpss_coalesce_disable = 0;
19207c478bd9Sstevel@tonic-gate 
/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * On success the returned page is EXCL-locked (page_promote() with
 * PC_ALLOC locks it before returning).  The scan resumes from where the
 * previous search for this (mnode, szc, color) left off so successive
 * calls spread their work across the counter array.
 */
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
	int 	r;		/* region size */
	int 	idx, full, i;
	pfn_t	pfnum;
	size_t	len;
	size_t	buckets_to_check;
	pgcnt_t	cands;
	page_t	*ret_pp;
	int	color_stride;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);

	if (mpss_coalesce_disable) {
		return (NULL);
	}

	r = szc;
	/*
	 * Cheap unlocked check: if the candidate counters say no region of
	 * this size/color can currently be full, skip the scan entirely.
	 */
	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
		return (NULL);
	}
	full = FULL_REGION_CNT(r);
	/* Distance between consecutive counter entries of the same color. */
	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	len  = PAGE_COUNTERS_ENTRIES(mnode, r);
	buckets_to_check = len / color_stride;
	/*
	 * Start one stride past the stashed position from the last scan,
	 * wrapping back to the first entry of this color at the end.
	 */
	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
	ASSERT((idx % color_stride) == color);
	idx += color_stride;
	if (idx >= len)
		idx = color;
	for (i = 0; i < buckets_to_check; i++) {
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			pfnum = IDX_TO_PNUM(mnode, r, idx);
			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
			    pfnum < mem_node_config[mnode].physmax);
			/*
			 * RFE: For performance maybe we can do something less
			 *	brutal than locking the entire freelist. So far
			 * 	this doesn't seem to be a performance problem?
			 */
			page_freelist_lock(mnode);
			/*
			 * Double-check the counter now that the freelist is
			 * locked; it may have changed since the unlocked read.
			 */
			if (PAGE_COUNTERS(mnode, r, idx) != full) {
				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
				goto skip_this_one;
			}
			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
			if (ret_pp != NULL) {
				/* Remember where to resume next time. */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				/*
				 * A P_NORELOC page came out of the kernel
				 * cage; adjust the cage free memory count.
				 */
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
skip_this_one:
			page_freelist_unlock(mnode);
			/*
			 * No point looking for another page if we've
			 * already tried all of the ones that
			 * page_ctr_cands indicated.  Stash off where we left
			 * off.
			 * Note: this is not exact since we don't hold the
			 * page_freelist_locks before we initially get the
			 * value of cands for performance reasons, but should
			 * be a decent approximation.
			 */
			if (--cands == 0) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				break;
			}
		}
		idx += color_stride;
		if (idx >= len)
			idx = color;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
	return (NULL);
}
20197c478bd9Sstevel@tonic-gate 
20207c478bd9Sstevel@tonic-gate /*
20217c478bd9Sstevel@tonic-gate  * For the given mnode, promote as many small pages to large pages as possible.
20227c478bd9Sstevel@tonic-gate  */
20237c478bd9Sstevel@tonic-gate void
20247c478bd9Sstevel@tonic-gate page_freelist_coalesce_all(int mnode)
20257c478bd9Sstevel@tonic-gate {
20267c478bd9Sstevel@tonic-gate 	int 	r;		/* region size */
20277c478bd9Sstevel@tonic-gate 	int 	idx, full;
20287c478bd9Sstevel@tonic-gate 	pfn_t	pfnum;
20297c478bd9Sstevel@tonic-gate 	size_t	len;
20307c478bd9Sstevel@tonic-gate 
20317c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
20327c478bd9Sstevel@tonic-gate 
20337c478bd9Sstevel@tonic-gate 	if (mpss_coalesce_disable) {
20347c478bd9Sstevel@tonic-gate 		return;
20357c478bd9Sstevel@tonic-gate 	}
20367c478bd9Sstevel@tonic-gate 
20377c478bd9Sstevel@tonic-gate 	/*
20387c478bd9Sstevel@tonic-gate 	 * Lock the entire freelist and coalesce what we can.
20397c478bd9Sstevel@tonic-gate 	 *
20407c478bd9Sstevel@tonic-gate 	 * Always promote to the largest page possible
20417c478bd9Sstevel@tonic-gate 	 * first to reduce the number of page promotions.
20427c478bd9Sstevel@tonic-gate 	 */
20437c478bd9Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
20447c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
20457c478bd9Sstevel@tonic-gate 	for (r = mmu_page_sizes - 1; r > 0; r--) {
20467c478bd9Sstevel@tonic-gate 		pgcnt_t cands;
20477c478bd9Sstevel@tonic-gate 
20487c478bd9Sstevel@tonic-gate 		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
20497c478bd9Sstevel@tonic-gate 		if (cands == 0) {
20507c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
20517c478bd9Sstevel@tonic-gate 			continue;
20527c478bd9Sstevel@tonic-gate 		}
20537c478bd9Sstevel@tonic-gate 
20547c478bd9Sstevel@tonic-gate 		full = FULL_REGION_CNT(r);
20557c478bd9Sstevel@tonic-gate 		len  = PAGE_COUNTERS_ENTRIES(mnode, r);
20567c478bd9Sstevel@tonic-gate 
20577c478bd9Sstevel@tonic-gate 		for (idx = 0; idx < len; idx++) {
20587c478bd9Sstevel@tonic-gate 			if (PAGE_COUNTERS(mnode, r, idx) == full) {
20597c478bd9Sstevel@tonic-gate 				pfnum = IDX_TO_PNUM(mnode, r, idx);
20607c478bd9Sstevel@tonic-gate 				ASSERT(pfnum >=
20617c478bd9Sstevel@tonic-gate 				    mem_node_config[mnode].physbase &&
20627c478bd9Sstevel@tonic-gate 				    pfnum <
20637c478bd9Sstevel@tonic-gate 				    mem_node_config[mnode].physmax);
20647c478bd9Sstevel@tonic-gate 				(void) page_promote(mnode, pfnum, r, PC_FREE);
20657c478bd9Sstevel@tonic-gate 			}
20667c478bd9Sstevel@tonic-gate 		}
20677c478bd9Sstevel@tonic-gate 	}
20687c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
20697c478bd9Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
20707c478bd9Sstevel@tonic-gate }
20717c478bd9Sstevel@tonic-gate 
20727c478bd9Sstevel@tonic-gate /*
20737c478bd9Sstevel@tonic-gate  * This is where all polices for moving pages around
20747c478bd9Sstevel@tonic-gate  * to different page size free lists is implemented.
20757c478bd9Sstevel@tonic-gate  * Returns 1 on success, 0 on failure.
20767c478bd9Sstevel@tonic-gate  *
20777c478bd9Sstevel@tonic-gate  * So far these are the priorities for this algorithm in descending
20787c478bd9Sstevel@tonic-gate  * order:
20797c478bd9Sstevel@tonic-gate  *
20807c478bd9Sstevel@tonic-gate  *	1) When servicing a request try to do so with a free page
20817c478bd9Sstevel@tonic-gate  *	   from next size up. Helps defer fragmentation as long
20827c478bd9Sstevel@tonic-gate  *	   as possible.
20837c478bd9Sstevel@tonic-gate  *
20847c478bd9Sstevel@tonic-gate  *	2) Page coalesce on demand. Only when a freelist
20857c478bd9Sstevel@tonic-gate  *	   larger than PAGESIZE is empty and step 1
20867c478bd9Sstevel@tonic-gate  *	   will not work since all larger size lists are
20877c478bd9Sstevel@tonic-gate  *	   also empty.
20887c478bd9Sstevel@tonic-gate  *
20897c478bd9Sstevel@tonic-gate  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
20907c478bd9Sstevel@tonic-gate  */
20917c478bd9Sstevel@tonic-gate page_t *
20927c478bd9Sstevel@tonic-gate page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
20937c478bd9Sstevel@tonic-gate {
20947c478bd9Sstevel@tonic-gate 	uchar_t nszc = szc + 1;
20957c478bd9Sstevel@tonic-gate 	int 	bin;
20967c478bd9Sstevel@tonic-gate 	page_t	*pp, *firstpp;
20977c478bd9Sstevel@tonic-gate 	page_t	*ret_pp = NULL;
20987c478bd9Sstevel@tonic-gate 
20997c478bd9Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
21007c478bd9Sstevel@tonic-gate 
2101affbd3ccSkchow 	VM_STAT_ADD(vmm_vmstats.pff_req[szc]);
21027c478bd9Sstevel@tonic-gate 	/*
21037c478bd9Sstevel@tonic-gate 	 * First try to break up a larger page to fill
21047c478bd9Sstevel@tonic-gate 	 * current size freelist.
21057c478bd9Sstevel@tonic-gate 	 */
21067c478bd9Sstevel@tonic-gate 	while (nszc < mmu_page_sizes) {
21077c478bd9Sstevel@tonic-gate 		/*
21087c478bd9Sstevel@tonic-gate 		 * If page found then demote it.
21097c478bd9Sstevel@tonic-gate 		 */
21107c478bd9Sstevel@tonic-gate 		bin = page_convert_color(szc, nszc, color);
21117c478bd9Sstevel@tonic-gate 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
21127c478bd9Sstevel@tonic-gate 			page_freelist_lock(mnode);
21137c478bd9Sstevel@tonic-gate 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
21147c478bd9Sstevel@tonic-gate 
21157c478bd9Sstevel@tonic-gate 			/*
21167c478bd9Sstevel@tonic-gate 			 * If pfnhi is not PFNNULL, look for large page below
21177c478bd9Sstevel@tonic-gate 			 * pfnhi. PFNNULL signifies no pfn requirement.
21187c478bd9Sstevel@tonic-gate 			 */
21197c478bd9Sstevel@tonic-gate 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
21207c478bd9Sstevel@tonic-gate 				do {
21217c478bd9Sstevel@tonic-gate 					pp = pp->p_vpnext;
21227c478bd9Sstevel@tonic-gate 					if (pp == firstpp) {
21237c478bd9Sstevel@tonic-gate 						pp = NULL;
21247c478bd9Sstevel@tonic-gate 						break;
21257c478bd9Sstevel@tonic-gate 					}
21267c478bd9Sstevel@tonic-gate 				} while (pp->p_pagenum >= pfnhi);
21277c478bd9Sstevel@tonic-gate 			}
21287c478bd9Sstevel@tonic-gate 			if (pp) {
21297c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == nszc);
2130affbd3ccSkchow 				VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]);
21317c478bd9Sstevel@tonic-gate 				ret_pp = page_demote(mnode, pp->p_pagenum,
21327c478bd9Sstevel@tonic-gate 				    pp->p_szc, szc, color, PC_ALLOC);
21337c478bd9Sstevel@tonic-gate 				if (ret_pp) {
21347c478bd9Sstevel@tonic-gate 					page_freelist_unlock(mnode);
21357c478bd9Sstevel@tonic-gate #if defined(__sparc)
21367c478bd9Sstevel@tonic-gate 					if (PP_ISNORELOC(ret_pp)) {
21377c478bd9Sstevel@tonic-gate 						pgcnt_t npgs;
21387c478bd9Sstevel@tonic-gate 
21397c478bd9Sstevel@tonic-gate 						npgs = page_get_pagecnt(
21407c478bd9Sstevel@tonic-gate 						    ret_pp->p_szc);
21417c478bd9Sstevel@tonic-gate 						kcage_freemem_sub(npgs);
21427c478bd9Sstevel@tonic-gate 					}
21437c478bd9Sstevel@tonic-gate #endif
21447c478bd9Sstevel@tonic-gate 					return (ret_pp);
21457c478bd9Sstevel@tonic-gate 				}
21467c478bd9Sstevel@tonic-gate 			}
21477c478bd9Sstevel@tonic-gate 			page_freelist_unlock(mnode);
21487c478bd9Sstevel@tonic-gate 		}
21497c478bd9Sstevel@tonic-gate 		nszc++;
21507c478bd9Sstevel@tonic-gate 	}
21517c478bd9Sstevel@tonic-gate 
21527c478bd9Sstevel@tonic-gate 	/*
21537c478bd9Sstevel@tonic-gate 	 * Ok that didn't work. Time to coalesce.
21547c478bd9Sstevel@tonic-gate 	 */
21557c478bd9Sstevel@tonic-gate 	if (szc != 0) {
21567c478bd9Sstevel@tonic-gate 		ret_pp = page_freelist_coalesce(mnode, szc, color);
2157affbd3ccSkchow 		VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]);
21587c478bd9Sstevel@tonic-gate 	}
21597c478bd9Sstevel@tonic-gate 
21607c478bd9Sstevel@tonic-gate 	return (ret_pp);
21617c478bd9Sstevel@tonic-gate }
21627c478bd9Sstevel@tonic-gate 
21637c478bd9Sstevel@tonic-gate /*
21647c478bd9Sstevel@tonic-gate  * Helper routine used only by the freelist code to lock
21657c478bd9Sstevel@tonic-gate  * a page. If the page is a large page then it succeeds in
21667c478bd9Sstevel@tonic-gate  * locking all the constituent pages or none at all.
21677c478bd9Sstevel@tonic-gate  * Returns 1 on sucess, 0 on failure.
21687c478bd9Sstevel@tonic-gate  */
21697c478bd9Sstevel@tonic-gate static int
21707c478bd9Sstevel@tonic-gate page_trylock_cons(page_t *pp, se_t se)
21717c478bd9Sstevel@tonic-gate {
21727c478bd9Sstevel@tonic-gate 	page_t	*tpp, *first_pp = pp;
21737c478bd9Sstevel@tonic-gate 
21747c478bd9Sstevel@tonic-gate 	/*
21757c478bd9Sstevel@tonic-gate 	 * Fail if can't lock first or only page.
21767c478bd9Sstevel@tonic-gate 	 */
21777c478bd9Sstevel@tonic-gate 	if (!page_trylock(pp, se)) {
21787c478bd9Sstevel@tonic-gate 		return (0);
21797c478bd9Sstevel@tonic-gate 	}
21807c478bd9Sstevel@tonic-gate 
21817c478bd9Sstevel@tonic-gate 	/*
21827c478bd9Sstevel@tonic-gate 	 * PAGESIZE: common case.
21837c478bd9Sstevel@tonic-gate 	 */
21847c478bd9Sstevel@tonic-gate 	if (pp->p_szc == 0) {
21857c478bd9Sstevel@tonic-gate 		return (1);
21867c478bd9Sstevel@tonic-gate 	}
21877c478bd9Sstevel@tonic-gate 
21887c478bd9Sstevel@tonic-gate 	/*
21897c478bd9Sstevel@tonic-gate 	 * Large page case.
21907c478bd9Sstevel@tonic-gate 	 */
21917c478bd9Sstevel@tonic-gate 	tpp = pp->p_next;
21927c478bd9Sstevel@tonic-gate 	while (tpp != pp) {
21937c478bd9Sstevel@tonic-gate 		if (!page_trylock(tpp, se)) {
21947c478bd9Sstevel@tonic-gate 			/*
21957c478bd9Sstevel@tonic-gate 			 * On failure unlock what we
21967c478bd9Sstevel@tonic-gate 			 * have locked so far.
21977c478bd9Sstevel@tonic-gate 			 */
21987c478bd9Sstevel@tonic-gate 			while (first_pp != tpp) {
2199db874c57Selowe 				page_unlock_noretire(first_pp);
22007c478bd9Sstevel@tonic-gate 				first_pp = first_pp->p_next;
22017c478bd9Sstevel@tonic-gate 			}
22027c478bd9Sstevel@tonic-gate 			return (0);
22037c478bd9Sstevel@tonic-gate 		}
22047c478bd9Sstevel@tonic-gate 		tpp = tpp->p_next;
22057c478bd9Sstevel@tonic-gate 	}
22067c478bd9Sstevel@tonic-gate 	return (1);
22077c478bd9Sstevel@tonic-gate }
22087c478bd9Sstevel@tonic-gate 
22097c478bd9Sstevel@tonic-gate page_t *
22107c478bd9Sstevel@tonic-gate page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
22117c478bd9Sstevel@tonic-gate     uint_t flags)
22127c478bd9Sstevel@tonic-gate {
22137c478bd9Sstevel@tonic-gate 	kmutex_t	*pcm;
22147c478bd9Sstevel@tonic-gate 	int		i, fill_tried, fill_marker;
22157c478bd9Sstevel@tonic-gate 	page_t		*pp, *first_pp;
22167c478bd9Sstevel@tonic-gate 	uint_t		bin_marker;
22177c478bd9Sstevel@tonic-gate 	int		colors, cpucolors;
22187c478bd9Sstevel@tonic-gate 	uchar_t		nszc;
22197c478bd9Sstevel@tonic-gate 	uint_t		nszc_color_shift;
22207c478bd9Sstevel@tonic-gate 	int		nwaybins = 0, nwaycnt;
22217c478bd9Sstevel@tonic-gate 
22227c478bd9Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
22237c478bd9Sstevel@tonic-gate 
22247c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
22257c478bd9Sstevel@tonic-gate 
22267c478bd9Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
22277c478bd9Sstevel@tonic-gate 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
22287c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
22297c478bd9Sstevel@tonic-gate 		return (NULL);
22307c478bd9Sstevel@tonic-gate 	}
22317c478bd9Sstevel@tonic-gate 
22327c478bd9Sstevel@tonic-gate 	/*
22337c478bd9Sstevel@tonic-gate 	 * Set how many physical colors for this page size.
22347c478bd9Sstevel@tonic-gate 	 */
22357c478bd9Sstevel@tonic-gate 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
22367c478bd9Sstevel@tonic-gate 	    page_colors;
22377c478bd9Sstevel@tonic-gate 
22387c478bd9Sstevel@tonic-gate 	nszc = MIN(szc + 1, mmu_page_sizes - 1);
22397c478bd9Sstevel@tonic-gate 	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
22407c478bd9Sstevel@tonic-gate 
22417c478bd9Sstevel@tonic-gate 	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
22427c478bd9Sstevel@tonic-gate 	cpucolors = cpu_page_colors;
22437c478bd9Sstevel@tonic-gate 
22447c478bd9Sstevel@tonic-gate 	/*
22457c478bd9Sstevel@tonic-gate 	 * adjust cpucolors to possibly check additional 'equivalent' bins
22467c478bd9Sstevel@tonic-gate 	 * to try to minimize fragmentation of large pages by delaying calls
22477c478bd9Sstevel@tonic-gate 	 * to page_freelist_fill.
22487c478bd9Sstevel@tonic-gate 	 */
22497c478bd9Sstevel@tonic-gate 	if (colorequiv > 1) {
22507c478bd9Sstevel@tonic-gate 		int equivcolors = colors / colorequiv;
22517c478bd9Sstevel@tonic-gate 
22527c478bd9Sstevel@tonic-gate 		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
22537c478bd9Sstevel@tonic-gate 			cpucolors = equivcolors;
22547c478bd9Sstevel@tonic-gate 	}
22557c478bd9Sstevel@tonic-gate 
22567c478bd9Sstevel@tonic-gate 	ASSERT(colors <= page_colors);
22577c478bd9Sstevel@tonic-gate 	ASSERT(colors);
22587c478bd9Sstevel@tonic-gate 	ASSERT((colors & (colors - 1)) == 0);
22597c478bd9Sstevel@tonic-gate 
22607c478bd9Sstevel@tonic-gate 	ASSERT(bin < colors);
22617c478bd9Sstevel@tonic-gate 
22627c478bd9Sstevel@tonic-gate 	/*
22637c478bd9Sstevel@tonic-gate 	 * Only hold one freelist lock at a time, that way we
22647c478bd9Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
22657c478bd9Sstevel@tonic-gate 	 * ordering.
22667c478bd9Sstevel@tonic-gate 	 */
22677c478bd9Sstevel@tonic-gate big_try_again:
22687c478bd9Sstevel@tonic-gate 	fill_tried = 0;
22697c478bd9Sstevel@tonic-gate 	nwaycnt = 0;
22707c478bd9Sstevel@tonic-gate 	for (i = 0; i <= colors; i++) {
22717c478bd9Sstevel@tonic-gate try_again:
22727c478bd9Sstevel@tonic-gate 		ASSERT(bin < colors);
22737c478bd9Sstevel@tonic-gate 		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
22747c478bd9Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
22757c478bd9Sstevel@tonic-gate 			mutex_enter(pcm);
22767c478bd9Sstevel@tonic-gate 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
22777c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
22787c478bd9Sstevel@tonic-gate 				/*
22797c478bd9Sstevel@tonic-gate 				 * These were set before the page
22807c478bd9Sstevel@tonic-gate 				 * was put on the free list,
22817c478bd9Sstevel@tonic-gate 				 * they must still be set.
22827c478bd9Sstevel@tonic-gate 				 */
22837c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
22847c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
22857c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
22867c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
22877c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
22887c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
22897c478bd9Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
22907c478bd9Sstevel@tonic-gate 
22917c478bd9Sstevel@tonic-gate 				/*
22927c478bd9Sstevel@tonic-gate 				 * Walk down the hash chain.
22937c478bd9Sstevel@tonic-gate 				 * 8k pages are linked on p_next
22947c478bd9Sstevel@tonic-gate 				 * and p_prev fields. Large pages
22957c478bd9Sstevel@tonic-gate 				 * are a contiguous group of
22967c478bd9Sstevel@tonic-gate 				 * constituent pages linked together
22977c478bd9Sstevel@tonic-gate 				 * on their p_next and p_prev fields.
22987c478bd9Sstevel@tonic-gate 				 * The large pages are linked together
22997c478bd9Sstevel@tonic-gate 				 * on the hash chain using p_vpnext
23007c478bd9Sstevel@tonic-gate 				 * p_vpprev of the base constituent
23017c478bd9Sstevel@tonic-gate 				 * page of each large page.
23027c478bd9Sstevel@tonic-gate 				 */
23037c478bd9Sstevel@tonic-gate 				first_pp = pp;
23047c478bd9Sstevel@tonic-gate 				while (!page_trylock_cons(pp, SE_EXCL)) {
23057c478bd9Sstevel@tonic-gate 					if (szc == 0) {
23067c478bd9Sstevel@tonic-gate 						pp = pp->p_next;
23077c478bd9Sstevel@tonic-gate 					} else {
23087c478bd9Sstevel@tonic-gate 						pp = pp->p_vpnext;
23097c478bd9Sstevel@tonic-gate 					}
23107c478bd9Sstevel@tonic-gate 
23117c478bd9Sstevel@tonic-gate 					ASSERT(PP_ISFREE(pp));
23127c478bd9Sstevel@tonic-gate 					ASSERT(PP_ISAGED(pp));
23137c478bd9Sstevel@tonic-gate 					ASSERT(pp->p_vnode == NULL);
23147c478bd9Sstevel@tonic-gate 					ASSERT(pp->p_hash == NULL);
23157c478bd9Sstevel@tonic-gate 					ASSERT(pp->p_offset == (u_offset_t)-1);
23167c478bd9Sstevel@tonic-gate 					ASSERT(pp->p_szc == szc);
23177c478bd9Sstevel@tonic-gate 					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
23187c478bd9Sstevel@tonic-gate 							mnode);
23197c478bd9Sstevel@tonic-gate 
23207c478bd9Sstevel@tonic-gate 					if (pp == first_pp) {
23217c478bd9Sstevel@tonic-gate 						pp = NULL;
23227c478bd9Sstevel@tonic-gate 						break;
23237c478bd9Sstevel@tonic-gate 					}
23247c478bd9Sstevel@tonic-gate 				}
23257c478bd9Sstevel@tonic-gate 
23267c478bd9Sstevel@tonic-gate 				if (pp) {
23277c478bd9Sstevel@tonic-gate 					ASSERT(mtype == PP_2_MTYPE(pp));
23287c478bd9Sstevel@tonic-gate 					ASSERT(pp->p_szc == szc);
23297c478bd9Sstevel@tonic-gate 					if (szc == 0) {
23307c478bd9Sstevel@tonic-gate 						page_sub(&PAGE_FREELISTS(mnode,
23317c478bd9Sstevel@tonic-gate 						    szc, bin, mtype), pp);
23327c478bd9Sstevel@tonic-gate 					} else {
23337c478bd9Sstevel@tonic-gate 						page_vpsub(&PAGE_FREELISTS(
23347c478bd9Sstevel@tonic-gate 						    mnode, szc, bin, mtype),
23357c478bd9Sstevel@tonic-gate 						    pp);
23367c478bd9Sstevel@tonic-gate 						CHK_LPG(pp, szc);
23377c478bd9Sstevel@tonic-gate 					}
2338affbd3ccSkchow 					page_ctr_sub(mnode, mtype, pp,
2339affbd3ccSkchow 					    PG_FREE_LIST);
23407c478bd9Sstevel@tonic-gate 
23417c478bd9Sstevel@tonic-gate 					if ((PP_ISFREE(pp) == 0) ||
23427c478bd9Sstevel@tonic-gate 					    (PP_ISAGED(pp) == 0))
23437c478bd9Sstevel@tonic-gate 						panic("free page is not. pp %p",
23447c478bd9Sstevel@tonic-gate 						    (void *)pp);
23457c478bd9Sstevel@tonic-gate 					mutex_exit(pcm);
23467c478bd9Sstevel@tonic-gate 
23477c478bd9Sstevel@tonic-gate #if defined(__sparc)
23487c478bd9Sstevel@tonic-gate 					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
23497c478bd9Sstevel@tonic-gate 					    (flags & PG_NORELOC) == 0);
23507c478bd9Sstevel@tonic-gate 
23517c478bd9Sstevel@tonic-gate 					if (PP_ISNORELOC(pp)) {
23527c478bd9Sstevel@tonic-gate 						pgcnt_t	npgs;
23537c478bd9Sstevel@tonic-gate 
23547c478bd9Sstevel@tonic-gate 						npgs = page_get_pagecnt(szc);
23557c478bd9Sstevel@tonic-gate 						kcage_freemem_sub(npgs);
23567c478bd9Sstevel@tonic-gate 					}
23577c478bd9Sstevel@tonic-gate #endif
23587c478bd9Sstevel@tonic-gate 					VM_STAT_ADD(vmm_vmstats.
23597c478bd9Sstevel@tonic-gate 					    pgmf_allocok[szc]);
23607c478bd9Sstevel@tonic-gate 					return (pp);
23617c478bd9Sstevel@tonic-gate 				}
23627c478bd9Sstevel@tonic-gate 			}
23637c478bd9Sstevel@tonic-gate 			mutex_exit(pcm);
23647c478bd9Sstevel@tonic-gate 		}
23657c478bd9Sstevel@tonic-gate 
23667c478bd9Sstevel@tonic-gate 		/*
23677c478bd9Sstevel@tonic-gate 		 * Wow! The initial bin is empty.
23687c478bd9Sstevel@tonic-gate 		 * If specific color is needed, check if page color may be
23697c478bd9Sstevel@tonic-gate 		 * in other bins. cpucolors is:
23707c478bd9Sstevel@tonic-gate 		 *   0	if the colors for this cpu is equal to page_colors.
23717c478bd9Sstevel@tonic-gate 		 *	This means that pages with a particular color are in a
23727c478bd9Sstevel@tonic-gate 		 *	single bin.
23737c478bd9Sstevel@tonic-gate 		 *  -1	if colors of cpus (cheetah+) are heterogenous. Need to
23747c478bd9Sstevel@tonic-gate 		 *	first determine the colors for the current cpu.
23757c478bd9Sstevel@tonic-gate 		 *  >0	colors of all cpus are homogenous and < page_colors
23767c478bd9Sstevel@tonic-gate 		 */
23777c478bd9Sstevel@tonic-gate 
23787c478bd9Sstevel@tonic-gate 		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
23797c478bd9Sstevel@tonic-gate 			if (!nwaybins) {
23807c478bd9Sstevel@tonic-gate 				/*
23817c478bd9Sstevel@tonic-gate 				 * cpucolors is negative if ecache setsizes
23827c478bd9Sstevel@tonic-gate 				 * are heterogenous. determine colors for this
23837c478bd9Sstevel@tonic-gate 				 * particular cpu.
23847c478bd9Sstevel@tonic-gate 				 */
23857c478bd9Sstevel@tonic-gate 				if (cpucolors < 0) {
23867c478bd9Sstevel@tonic-gate 					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
23877c478bd9Sstevel@tonic-gate 					ASSERT(cpucolors > 0);
23887c478bd9Sstevel@tonic-gate 					nwaybins = colors / cpucolors;
23897c478bd9Sstevel@tonic-gate 				} else {
23907c478bd9Sstevel@tonic-gate 					nwaybins = colors / cpucolors;
23917c478bd9Sstevel@tonic-gate 					ASSERT(szc > 0 || nwaybins > 1);
23927c478bd9Sstevel@tonic-gate 				}
23937c478bd9Sstevel@tonic-gate 				if (nwaybins < 2)
23947c478bd9Sstevel@tonic-gate 					cpucolors = 0;
23957c478bd9Sstevel@tonic-gate 			}
23967c478bd9Sstevel@tonic-gate 
23977c478bd9Sstevel@tonic-gate 			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
23987c478bd9Sstevel@tonic-gate 				nwaycnt++;
23997c478bd9Sstevel@tonic-gate 				bin = (bin + (colors / nwaybins)) &
24007c478bd9Sstevel@tonic-gate 				    (colors - 1);
24017c478bd9Sstevel@tonic-gate 				if (nwaycnt < nwaybins) {
24027c478bd9Sstevel@tonic-gate 					goto try_again;
24037c478bd9Sstevel@tonic-gate 				}
24047c478bd9Sstevel@tonic-gate 			}
24057c478bd9Sstevel@tonic-gate 			/* back to initial color if fall-thru */
24067c478bd9Sstevel@tonic-gate 		}
24077c478bd9Sstevel@tonic-gate 
24087c478bd9Sstevel@tonic-gate 		/*
24097c478bd9Sstevel@tonic-gate 		 * color bins are all empty if color match. Try and satisfy
24107c478bd9Sstevel@tonic-gate 		 * the request by breaking up or coalescing pages from
24117c478bd9Sstevel@tonic-gate 		 * a different size freelist of the correct color that
24127c478bd9Sstevel@tonic-gate 		 * satisfies the ORIGINAL color requested. If that
24137c478bd9Sstevel@tonic-gate 		 * fails then try pages of the same size but different
24147c478bd9Sstevel@tonic-gate 		 * colors assuming we are not called with
24157c478bd9Sstevel@tonic-gate 		 * PG_MATCH_COLOR.
24167c478bd9Sstevel@tonic-gate 		 */
24177c478bd9Sstevel@tonic-gate 		if (!fill_tried) {
24187c478bd9Sstevel@tonic-gate 			fill_tried = 1;
24197c478bd9Sstevel@tonic-gate 			fill_marker = bin >> nszc_color_shift;
24207c478bd9Sstevel@tonic-gate 			pp = page_freelist_fill(szc, bin, mnode, mtype,
24217c478bd9Sstevel@tonic-gate 			    PFNNULL);
24227c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
24237c478bd9Sstevel@tonic-gate 				return (pp);
24247c478bd9Sstevel@tonic-gate 			}
24257c478bd9Sstevel@tonic-gate 		}
24267c478bd9Sstevel@tonic-gate 
24277c478bd9Sstevel@tonic-gate 		if (flags & PG_MATCH_COLOR)
24287c478bd9Sstevel@tonic-gate 			break;
24297c478bd9Sstevel@tonic-gate 
24307c478bd9Sstevel@tonic-gate 		/*
24317c478bd9Sstevel@tonic-gate 		 * Select next color bin to try.
24327c478bd9Sstevel@tonic-gate 		 */
24337c478bd9Sstevel@tonic-gate 		if (szc == 0) {
24347c478bd9Sstevel@tonic-gate 			/*
24357c478bd9Sstevel@tonic-gate 			 * PAGESIZE page case.
24367c478bd9Sstevel@tonic-gate 			 */
24377c478bd9Sstevel@tonic-gate 			if (i == 0) {
24387c478bd9Sstevel@tonic-gate 				bin = (bin + BIN_STEP) & page_colors_mask;
24397c478bd9Sstevel@tonic-gate 				bin_marker = bin;
24407c478bd9Sstevel@tonic-gate 			} else {
24417c478bd9Sstevel@tonic-gate 				bin = (bin + vac_colors) & page_colors_mask;
24427c478bd9Sstevel@tonic-gate 				if (bin == bin_marker) {
24437c478bd9Sstevel@tonic-gate 					bin = (bin + 1) & page_colors_mask;
24447c478bd9Sstevel@tonic-gate 					bin_marker = bin;
24457c478bd9Sstevel@tonic-gate 				}
24467c478bd9Sstevel@tonic-gate 			}
24477c478bd9Sstevel@tonic-gate 		} else {
24487c478bd9Sstevel@tonic-gate 			/*
24497c478bd9Sstevel@tonic-gate 			 * Large page case.
24507c478bd9Sstevel@tonic-gate 			 */
24517c478bd9Sstevel@tonic-gate 			bin = (bin + 1) & (colors - 1);
24527c478bd9Sstevel@tonic-gate 		}
24537c478bd9Sstevel@tonic-gate 		/*
24547c478bd9Sstevel@tonic-gate 		 * If bin advanced to the next color bin of the
24557c478bd9Sstevel@tonic-gate 		 * next larger pagesize, there is a chance the fill
24567c478bd9Sstevel@tonic-gate 		 * could succeed.
24577c478bd9Sstevel@tonic-gate 		 */
24587c478bd9Sstevel@tonic-gate 		if (fill_marker != (bin >> nszc_color_shift))
24597c478bd9Sstevel@tonic-gate 			fill_tried = 0;
24607c478bd9Sstevel@tonic-gate 	}
24617c478bd9Sstevel@tonic-gate 
2462affbd3ccSkchow 	/* if allowed, cycle through additional mtypes */
2463affbd3ccSkchow 	MTYPE_NEXT(mnode, mtype, flags);
2464affbd3ccSkchow 	if (mtype >= 0)
24657c478bd9Sstevel@tonic-gate 		goto big_try_again;
2466affbd3ccSkchow 
24677c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
24687c478bd9Sstevel@tonic-gate 
24697c478bd9Sstevel@tonic-gate 	return (NULL);
24707c478bd9Sstevel@tonic-gate }
24717c478bd9Sstevel@tonic-gate 
24727c478bd9Sstevel@tonic-gate 
24737c478bd9Sstevel@tonic-gate /*
24747c478bd9Sstevel@tonic-gate  * Returns the count of free pages for 'pp' with size code 'szc'.
24757c478bd9Sstevel@tonic-gate  * Note: This function does not return an exact value as the page freelist
24767c478bd9Sstevel@tonic-gate  * locks are not held and thus the values in the page_counters may be
24777c478bd9Sstevel@tonic-gate  * changing as we walk through the data.
24787c478bd9Sstevel@tonic-gate  */
24797c478bd9Sstevel@tonic-gate static int
24807c478bd9Sstevel@tonic-gate page_freecnt(int mnode, page_t *pp, uchar_t szc)
24817c478bd9Sstevel@tonic-gate {
24827c478bd9Sstevel@tonic-gate 	pgcnt_t	pgfree;
24837c478bd9Sstevel@tonic-gate 	pgcnt_t cnt;
24847c478bd9Sstevel@tonic-gate 	ssize_t	r = szc;	/* region size */
24857c478bd9Sstevel@tonic-gate 	ssize_t	idx;
24867c478bd9Sstevel@tonic-gate 	int	i;
24877c478bd9Sstevel@tonic-gate 	int	full, range;
24887c478bd9Sstevel@tonic-gate 
24897c478bd9Sstevel@tonic-gate 	/* Make sure pagenum passed in is aligned properly */
24907c478bd9Sstevel@tonic-gate 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
24917c478bd9Sstevel@tonic-gate 	ASSERT(szc > 0);
24927c478bd9Sstevel@tonic-gate 
24937c478bd9Sstevel@tonic-gate 	/* Prevent page_counters dynamic memory from being freed */
24947c478bd9Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
24957c478bd9Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
24967c478bd9Sstevel@tonic-gate 	cnt = PAGE_COUNTERS(mnode, r, idx);
24977c478bd9Sstevel@tonic-gate 	pgfree = cnt << PNUM_SHIFT(r - 1);
24987c478bd9Sstevel@tonic-gate 	range = FULL_REGION_CNT(szc);
24997c478bd9Sstevel@tonic-gate 
25007c478bd9Sstevel@tonic-gate 	/* Check for completely full region */
25017c478bd9Sstevel@tonic-gate 	if (cnt == range) {
25027c478bd9Sstevel@tonic-gate 		rw_exit(&page_ctrs_rwlock[mnode]);
25037c478bd9Sstevel@tonic-gate 		return (pgfree);
25047c478bd9Sstevel@tonic-gate 	}
25057c478bd9Sstevel@tonic-gate 
25067c478bd9Sstevel@tonic-gate 	while (--r > 0) {
25077c478bd9Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
25087c478bd9Sstevel@tonic-gate 		full = FULL_REGION_CNT(r);
25097c478bd9Sstevel@tonic-gate 		for (i = 0; i < range; i++, idx++) {
25107c478bd9Sstevel@tonic-gate 			cnt = PAGE_COUNTERS(mnode, r, idx);
25117c478bd9Sstevel@tonic-gate 			/*
25127c478bd9Sstevel@tonic-gate 			 * If cnt here is full, that means we have already
25137c478bd9Sstevel@tonic-gate 			 * accounted for these pages earlier.
25147c478bd9Sstevel@tonic-gate 			 */
25157c478bd9Sstevel@tonic-gate 			if (cnt != full) {
25167c478bd9Sstevel@tonic-gate 				pgfree += (cnt << PNUM_SHIFT(r - 1));
25177c478bd9Sstevel@tonic-gate 			}
25187c478bd9Sstevel@tonic-gate 		}
25197c478bd9Sstevel@tonic-gate 		range *= full;
25207c478bd9Sstevel@tonic-gate 	}
25217c478bd9Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
25227c478bd9Sstevel@tonic-gate 	return (pgfree);
25237c478bd9Sstevel@tonic-gate }
25247c478bd9Sstevel@tonic-gate 
25257c478bd9Sstevel@tonic-gate /*
25267c478bd9Sstevel@tonic-gate  * Called from page_geti_contig_pages to exclusively lock constituent pages
25277c478bd9Sstevel@tonic-gate  * starting from 'spp' for page size code 'szc'.
25287c478bd9Sstevel@tonic-gate  *
25297c478bd9Sstevel@tonic-gate  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
25307c478bd9Sstevel@tonic-gate  * region needs to be greater than or equal to the threshold.
25317c478bd9Sstevel@tonic-gate  */
25327c478bd9Sstevel@tonic-gate static int
25337c478bd9Sstevel@tonic-gate page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
25347c478bd9Sstevel@tonic-gate {
25357c478bd9Sstevel@tonic-gate 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
25367c478bd9Sstevel@tonic-gate 	pgcnt_t pgfree, i;
25377c478bd9Sstevel@tonic-gate 	page_t *pp;
25387c478bd9Sstevel@tonic-gate 
25397c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
25407c478bd9Sstevel@tonic-gate 
25417c478bd9Sstevel@tonic-gate 
25427c478bd9Sstevel@tonic-gate 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
25437c478bd9Sstevel@tonic-gate 		goto skipptcpcheck;
25447c478bd9Sstevel@tonic-gate 	/*
25457c478bd9Sstevel@tonic-gate 	 * check if there are sufficient free pages available before attempting
25467c478bd9Sstevel@tonic-gate 	 * to trylock. Count is approximate as page counters can change.
25477c478bd9Sstevel@tonic-gate 	 */
25487c478bd9Sstevel@tonic-gate 	pgfree = page_freecnt(mnode, spp, szc);
25497c478bd9Sstevel@tonic-gate 
25507c478bd9Sstevel@tonic-gate 	/* attempt to trylock if there are sufficient already free pages */
25517c478bd9Sstevel@tonic-gate 	if (pgfree < pgcnt/ptcpthreshold) {
25527c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
25537c478bd9Sstevel@tonic-gate 		return (0);
25547c478bd9Sstevel@tonic-gate 	}
25557c478bd9Sstevel@tonic-gate 
25567c478bd9Sstevel@tonic-gate skipptcpcheck:
25577c478bd9Sstevel@tonic-gate 
25587c478bd9Sstevel@tonic-gate 	for (i = 0; i < pgcnt; i++) {
25597c478bd9Sstevel@tonic-gate 		pp = &spp[i];
25607c478bd9Sstevel@tonic-gate 		if (!page_trylock(pp, SE_EXCL)) {
25617c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
25627c478bd9Sstevel@tonic-gate 			while (--i != (pgcnt_t)-1) {
25637c478bd9Sstevel@tonic-gate 				pp = &spp[i];
25647c478bd9Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
2565db874c57Selowe 				page_unlock_noretire(pp);
25667c478bd9Sstevel@tonic-gate 			}
25677c478bd9Sstevel@tonic-gate 			return (0);
25687c478bd9Sstevel@tonic-gate 		}
25697c478bd9Sstevel@tonic-gate 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
25707c478bd9Sstevel@tonic-gate 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
25717c478bd9Sstevel@tonic-gate 		    !PP_ISFREE(pp)) {
25727c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
25737c478bd9Sstevel@tonic-gate 			ASSERT(i == 0);
2574db874c57Selowe 			page_unlock_noretire(pp);
25757c478bd9Sstevel@tonic-gate 			return (0);
25767c478bd9Sstevel@tonic-gate 		}
25777c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(pp)) {
25787c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
25797c478bd9Sstevel@tonic-gate 			while (i != (pgcnt_t)-1) {
25807c478bd9Sstevel@tonic-gate 				pp = &spp[i];
25817c478bd9Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
2582db874c57Selowe 				page_unlock_noretire(pp);
25837c478bd9Sstevel@tonic-gate 				i--;
25847c478bd9Sstevel@tonic-gate 			}
25857c478bd9Sstevel@tonic-gate 			return (0);
25867c478bd9Sstevel@tonic-gate 		}
25877c478bd9Sstevel@tonic-gate 	}
25887c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
25897c478bd9Sstevel@tonic-gate 	return (1);
25907c478bd9Sstevel@tonic-gate }
25917c478bd9Sstevel@tonic-gate 
/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 *
 * Returns the list of claimed constituent pages on success.  On failure
 * returns NULL: the unprocessed constituents are unlocked and the
 * already-processed ones are put back on the free list at size 0.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;	/* replacements for in-use constituents */
	page_t *pplist = NULL;	/* accumulates the claimed constituents */

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					/* entire region was already free */
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				/* retag constituents with the target szc */
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/* free cachelist page: hash it out and claim it */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		/* page is in use; it will have to be relocated */
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_noretire(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				/* restore base size before freeing */
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_noretire(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/*
		 * NOTE: the assignment below only happens on DEBUG
		 * kernels (ASSERT compiles away otherwise); hpp is
		 * likewise only read inside an ASSERT further down.
		 */
		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Walk the relocated targets on 'targpp', marking each
		 * free at size 'szc', and unlock the corresponding
		 * replacement page pulled off 'replpp'.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_noretire(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}
27337c478bd9Sstevel@tonic-gate 
27347c478bd9Sstevel@tonic-gate /*
27357c478bd9Sstevel@tonic-gate  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
27367c478bd9Sstevel@tonic-gate  * of 0 means nothing left after trim.
27377c478bd9Sstevel@tonic-gate  */
27387c478bd9Sstevel@tonic-gate 
27397c478bd9Sstevel@tonic-gate int
27407c478bd9Sstevel@tonic-gate trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
27417c478bd9Sstevel@tonic-gate {
27427c478bd9Sstevel@tonic-gate 	pfn_t	kcagepfn;
27437c478bd9Sstevel@tonic-gate 	int	decr;
27447c478bd9Sstevel@tonic-gate 	int	rc = 0;
27457c478bd9Sstevel@tonic-gate 
27467c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(mseg->pages)) {
27477c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
27487c478bd9Sstevel@tonic-gate 
27497c478bd9Sstevel@tonic-gate 			/* lower part of this mseg inside kernel cage */
27507c478bd9Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
27517c478bd9Sstevel@tonic-gate 
27527c478bd9Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
27537c478bd9Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
27547c478bd9Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
27557c478bd9Sstevel@tonic-gate 				ASSERT(decr == 0);
27567c478bd9Sstevel@tonic-gate 				*lo = kcagepfn;
27577c478bd9Sstevel@tonic-gate 				*hi = MIN(pfnhi,
27587c478bd9Sstevel@tonic-gate 				    (mseg->pages_end - 1));
27597c478bd9Sstevel@tonic-gate 				rc = 1;
27607c478bd9Sstevel@tonic-gate 			}
27617c478bd9Sstevel@tonic-gate 		}
27627c478bd9Sstevel@tonic-gate 		/* else entire mseg in the cage */
27637c478bd9Sstevel@tonic-gate 	} else {
27647c478bd9Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1)) {
27657c478bd9Sstevel@tonic-gate 
27667c478bd9Sstevel@tonic-gate 			/* upper part of this mseg inside kernel cage */
27677c478bd9Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
27687c478bd9Sstevel@tonic-gate 
27697c478bd9Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
27707c478bd9Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
27717c478bd9Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
27727c478bd9Sstevel@tonic-gate 				ASSERT(decr);
27737c478bd9Sstevel@tonic-gate 				*hi = kcagepfn;
27747c478bd9Sstevel@tonic-gate 				*lo = MAX(pfnlo, mseg->pages_base);
27757c478bd9Sstevel@tonic-gate 				rc = 1;
27767c478bd9Sstevel@tonic-gate 			}
27777c478bd9Sstevel@tonic-gate 		} else {
27787c478bd9Sstevel@tonic-gate 			/* entire mseg outside of kernel cage */
27797c478bd9Sstevel@tonic-gate 			*lo = MAX(pfnlo, mseg->pages_base);
27807c478bd9Sstevel@tonic-gate 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
27817c478bd9Sstevel@tonic-gate 			rc = 1;
27827c478bd9Sstevel@tonic-gate 		}
27837c478bd9Sstevel@tonic-gate 	}
27847c478bd9Sstevel@tonic-gate 	return (rc);
27857c478bd9Sstevel@tonic-gate }
27867c478bd9Sstevel@tonic-gate 
27877c478bd9Sstevel@tonic-gate /*
27887c478bd9Sstevel@tonic-gate  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
27897c478bd9Sstevel@tonic-gate  * page with size code 'szc'. Claiming such a page requires acquiring
27907c478bd9Sstevel@tonic-gate  * exclusive locks on all constituent pages (page_trylock_contig_pages),
27917c478bd9Sstevel@tonic-gate  * relocating pages in use and concatenating these constituent pages into a
27927c478bd9Sstevel@tonic-gate  * large page.
27937c478bd9Sstevel@tonic-gate  *
27947c478bd9Sstevel@tonic-gate  * The page lists do not have such a large page and page_freelist_fill has
27957c478bd9Sstevel@tonic-gate  * already failed to demote larger pages and/or coalesce smaller free pages.
27967c478bd9Sstevel@tonic-gate  *
 * 'flags' may specify PG_MATCH_COLOR which would limit the search of large
 * pages with the same color as 'bin'.
27997c478bd9Sstevel@tonic-gate  *
28007c478bd9Sstevel@tonic-gate  * 'pfnflag' specifies the subset of the pfn range to search.
28017c478bd9Sstevel@tonic-gate  */
28027c478bd9Sstevel@tonic-gate 
28037c478bd9Sstevel@tonic-gate 
28047c478bd9Sstevel@tonic-gate static page_t *
28057c478bd9Sstevel@tonic-gate page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
280683f9b804Skchow     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
28077c478bd9Sstevel@tonic-gate {
28087c478bd9Sstevel@tonic-gate 	struct memseg *mseg;
28097c478bd9Sstevel@tonic-gate 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
28107c478bd9Sstevel@tonic-gate 	pgcnt_t szcpgmask = szcpgcnt - 1;
28117c478bd9Sstevel@tonic-gate 	pfn_t	randpfn;
28127c478bd9Sstevel@tonic-gate 	page_t *pp, *randpp, *endpp;
28137c478bd9Sstevel@tonic-gate 	uint_t colors;
28147c478bd9Sstevel@tonic-gate 	pfn_t hi, lo;
28157c478bd9Sstevel@tonic-gate 	uint_t skip;
28167c478bd9Sstevel@tonic-gate 
28177c478bd9Sstevel@tonic-gate 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
28187c478bd9Sstevel@tonic-gate 
28197c478bd9Sstevel@tonic-gate 	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
28207c478bd9Sstevel@tonic-gate 		return (NULL);
28217c478bd9Sstevel@tonic-gate 
28227c478bd9Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
28237c478bd9Sstevel@tonic-gate 
28247c478bd9Sstevel@tonic-gate 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
28257c478bd9Sstevel@tonic-gate 	    page_colors;
28267c478bd9Sstevel@tonic-gate 
28277c478bd9Sstevel@tonic-gate 	ASSERT(bin < colors);
28287c478bd9Sstevel@tonic-gate 
28297c478bd9Sstevel@tonic-gate 	/*
28307c478bd9Sstevel@tonic-gate 	 * trim the pfn range to search based on pfnflag. pfnflag is set
28317c478bd9Sstevel@tonic-gate 	 * when there have been previous page_get_contig_page failures to
28327c478bd9Sstevel@tonic-gate 	 * limit the search.
28337c478bd9Sstevel@tonic-gate 	 *
28347c478bd9Sstevel@tonic-gate 	 * The high bit in pfnflag specifies the number of 'slots' in the
28357c478bd9Sstevel@tonic-gate 	 * pfn range and the remainder of pfnflag specifies which slot.
28367c478bd9Sstevel@tonic-gate 	 * For example, a value of 1010b would mean the second slot of
28377c478bd9Sstevel@tonic-gate 	 * the pfn range that has been divided into 8 slots.
28387c478bd9Sstevel@tonic-gate 	 */
28397c478bd9Sstevel@tonic-gate 	if (pfnflag > 1) {
28407c478bd9Sstevel@tonic-gate 		int	slots = 1 << (highbit(pfnflag) - 1);
28417c478bd9Sstevel@tonic-gate 		int	slotid = pfnflag & (slots - 1);
28427c478bd9Sstevel@tonic-gate 		pgcnt_t	szcpages;
28437c478bd9Sstevel@tonic-gate 		int	slotlen;
28447c478bd9Sstevel@tonic-gate 
28457c478bd9Sstevel@tonic-gate 		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
28467c478bd9Sstevel@tonic-gate 		pfnhi = pfnhi & ~(szcpgcnt - 1);
28477c478bd9Sstevel@tonic-gate 
28487c478bd9Sstevel@tonic-gate 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
28497c478bd9Sstevel@tonic-gate 		slotlen = howmany(szcpages, slots);
28507c478bd9Sstevel@tonic-gate 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
28517c478bd9Sstevel@tonic-gate 		ASSERT(pfnlo < pfnhi);
28527c478bd9Sstevel@tonic-gate 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
28537c478bd9Sstevel@tonic-gate 			pfnhi = pfnlo + (slotlen * szcpgcnt);
28547c478bd9Sstevel@tonic-gate 	}
28557c478bd9Sstevel@tonic-gate 
28567c478bd9Sstevel@tonic-gate 	memsegs_lock(0);
28577c478bd9Sstevel@tonic-gate 
28587c478bd9Sstevel@tonic-gate 	/*
28597c478bd9Sstevel@tonic-gate 	 * loop through memsegs to look for contig page candidates
28607c478bd9Sstevel@tonic-gate 	 */
28617c478bd9Sstevel@tonic-gate 
28627c478bd9Sstevel@tonic-gate 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
28637c478bd9Sstevel@tonic-gate 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
28647c478bd9Sstevel@tonic-gate 			/* no overlap */
28657c478bd9Sstevel@tonic-gate 			continue;
28667c478bd9Sstevel@tonic-gate 		}
28677c478bd9Sstevel@tonic-gate 
28687c478bd9Sstevel@tonic-gate 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
28697c478bd9Sstevel@tonic-gate 			/* mseg too small */
28707c478bd9Sstevel@tonic-gate 			continue;
28717c478bd9Sstevel@tonic-gate 
28727c478bd9Sstevel@tonic-gate 		/* trim off kernel cage pages from pfn range */
28737c478bd9Sstevel@tonic-gate 		if (kcage_on) {
28747c478bd9Sstevel@tonic-gate 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
28757c478bd9Sstevel@tonic-gate 				continue;
28767c478bd9Sstevel@tonic-gate 		} else {
28777c478bd9Sstevel@tonic-gate 			lo = MAX(pfnlo, mseg->pages_base);
28787c478bd9Sstevel@tonic-gate 			hi = MIN(pfnhi, (mseg->pages_end - 1));
28797c478bd9Sstevel@tonic-gate 		}
28807c478bd9Sstevel@tonic-gate 
28817c478bd9Sstevel@tonic-gate 		/* round to szcpgcnt boundaries */
28827c478bd9Sstevel@tonic-gate 		lo = P2ROUNDUP(lo, szcpgcnt);
28837c478bd9Sstevel@tonic-gate 		hi = hi & ~(szcpgcnt - 1);
28847c478bd9Sstevel@tonic-gate 
28857c478bd9Sstevel@tonic-gate 		if (hi <= lo)
28867c478bd9Sstevel@tonic-gate 			continue;
28877c478bd9Sstevel@tonic-gate 
28887c478bd9Sstevel@tonic-gate 		/*
28897c478bd9Sstevel@tonic-gate 		 * set lo to point to the pfn for the desired bin. Large
28907c478bd9Sstevel@tonic-gate 		 * page sizes may only have a single page color
28917c478bd9Sstevel@tonic-gate 		 */
28927c478bd9Sstevel@tonic-gate 		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
28937c478bd9Sstevel@tonic-gate 			uint_t	lobin;
28947c478bd9Sstevel@tonic-gate 
28957c478bd9Sstevel@tonic-gate 			/*
28967c478bd9Sstevel@tonic-gate 			 * factor in colorequiv to check additional
28977c478bd9Sstevel@tonic-gate 			 * 'equivalent' bins.
28987c478bd9Sstevel@tonic-gate 			 */
28997c478bd9Sstevel@tonic-gate 			if (colorequiv > 1 && colors > colorequiv)
29007c478bd9Sstevel@tonic-gate 				colors = colors / colorequiv;
29017c478bd9Sstevel@tonic-gate 
29027c478bd9Sstevel@tonic-gate 			/* determine bin that lo currently points to */
29037c478bd9Sstevel@tonic-gate 			lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;
29047c478bd9Sstevel@tonic-gate 
29057c478bd9Sstevel@tonic-gate 			/*
29067c478bd9Sstevel@tonic-gate 			 * set lo to point at appropriate color and set skip
29077c478bd9Sstevel@tonic-gate 			 * to arrive at the next szc page of the same color.
29087c478bd9Sstevel@tonic-gate 			 */
29097c478bd9Sstevel@tonic-gate 			lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;
29107c478bd9Sstevel@tonic-gate 
29117c478bd9Sstevel@tonic-gate 			skip = colors * szcpgcnt;
29127c478bd9Sstevel@tonic-gate 		} else {
29137c478bd9Sstevel@tonic-gate 			/* check all pages starting from lo */
29147c478bd9Sstevel@tonic-gate 			skip = szcpgcnt;
29157c478bd9Sstevel@tonic-gate 		}
29167c478bd9Sstevel@tonic-gate 		if (hi <= lo)
29177c478bd9Sstevel@tonic-gate 			/* mseg cannot satisfy color request */
29187c478bd9Sstevel@tonic-gate 			continue;
29197c478bd9Sstevel@tonic-gate 
29207c478bd9Sstevel@tonic-gate 		/* randomly choose a point between lo and hi to begin search */
29217c478bd9Sstevel@tonic-gate 
29227c478bd9Sstevel@tonic-gate 		randpfn = (pfn_t)GETTICK();
29237c478bd9Sstevel@tonic-gate 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
29247c478bd9Sstevel@tonic-gate 		randpp = mseg->pages + (randpfn - mseg->pages_base);
29257c478bd9Sstevel@tonic-gate 
29267c478bd9Sstevel@tonic-gate 		ASSERT(randpp->p_pagenum == randpfn);
29277c478bd9Sstevel@tonic-gate 
29287c478bd9Sstevel@tonic-gate 		pp = randpp;
29297c478bd9Sstevel@tonic-gate 		endpp =  mseg->pages + (hi - mseg->pages_base);
29307c478bd9Sstevel@tonic-gate 
29317c478bd9Sstevel@tonic-gate 		ASSERT(randpp + szcpgcnt <= endpp);
29327c478bd9Sstevel@tonic-gate 
29337c478bd9Sstevel@tonic-gate 		do {
29347c478bd9Sstevel@tonic-gate 			ASSERT(!(pp->p_pagenum & szcpgmask));
29357c478bd9Sstevel@tonic-gate 			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
29367c478bd9Sstevel@tonic-gate 			    colorequiv > 1 ||
29377c478bd9Sstevel@tonic-gate 			    PP_2_BIN(pp) == bin);
29387c478bd9Sstevel@tonic-gate 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
29397c478bd9Sstevel@tonic-gate 				/* pages unlocked by page_claim on failure */
29407c478bd9Sstevel@tonic-gate 				if (page_claim_contig_pages(pp, szc, flags)) {
29417c478bd9Sstevel@tonic-gate 					memsegs_unlock(0);
29427c478bd9Sstevel@tonic-gate 					return (pp);
29437c478bd9Sstevel@tonic-gate 				}
29447c478bd9Sstevel@tonic-gate 			}
29457c478bd9Sstevel@tonic-gate 
29467c478bd9Sstevel@tonic-gate 			pp += skip;
29477c478bd9Sstevel@tonic-gate 			if (pp >= endpp) {
29487c478bd9Sstevel@tonic-gate 				/* start from the beginning */
29497c478bd9Sstevel@tonic-gate 				pp = mseg->pages + (lo - mseg->pages_base);
29507c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_pagenum == lo);
29517c478bd9Sstevel@tonic-gate 				ASSERT(pp + szcpgcnt <= endpp);
29527c478bd9Sstevel@tonic-gate 			}
29537c478bd9Sstevel@tonic-gate 		} while (pp != randpp);
29547c478bd9Sstevel@tonic-gate 	}
29557c478bd9Sstevel@tonic-gate 	memsegs_unlock(0);
29567c478bd9Sstevel@tonic-gate 	return (NULL);
29577c478bd9Sstevel@tonic-gate }
29587c478bd9Sstevel@tonic-gate 
29597c478bd9Sstevel@tonic-gate 
29607c478bd9Sstevel@tonic-gate /*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters from the page free lists.
29647c478bd9Sstevel@tonic-gate  *
29657c478bd9Sstevel@tonic-gate  * calls page_geti_contig_pages with an initial pfn range from the mnode
29667c478bd9Sstevel@tonic-gate  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
29677c478bd9Sstevel@tonic-gate  * that overlaps with the kernel cage or does not match the requested page
29687c478bd9Sstevel@tonic-gate  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
29697c478bd9Sstevel@tonic-gate  * page_geti_contig_pages may further limit the search range based on
29707c478bd9Sstevel@tonic-gate  * previous failure counts (pgcpfailcnt[]).
29717c478bd9Sstevel@tonic-gate  *
29727c478bd9Sstevel@tonic-gate  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
29737c478bd9Sstevel@tonic-gate  * pagesize page that satisfies mtype.
29747c478bd9Sstevel@tonic-gate  */
29757c478bd9Sstevel@tonic-gate page_t *
29767c478bd9Sstevel@tonic-gate page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
29777c478bd9Sstevel@tonic-gate     uint_t flags)
29787c478bd9Sstevel@tonic-gate {
29797c478bd9Sstevel@tonic-gate 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
29807c478bd9Sstevel@tonic-gate 	page_t		*pp;
298183f9b804Skchow 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
29827c478bd9Sstevel@tonic-gate 
29837c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
29847c478bd9Sstevel@tonic-gate 
2985*0b5aa17bSmec 	/* no allocations from cage */
2986*0b5aa17bSmec 	flags |= PGI_NOCAGE;
2987*0b5aa17bSmec 
29887c478bd9Sstevel@tonic-gate 	/* LINTED */
29897c478bd9Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
29907c478bd9Sstevel@tonic-gate 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
29917c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
29927c478bd9Sstevel@tonic-gate 		return (NULL);
29937c478bd9Sstevel@tonic-gate 	}
29947c478bd9Sstevel@tonic-gate 
29957c478bd9Sstevel@tonic-gate 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
29967c478bd9Sstevel@tonic-gate 
29977c478bd9Sstevel@tonic-gate 	/* do not limit search and ignore color if hi pri */
29987c478bd9Sstevel@tonic-gate 
29997c478bd9Sstevel@tonic-gate 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
30007c478bd9Sstevel@tonic-gate 		pfnflag = pgcpfailcnt[szc];
30017c478bd9Sstevel@tonic-gate 
30027c478bd9Sstevel@tonic-gate 	/* remove color match to improve chances */
30037c478bd9Sstevel@tonic-gate 
30047c478bd9Sstevel@tonic-gate 	if (flags & PGI_PGCPHIPRI || pfnflag)
30057c478bd9Sstevel@tonic-gate 		flags &= ~PG_MATCH_COLOR;
30067c478bd9Sstevel@tonic-gate 
30077c478bd9Sstevel@tonic-gate 	do {
30087c478bd9Sstevel@tonic-gate 		/* get pfn range based on mnode and mtype */
30097c478bd9Sstevel@tonic-gate 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
30107c478bd9Sstevel@tonic-gate 
30117c478bd9Sstevel@tonic-gate 		ASSERT(pfnhi >= pfnlo);
30127c478bd9Sstevel@tonic-gate 
30137c478bd9Sstevel@tonic-gate 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
30147c478bd9Sstevel@tonic-gate 		    pfnlo, pfnhi, pfnflag);
30157c478bd9Sstevel@tonic-gate 
30167c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
30177c478bd9Sstevel@tonic-gate 			pfnflag = pgcpfailcnt[szc];
30187c478bd9Sstevel@tonic-gate 			if (pfnflag) {
30197c478bd9Sstevel@tonic-gate 				/* double the search size */
30207c478bd9Sstevel@tonic-gate 				pgcpfailcnt[szc] = pfnflag >> 1;
30217c478bd9Sstevel@tonic-gate 			}
30227c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
30237c478bd9Sstevel@tonic-gate 			return (pp);
30247c478bd9Sstevel@tonic-gate 		}
3025affbd3ccSkchow 		MTYPE_NEXT(mnode, mtype, flags);
3026affbd3ccSkchow 	} while (mtype >= 0);
30277c478bd9Sstevel@tonic-gate 
30287c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
30297c478bd9Sstevel@tonic-gate 	return (NULL);
30307c478bd9Sstevel@tonic-gate }
30317c478bd9Sstevel@tonic-gate 
30327c478bd9Sstevel@tonic-gate 
30337c478bd9Sstevel@tonic-gate /*
30347c478bd9Sstevel@tonic-gate  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
30357c478bd9Sstevel@tonic-gate  *
30367c478bd9Sstevel@tonic-gate  * Does its own locking and accounting.
30377c478bd9Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
30387c478bd9Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
30397c478bd9Sstevel@tonic-gate  *
30407c478bd9Sstevel@tonic-gate  * Finds a page, removes it, THEN locks it.
30417c478bd9Sstevel@tonic-gate  */
30427c478bd9Sstevel@tonic-gate 
/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*pp = NULL;
	ulong_t		bin;		/* page color bin for (as, vaddr) */
	uchar_t		szc;		/* page size code derived from size */
	int		mnode;
	int		mtype;
	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * Start with the normal per-mnode freelist allocator; on total
	 * failure we may retry below with page_get_contig_pages, which
	 * claims a run of contiguous pages directly from physical memory.
	 */
	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (kcage_on) {
		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
		    kcage_freemem < kcage_throttlefree + btop(size) &&
		    curthread != kcage_cageout_thread) {
			/*
			 * Set a "reserve" of kcage_throttlefree pages for
			 * PG_PANIC and cageout thread allocations.
			 *
			 * Everybody else has to serialize in
			 * page_create_get_something() to get a cage page, so
			 * that we don't deadlock cageout!
			 */
			return (NULL);
		}
	} else {
		/* cage disabled: relocatable pages only, skip cage checks */
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, size);

	/*
	 * Convert size to page size code.  An unsupported size is a
	 * caller bug, hence the panic rather than a NULL return.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/* bin is for base pagesize color - convert if larger pagesize. */
	if (szc)
		bin = page_convert_color(0, szc, bin);

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists.  Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try to get a non-local freelist page.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
			return (pp);
		}
	}

	ASSERT(pp == NULL);

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default.  this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		/* retry the whole lgroup walk with the contig allocator */
		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	/* record the failure so future contig searches are bounded */
	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
		SETPGCPFAILCNT(szc);

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}
31807c478bd9Sstevel@tonic-gate 
31817c478bd9Sstevel@tonic-gate /*
31827c478bd9Sstevel@tonic-gate  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
31837c478bd9Sstevel@tonic-gate  *
31847c478bd9Sstevel@tonic-gate  * Does its own locking.
31857c478bd9Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
31867c478bd9Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
31877c478bd9Sstevel@tonic-gate  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
31887c478bd9Sstevel@tonic-gate  * try to lock one of them.  If no page can be locked, try the
31897c478bd9Sstevel@tonic-gate  * next bin.  Return NULL if a page can not be found and locked.
31907c478bd9Sstevel@tonic-gate  *
 * Finds a page, tries to lock it, then removes it.
31927c478bd9Sstevel@tonic-gate  */
31937c478bd9Sstevel@tonic-gate 
31947c478bd9Sstevel@tonic-gate /*ARGSUSED*/
31957c478bd9Sstevel@tonic-gate page_t *
31967c478bd9Sstevel@tonic-gate page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
31977c478bd9Sstevel@tonic-gate     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
31987c478bd9Sstevel@tonic-gate {
31997c478bd9Sstevel@tonic-gate 	page_t		*pp;
32007c478bd9Sstevel@tonic-gate 	struct as	*as = seg->s_as;
32017c478bd9Sstevel@tonic-gate 	ulong_t		bin;
32027c478bd9Sstevel@tonic-gate 	/*LINTED*/
32037c478bd9Sstevel@tonic-gate 	int		mnode;
32047c478bd9Sstevel@tonic-gate 	int		mtype;
32057c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
32067c478bd9Sstevel@tonic-gate 
32077c478bd9Sstevel@tonic-gate 	/*
32087c478bd9Sstevel@tonic-gate 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
32097c478bd9Sstevel@tonic-gate 	 * assume we wish to allocate near to the current thread's home.
32107c478bd9Sstevel@tonic-gate 	 */
32117c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
32127c478bd9Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
32137c478bd9Sstevel@tonic-gate 
32147c478bd9Sstevel@tonic-gate 	if (!kcage_on) {
32157c478bd9Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
32167c478bd9Sstevel@tonic-gate 		flags |= PGI_NOCAGE;
32177c478bd9Sstevel@tonic-gate 	}
32187c478bd9Sstevel@tonic-gate 
32197c478bd9Sstevel@tonic-gate 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
32207c478bd9Sstevel@tonic-gate 	    kcage_freemem <= kcage_throttlefree) {
32217c478bd9Sstevel@tonic-gate 		/*
32227c478bd9Sstevel@tonic-gate 		 * Reserve kcage_throttlefree pages for critical kernel
32237c478bd9Sstevel@tonic-gate 		 * threads.
32247c478bd9Sstevel@tonic-gate 		 *
32257c478bd9Sstevel@tonic-gate 		 * Everybody else has to go to page_create_get_something()
32267c478bd9Sstevel@tonic-gate 		 * to get a cage page, so we don't deadlock cageout.
32277c478bd9Sstevel@tonic-gate 		 */
32287c478bd9Sstevel@tonic-gate 		return (NULL);
32297c478bd9Sstevel@tonic-gate 	}
32307c478bd9Sstevel@tonic-gate 
32317c478bd9Sstevel@tonic-gate 	/* LINTED */
32327c478bd9Sstevel@tonic-gate 	AS_2_BIN(as, seg, vp, vaddr, bin);
32337c478bd9Sstevel@tonic-gate 
32347c478bd9Sstevel@tonic-gate 	ASSERT(bin <= page_colors_mask);
32357c478bd9Sstevel@tonic-gate 
32367c478bd9Sstevel@tonic-gate 	/* LINTED */
323707ad560dSkchow 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
32387c478bd9Sstevel@tonic-gate 
32397c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
32407c478bd9Sstevel@tonic-gate 
32417c478bd9Sstevel@tonic-gate 	/*
32427c478bd9Sstevel@tonic-gate 	 * Try local cachelists first
32437c478bd9Sstevel@tonic-gate 	 */
32447c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
32457c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
32467c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
32477c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
32487c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
32497c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
32507c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
32517c478bd9Sstevel@tonic-gate 			    int, mnode,
32527c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
32537c478bd9Sstevel@tonic-gate 			    uint_t, flags);
32547c478bd9Sstevel@tonic-gate 			return (pp);
32557c478bd9Sstevel@tonic-gate 		}
32567c478bd9Sstevel@tonic-gate 	}
32577c478bd9Sstevel@tonic-gate 
32587c478bd9Sstevel@tonic-gate 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
32597c478bd9Sstevel@tonic-gate 
32607c478bd9Sstevel@tonic-gate 	/*
32617c478bd9Sstevel@tonic-gate 	 * Try freelists/cachelists that are farther away
32627c478bd9Sstevel@tonic-gate 	 * This is our only chance to allocate remote pages for PAGESIZE
32637c478bd9Sstevel@tonic-gate 	 * requests.
32647c478bd9Sstevel@tonic-gate 	 */
32657c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
32667c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
32677c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_freelist(mnode, bin, mtype,
32687c478bd9Sstevel@tonic-gate 		    0, flags);
32697c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
32707c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
32717c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
32727c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
32737c478bd9Sstevel@tonic-gate 			    int, mnode,
32747c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
32757c478bd9Sstevel@tonic-gate 			    uint_t, flags);
32767c478bd9Sstevel@tonic-gate 			return (pp);
32777c478bd9Sstevel@tonic-gate 		}
32787c478bd9Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
32797c478bd9Sstevel@tonic-gate 		if (pp != NULL) {
32807c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
32817c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
32827c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
32837c478bd9Sstevel@tonic-gate 			    int, mnode,
32847c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
32857c478bd9Sstevel@tonic-gate 			    uint_t, flags);
32867c478bd9Sstevel@tonic-gate 			return (pp);
32877c478bd9Sstevel@tonic-gate 		}
32887c478bd9Sstevel@tonic-gate 	}
32897c478bd9Sstevel@tonic-gate 
32907c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
32917c478bd9Sstevel@tonic-gate 	return (NULL);
32927c478bd9Sstevel@tonic-gate }
32937c478bd9Sstevel@tonic-gate 
/*
 * Search the cachelists of memory node mnode for a base-pagesize page of
 * the requested color bin (and mtype range).  Walks successive color bins
 * if the initial one is empty or fully locked, then advances to the next
 * mtype via MTYPE_NEXT.  Returns an SE_EXCL-locked page removed from its
 * cachelist, or NULL if none could be found and locked.
 */
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;	/* list start: detects full circuit */
	uint_t		bin_marker;	/* detects wrap of the bin walk */
	int		nwaybins, nwaycnt;
	int		cpucolors;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

	nwaybins = 0;
	cpucolors = cpu_page_colors;
	/*
	 * adjust cpucolors to possibly check additional 'equivalent' bins
	 * to try to minimize fragmentation of large pages by delaying calls
	 * to page_freelist_fill.
	 */
	if (colorequiv > 1) {
		int equivcolors = page_colors / colorequiv;

		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
			cpucolors = equivcolors;
	}

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

big_try_again:
	nwaycnt = 0;
	for (i = 0; i <= page_colors; i++) {
		/* unlocked peek first; recheck under the bin mutex below */
		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp != NULL) {
				first_pp = pp;
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* walk the circular list for a lockable page */
				while (!page_trylock(pp, SE_EXCL)) {
					pp = pp->p_next;
					ASSERT(pp->p_szc == 0);
					if (pp == first_pp) {
						/*
						 * We have searched the
						 * complete list!
						 * And all of them (might
						 * only be one) are locked.
						 * This can happen since
						 * these pages can also be
						 * found via the hash list.
						 * When found via the hash
						 * list, they are locked
						 * first, then removed.
						 * We give up to let the
						 * other thread run.
						 */
						pp = NULL;
						break;
					}
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISFREE(pp));
					ASSERT(PP_ISAGED(pp) == 0);
					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
							mnode);
				}

				if (pp) {
					page_t	**ppp;
					/*
					 * Found and locked a page.
					 * Pull it off the list.
					 */
					ASSERT(mtype == PP_2_MTYPE(pp));
					ppp = &PAGE_CACHELISTS(mnode, bin,
					    mtype);
					page_sub(ppp, pp);
					/*
					 * Subtract counters before releasing
					 * pcm mutex to avoid a race with
					 * page_freelist_coalesce and
					 * page_freelist_fill.
					 */
					page_ctr_sub(mnode, mtype, pp,
					    PG_CACHE_LIST);
					mutex_exit(pcm);
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
					ASSERT(!kcage_on ||
					    (flags & PG_NORELOC) == 0 ||
					    PP_ISNORELOC(pp));
					if (PP_ISNORELOC(pp)) {
						kcage_freemem_sub(1);
					}
#endif
					VM_STAT_ADD(vmm_vmstats.
					    pgmc_allocok);
					return (pp);
				}
			}
			mutex_exit(pcm);
		}

		/*
		 * Wow! The initial bin is empty or no page in the bin could
		 * be locked.
		 *
		 * If specific color is needed, check if page color may be in
		 * other bins.
		 */
		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
			if (!nwaybins) {
				/* lazily compute equivalent-bin stride */
				if (cpucolors < 0) {
					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
					ASSERT(cpucolors > 0);
					nwaybins = page_colors / cpucolors;
					if (nwaybins < 2)
						cpucolors = 0;
				} else {
					nwaybins = page_colors / cpucolors;
					ASSERT(nwaybins > 1);
				}
			}

			if (++nwaycnt >= nwaybins) {
				/* all equivalent bins exhausted */
				break;
			}
			bin = (bin + (page_colors / nwaybins)) &
			    page_colors_mask;
			continue;
		}

		/* no color constraint: hop to another bin and keep scanning */
		if (i == 0) {
			bin = (bin + BIN_STEP) & page_colors_mask;
			bin_marker = bin;
		} else {
			bin = (bin + vac_colors) & page_colors_mask;
			if (bin == bin_marker) {
				bin = (bin + 1) & page_colors_mask;
				bin_marker = bin;
			}
		}
	}

	/* nothing in this mtype range; try the next one, if any */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto big_try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}
34607c478bd9Sstevel@tonic-gate 
34617c478bd9Sstevel@tonic-gate #ifdef DEBUG
34627c478bd9Sstevel@tonic-gate #define	REPL_PAGE_STATS
34637c478bd9Sstevel@tonic-gate #endif /* DEBUG */
34647c478bd9Sstevel@tonic-gate 
34657c478bd9Sstevel@tonic-gate #ifdef REPL_PAGE_STATS
34667c478bd9Sstevel@tonic-gate struct repl_page_stats {
34677c478bd9Sstevel@tonic-gate 	uint_t	ngets;
34687c478bd9Sstevel@tonic-gate 	uint_t	ngets_noreloc;
34697c478bd9Sstevel@tonic-gate 	uint_t	npgr_noreloc;
34707c478bd9Sstevel@tonic-gate 	uint_t	nnopage_first;
34717c478bd9Sstevel@tonic-gate 	uint_t	nnopage;
34727c478bd9Sstevel@tonic-gate 	uint_t	nhashout;
34737c478bd9Sstevel@tonic-gate 	uint_t	nnofree;
34747c478bd9Sstevel@tonic-gate 	uint_t	nnext_pp;
34757c478bd9Sstevel@tonic-gate } repl_page_stats;
34767c478bd9Sstevel@tonic-gate #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
34777c478bd9Sstevel@tonic-gate #else /* REPL_PAGE_STATS */
34787c478bd9Sstevel@tonic-gate #define	REPL_STAT_INCR(v)
34797c478bd9Sstevel@tonic-gate #endif /* REPL_PAGE_STATS */
34807c478bd9Sstevel@tonic-gate 
34817c478bd9Sstevel@tonic-gate int	pgrppgcp;
34827c478bd9Sstevel@tonic-gate 
34837c478bd9Sstevel@tonic-gate /*
34847c478bd9Sstevel@tonic-gate  * The freemem accounting must be done by the caller.
34857c478bd9Sstevel@tonic-gate  * First we try to get a replacement page of the same size as like_pp,
34867c478bd9Sstevel@tonic-gate  * if that is not possible, then we just get a set of discontiguous
34877c478bd9Sstevel@tonic-gate  * PAGESIZE pages.
34887c478bd9Sstevel@tonic-gate  */
34897c478bd9Sstevel@tonic-gate page_t *
34902dae3fb5Sjjc page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
34917c478bd9Sstevel@tonic-gate     uint_t pgrflags)
34927c478bd9Sstevel@tonic-gate {
34937c478bd9Sstevel@tonic-gate 	page_t		*like_pp;
34947c478bd9Sstevel@tonic-gate 	page_t		*pp, *pplist;
34957c478bd9Sstevel@tonic-gate 	page_t		*pl = NULL;
34967c478bd9Sstevel@tonic-gate 	ulong_t		bin;
34977c478bd9Sstevel@tonic-gate 	int		mnode, page_mnode;
34987c478bd9Sstevel@tonic-gate 	int		szc;
34997c478bd9Sstevel@tonic-gate 	spgcnt_t	npgs, pg_cnt;
35007c478bd9Sstevel@tonic-gate 	pfn_t		pfnum;
35017c478bd9Sstevel@tonic-gate 	int		mtype;
35027c478bd9Sstevel@tonic-gate 	int		flags = 0;
35037c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
35042dae3fb5Sjjc 	lgrp_t		*lgrp;
35057c478bd9Sstevel@tonic-gate 
35067c478bd9Sstevel@tonic-gate 	REPL_STAT_INCR(ngets);
35077c478bd9Sstevel@tonic-gate 	like_pp = orig_like_pp;
35087c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(like_pp));
35097c478bd9Sstevel@tonic-gate 
35107c478bd9Sstevel@tonic-gate 	szc = like_pp->p_szc;
35117c478bd9Sstevel@tonic-gate 	npgs = page_get_pagecnt(szc);
35127c478bd9Sstevel@tonic-gate 	/*
35137c478bd9Sstevel@tonic-gate 	 * Now we reset like_pp to the base page_t.
35147c478bd9Sstevel@tonic-gate 	 * That way, we won't walk past the end of this 'szc' page.
35157c478bd9Sstevel@tonic-gate 	 */
35167c478bd9Sstevel@tonic-gate 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
35177c478bd9Sstevel@tonic-gate 	like_pp = page_numtopp_nolock(pfnum);
35187c478bd9Sstevel@tonic-gate 	ASSERT(like_pp->p_szc == szc);
35197c478bd9Sstevel@tonic-gate 
35207c478bd9Sstevel@tonic-gate 	if (PP_ISNORELOC(like_pp)) {
35217c478bd9Sstevel@tonic-gate 		ASSERT(kcage_on);
35227c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(ngets_noreloc);
35237c478bd9Sstevel@tonic-gate 		flags = PGI_RELOCONLY;
35247c478bd9Sstevel@tonic-gate 	} else if (pgrflags & PGR_NORELOC) {
35257c478bd9Sstevel@tonic-gate 		ASSERT(kcage_on);
35267c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(npgr_noreloc);
35277c478bd9Sstevel@tonic-gate 		flags = PG_NORELOC;
35287c478bd9Sstevel@tonic-gate 	}
35297c478bd9Sstevel@tonic-gate 
35307c478bd9Sstevel@tonic-gate 	/*
35317c478bd9Sstevel@tonic-gate 	 * Kernel pages must always be replaced with the same size
35327c478bd9Sstevel@tonic-gate 	 * pages, since we cannot properly handle demotion of kernel
35337c478bd9Sstevel@tonic-gate 	 * pages.
35347c478bd9Sstevel@tonic-gate 	 */
35357c478bd9Sstevel@tonic-gate 	if (like_pp->p_vnode == &kvp)
35367c478bd9Sstevel@tonic-gate 		pgrflags |= PGR_SAMESZC;
35377c478bd9Sstevel@tonic-gate 
35387c478bd9Sstevel@tonic-gate 	/* LINTED */
353907ad560dSkchow 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
35407c478bd9Sstevel@tonic-gate 
35417c478bd9Sstevel@tonic-gate 	while (npgs) {
35427c478bd9Sstevel@tonic-gate 		pplist = NULL;
35437c478bd9Sstevel@tonic-gate 		for (;;) {
35447c478bd9Sstevel@tonic-gate 			pg_cnt = page_get_pagecnt(szc);
35457c478bd9Sstevel@tonic-gate 			bin = PP_2_BIN(like_pp);
35467c478bd9Sstevel@tonic-gate 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
35477c478bd9Sstevel@tonic-gate 			ASSERT(pg_cnt <= npgs);
35487c478bd9Sstevel@tonic-gate 
35497c478bd9Sstevel@tonic-gate 			/*
35507c478bd9Sstevel@tonic-gate 			 * If an lgroup was specified, try to get the
35517c478bd9Sstevel@tonic-gate 			 * page from that lgroup.
35522dae3fb5Sjjc 			 * NOTE: Must be careful with code below because
35532dae3fb5Sjjc 			 *	 lgroup may disappear and reappear since there
35542dae3fb5Sjjc 			 *	 is no locking for lgroup here.
35557c478bd9Sstevel@tonic-gate 			 */
35562dae3fb5Sjjc 			if (LGRP_EXISTS(lgrp_target)) {
35572dae3fb5Sjjc 				/*
35582dae3fb5Sjjc 				 * Keep local variable for lgroup separate
35592dae3fb5Sjjc 				 * from lgroup argument since this code should
35602dae3fb5Sjjc 				 * only be exercised when lgroup argument
35612dae3fb5Sjjc 				 * exists....
35622dae3fb5Sjjc 				 */
35632dae3fb5Sjjc 				lgrp = lgrp_target;
35642dae3fb5Sjjc 
35657c478bd9Sstevel@tonic-gate 				/* Try the lgroup's freelists first */
35667c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
35677c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
35687c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
35697c478bd9Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
35707c478bd9Sstevel@tonic-gate 				    != -1) {
35717c478bd9Sstevel@tonic-gate 					pplist = page_get_mnode_freelist(
35727c478bd9Sstevel@tonic-gate 						mnode, bin, mtype, szc,
35737c478bd9Sstevel@tonic-gate 						    flags);
35747c478bd9Sstevel@tonic-gate 				}
35757c478bd9Sstevel@tonic-gate 
35767c478bd9Sstevel@tonic-gate 				/*
35777c478bd9Sstevel@tonic-gate 				 * Now try it's cachelists if this is a
35787c478bd9Sstevel@tonic-gate 				 * small page. Don't need to do it for
35797c478bd9Sstevel@tonic-gate 				 * larger ones since page_freelist_coalesce()
35807c478bd9Sstevel@tonic-gate 				 * already failed.
35817c478bd9Sstevel@tonic-gate 				 */
35827c478bd9Sstevel@tonic-gate 				if (pplist != NULL || szc != 0)
35837c478bd9Sstevel@tonic-gate 					break;
35847c478bd9Sstevel@tonic-gate 
35857c478bd9Sstevel@tonic-gate 				/* Now try it's cachelists */
35867c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
35877c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
35887c478bd9Sstevel@tonic-gate 
35897c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
35907c478bd9Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
35917c478bd9Sstevel@tonic-gate 				    != -1) {
35927c478bd9Sstevel@tonic-gate 					pplist = page_get_mnode_cachelist(
35937c478bd9Sstevel@tonic-gate 						bin, flags, mnode, mtype);
35947c478bd9Sstevel@tonic-gate 				}
35957c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
35967c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
35977c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
35987c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
35997c478bd9Sstevel@tonic-gate 					break;
36007c478bd9Sstevel@tonic-gate 				}
36017c478bd9Sstevel@tonic-gate 				/* Done looking in this lgroup. Bail out. */
36027c478bd9Sstevel@tonic-gate 				break;
36037c478bd9Sstevel@tonic-gate 			}
36047c478bd9Sstevel@tonic-gate 
36057c478bd9Sstevel@tonic-gate 			/*
36062dae3fb5Sjjc 			 * No lgroup was specified (or lgroup was removed by
36072dae3fb5Sjjc 			 * DR, so just try to get the page as close to
36082dae3fb5Sjjc 			 * like_pp's mnode as possible.
36097c478bd9Sstevel@tonic-gate 			 * First try the local freelist...
36107c478bd9Sstevel@tonic-gate 			 */
36117c478bd9Sstevel@tonic-gate 			mnode = PP_2_MEM_NODE(like_pp);
36127c478bd9Sstevel@tonic-gate 			pplist = page_get_mnode_freelist(mnode, bin,
36137c478bd9Sstevel@tonic-gate 			    mtype, szc, flags);
36147c478bd9Sstevel@tonic-gate 			if (pplist != NULL)
36157c478bd9Sstevel@tonic-gate 				break;
36167c478bd9Sstevel@tonic-gate 
36177c478bd9Sstevel@tonic-gate 			REPL_STAT_INCR(nnofree);
36187c478bd9Sstevel@tonic-gate 
36197c478bd9Sstevel@tonic-gate 			/*
36207c478bd9Sstevel@tonic-gate 			 * ...then the local cachelist. Don't need to do it for
36217c478bd9Sstevel@tonic-gate 			 * larger pages cause page_freelist_coalesce() already
36227c478bd9Sstevel@tonic-gate 			 * failed there anyway.
36237c478bd9Sstevel@tonic-gate 			 */
36247c478bd9Sstevel@tonic-gate 			if (szc == 0) {
36257c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin, flags,
36267c478bd9Sstevel@tonic-gate 				    mnode, mtype);
36277c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
36287c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
36297c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
36307c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
36317c478bd9Sstevel@tonic-gate 					break;
36327c478bd9Sstevel@tonic-gate 				}
36337c478bd9Sstevel@tonic-gate 			}
36347c478bd9Sstevel@tonic-gate 
36357c478bd9Sstevel@tonic-gate 			/* Now try remote freelists */
36367c478bd9Sstevel@tonic-gate 			page_mnode = mnode;
36377c478bd9Sstevel@tonic-gate 			lgrp =
36387c478bd9Sstevel@tonic-gate 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
36397c478bd9Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
36407c478bd9Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
36417c478bd9Sstevel@tonic-gate 			while (pplist == NULL &&
36427c478bd9Sstevel@tonic-gate 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
36437c478bd9Sstevel@tonic-gate 			    != -1) {
36447c478bd9Sstevel@tonic-gate 				/*
36457c478bd9Sstevel@tonic-gate 				 * Skip local mnode.
36467c478bd9Sstevel@tonic-gate 				 */
36477c478bd9Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
36487c478bd9Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
36497c478bd9Sstevel@tonic-gate 					continue;
36507c478bd9Sstevel@tonic-gate 
36517c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_freelist(mnode,
36527c478bd9Sstevel@tonic-gate 				    bin, mtype, szc, flags);
36537c478bd9Sstevel@tonic-gate 			}
36547c478bd9Sstevel@tonic-gate 
36557c478bd9Sstevel@tonic-gate 			if (pplist != NULL)
36567c478bd9Sstevel@tonic-gate 				break;
36577c478bd9Sstevel@tonic-gate 
36587c478bd9Sstevel@tonic-gate 
36597c478bd9Sstevel@tonic-gate 			/* Now try remote cachelists */
36607c478bd9Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
36617c478bd9Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
36627c478bd9Sstevel@tonic-gate 			while (pplist == NULL && szc == 0) {
36637c478bd9Sstevel@tonic-gate 				mnode = lgrp_memnode_choose(&lgrp_cookie);
36647c478bd9Sstevel@tonic-gate 				if (mnode == -1)
36657c478bd9Sstevel@tonic-gate 					break;
36667c478bd9Sstevel@tonic-gate 				/*
36677c478bd9Sstevel@tonic-gate 				 * Skip local mnode.
36687c478bd9Sstevel@tonic-gate 				 */
36697c478bd9Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
36707c478bd9Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
36717c478bd9Sstevel@tonic-gate 					continue;
36727c478bd9Sstevel@tonic-gate 
36737c478bd9Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin,
36747c478bd9Sstevel@tonic-gate 				    flags, mnode, mtype);
36757c478bd9Sstevel@tonic-gate 
36767c478bd9Sstevel@tonic-gate 				if (pplist != NULL) {
36777c478bd9Sstevel@tonic-gate 					page_hashout(pplist, NULL);
36787c478bd9Sstevel@tonic-gate 					PP_SETAGED(pplist);
36797c478bd9Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
36807c478bd9Sstevel@tonic-gate 					break;
36817c478bd9Sstevel@tonic-gate 				}
36827c478bd9Sstevel@tonic-gate 			}
36837c478bd9Sstevel@tonic-gate 
36847c478bd9Sstevel@tonic-gate 			/*
36857c478bd9Sstevel@tonic-gate 			 * Break out of while loop under the following cases:
36867c478bd9Sstevel@tonic-gate 			 * - If we successfully got a page.
36877c478bd9Sstevel@tonic-gate 			 * - If pgrflags specified only returning a specific
36887c478bd9Sstevel@tonic-gate 			 *   page size and we could not find that page size.
36897c478bd9Sstevel@tonic-gate 			 * - If we could not satisfy the request with PAGESIZE
36907c478bd9Sstevel@tonic-gate 			 *   or larger pages.
36917c478bd9Sstevel@tonic-gate 			 */
36927c478bd9Sstevel@tonic-gate 			if (pplist != NULL || szc == 0)
36937c478bd9Sstevel@tonic-gate 				break;
36947c478bd9Sstevel@tonic-gate 
36957c478bd9Sstevel@tonic-gate 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
36967c478bd9Sstevel@tonic-gate 				/* try to find contig page */
36977c478bd9Sstevel@tonic-gate 
36987c478bd9Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
36997c478bd9Sstevel@tonic-gate 				    LGRP_SRCH_HIER);
37007c478bd9Sstevel@tonic-gate 
37017c478bd9Sstevel@tonic-gate 				while ((pplist == NULL) &&
37027c478bd9Sstevel@tonic-gate 				    (mnode =
37037c478bd9Sstevel@tonic-gate 					lgrp_memnode_choose(&lgrp_cookie))
37047c478bd9Sstevel@tonic-gate 				    != -1) {
37057c478bd9Sstevel@tonic-gate 					pplist = page_get_contig_pages(
37067c478bd9Sstevel@tonic-gate 						mnode, bin, mtype, szc,
37077c478bd9Sstevel@tonic-gate 						    flags | PGI_PGCPHIPRI);
37087c478bd9Sstevel@tonic-gate 				}
37097c478bd9Sstevel@tonic-gate 				break;
37107c478bd9Sstevel@tonic-gate 			}
37117c478bd9Sstevel@tonic-gate 
37127c478bd9Sstevel@tonic-gate 			/*
37137c478bd9Sstevel@tonic-gate 			 * The correct thing to do here is try the next
37147c478bd9Sstevel@tonic-gate 			 * page size down using szc--. Due to a bug
37157c478bd9Sstevel@tonic-gate 			 * with the processing of HAT_RELOAD_SHARE
37167c478bd9Sstevel@tonic-gate 			 * where the sfmmu_ttecnt arrays of all
37177c478bd9Sstevel@tonic-gate 			 * hats sharing an ISM segment don't get updated,
37187c478bd9Sstevel@tonic-gate 			 * using intermediate size pages for relocation
37197c478bd9Sstevel@tonic-gate 			 * can lead to continuous page faults.
37207c478bd9Sstevel@tonic-gate 			 */
37217c478bd9Sstevel@tonic-gate 			szc = 0;
37227c478bd9Sstevel@tonic-gate 		}
37237c478bd9Sstevel@tonic-gate 
37247c478bd9Sstevel@tonic-gate 		if (pplist != NULL) {
37257c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
37267c478bd9Sstevel@tonic-gate 			    lgrp_t *, lgrp,
37277c478bd9Sstevel@tonic-gate 			    int, mnode,
37287c478bd9Sstevel@tonic-gate 			    ulong_t, bin,
37297c478bd9Sstevel@tonic-gate 			    uint_t, flags);
37307c478bd9Sstevel@tonic-gate 
37317c478bd9Sstevel@tonic-gate 			while (pplist != NULL && pg_cnt--) {
37327c478bd9Sstevel@tonic-gate 				ASSERT(pplist != NULL);
37337c478bd9Sstevel@tonic-gate 				pp = pplist;
37347c478bd9Sstevel@tonic-gate 				page_sub(&pplist, pp);
37357c478bd9Sstevel@tonic-gate 				PP_CLRFREE(pp);
37367c478bd9Sstevel@tonic-gate 				PP_CLRAGED(pp);
37377c478bd9Sstevel@tonic-gate 				page_list_concat(&pl, &pp);
37387c478bd9Sstevel@tonic-gate 				npgs--;
37397c478bd9Sstevel@tonic-gate 				like_pp = like_pp + 1;
37407c478bd9Sstevel@tonic-gate 				REPL_STAT_INCR(nnext_pp);
37417c478bd9Sstevel@tonic-gate 			}
37427c478bd9Sstevel@tonic-gate 			ASSERT(pg_cnt == 0);
37437c478bd9Sstevel@tonic-gate 		} else {
37447c478bd9Sstevel@tonic-gate 			break;
37457c478bd9Sstevel@tonic-gate 		}
37467c478bd9Sstevel@tonic-gate 	}
37477c478bd9Sstevel@tonic-gate 
37487c478bd9Sstevel@tonic-gate 	if (npgs) {
37497c478bd9Sstevel@tonic-gate 		/*
37507c478bd9Sstevel@tonic-gate 		 * We were unable to allocate the necessary number
37517c478bd9Sstevel@tonic-gate 		 * of pages.
37527c478bd9Sstevel@tonic-gate 		 * We need to free up any pl.
37537c478bd9Sstevel@tonic-gate 		 */
37547c478bd9Sstevel@tonic-gate 		REPL_STAT_INCR(nnopage);
37557c478bd9Sstevel@tonic-gate 		page_free_replacement_page(pl);
37567c478bd9Sstevel@tonic-gate 		return (NULL);
37577c478bd9Sstevel@tonic-gate 	} else {
37587c478bd9Sstevel@tonic-gate 		return (pl);
37597c478bd9Sstevel@tonic-gate 	}
37607c478bd9Sstevel@tonic-gate }
37617c478bd9Sstevel@tonic-gate 
37627c478bd9Sstevel@tonic-gate /*
37637c478bd9Sstevel@tonic-gate  * demote a free large page to it's constituent pages
37647c478bd9Sstevel@tonic-gate  */
37657c478bd9Sstevel@tonic-gate void
37667c478bd9Sstevel@tonic-gate page_demote_free_pages(page_t *pp)
37677c478bd9Sstevel@tonic-gate {
37687c478bd9Sstevel@tonic-gate 
37697c478bd9Sstevel@tonic-gate 	int mnode;
37707c478bd9Sstevel@tonic-gate 
37717c478bd9Sstevel@tonic-gate 	ASSERT(pp != NULL);
37727c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
37737c478bd9Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
37747c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
37757c478bd9Sstevel@tonic-gate 
37767c478bd9Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
37777c478bd9Sstevel@tonic-gate 	page_freelist_lock(mnode);
37787c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0) {
37797c478bd9Sstevel@tonic-gate 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
37807c478bd9Sstevel@tonic-gate 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
37817c478bd9Sstevel@tonic-gate 	}
37827c478bd9Sstevel@tonic-gate 	page_freelist_unlock(mnode);
37837c478bd9Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
37847c478bd9Sstevel@tonic-gate }
3785