xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 584b574a)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50b5aa17bSmec  * Common Development and Distribution License (the "License").
60b5aa17bSmec  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22cb15d5d9SPeter Rival  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate  */
25d0bf2cb9SBryan Cantrill /*
26d0bf2cb9SBryan Cantrill  * Copyright 2012 Joyent, Inc.  All rights reserved.
27d0bf2cb9SBryan Cantrill  */
28d0bf2cb9SBryan Cantrill 
297c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
307c478bd9Sstevel@tonic-gate /*	All Rights Reserved   */
327c478bd9Sstevel@tonic-gate /*
337c478bd9Sstevel@tonic-gate  * Portions of this source code were derived from Berkeley 4.3 BSD
347c478bd9Sstevel@tonic-gate  * under license from the Regents of the University of California.
357c478bd9Sstevel@tonic-gate  */
387c478bd9Sstevel@tonic-gate /*
397c478bd9Sstevel@tonic-gate  * This file contains common functions to access and manage the page lists.
407c478bd9Sstevel@tonic-gate  * Many of these routines originated from platform dependent modules
417c478bd9Sstevel@tonic-gate  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
427c478bd9Sstevel@tonic-gate  * a platform independent manner.
437c478bd9Sstevel@tonic-gate  *
447c478bd9Sstevel@tonic-gate  * vm/vm_dep.h provides for platform specific support.
457c478bd9Sstevel@tonic-gate  */
477c478bd9Sstevel@tonic-gate #include <sys/types.h>
487c478bd9Sstevel@tonic-gate #include <sys/debug.h>
497c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
507c478bd9Sstevel@tonic-gate #include <sys/systm.h>
517c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
527c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
537c478bd9Sstevel@tonic-gate #include <vm/as.h>
547c478bd9Sstevel@tonic-gate #include <vm/page.h>
557c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
567c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
5778b03d3aSkchow #include <sys/vmsystm.h>
587c478bd9Sstevel@tonic-gate #include <sys/memnode.h>
597c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h>
607c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
617c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
627c478bd9Sstevel@tonic-gate #include <sys/callb.h>
637c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
647c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
65ca3e8d88SDave Plauger #include <sys/dumphdr.h>
66cb15d5d9SPeter Rival #include <sys/swap.h>
687c478bd9Sstevel@tonic-gate extern uint_t	vac_colors;
706061ce8aSkchow #define	MAX_PRAGMA_ALIGN	128
726061ce8aSkchow /* vm_cpu_data0 for the boot cpu before kmem is initialized */
746061ce8aSkchow #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75affbd3ccSkchow #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
766061ce8aSkchow #else
776061ce8aSkchow #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
786061ce8aSkchow #endif
79affbd3ccSkchow char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
817c478bd9Sstevel@tonic-gate /*
827c478bd9Sstevel@tonic-gate  * number of page colors equivalent to requested color in page_get routines.
837c478bd9Sstevel@tonic-gate  * If set, keeps large pages intact longer and keeps MPO allocation
847c478bd9Sstevel@tonic-gate  * from the local mnode in favor of acquiring the 'correct' page color from
857c478bd9Sstevel@tonic-gate  * a demoted large page or from a remote mnode.
867c478bd9Sstevel@tonic-gate  */
875d07b933Sdp uint_t	colorequiv;
895d07b933Sdp /*
905d07b933Sdp  * color equivalency mask for each page size.
915d07b933Sdp  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
925d07b933Sdp  * High 4 bits determine the number of high order bits of the color to ignore.
935d07b933Sdp  * Low 4 bits determines number of low order bits of color to ignore (it's only
945d07b933Sdp  * relevant for hashed index based page coloring).
955d07b933Sdp  */
965d07b933Sdp uchar_t colorequivszc[MMU_PAGE_SIZES];
987c478bd9Sstevel@tonic-gate /*
997c478bd9Sstevel@tonic-gate  * if set, specifies the percentage of large pages that are free from within
1007c478bd9Sstevel@tonic-gate  * a large page region before attempting to lock those pages for
1017c478bd9Sstevel@tonic-gate  * page_get_contig_pages processing.
1027c478bd9Sstevel@tonic-gate  *
1037c478bd9Sstevel@tonic-gate  * Should be turned on when kpr is available when page_trylock_contig_pages
1047c478bd9Sstevel@tonic-gate  * can be more selective.
1057c478bd9Sstevel@tonic-gate  */
1077c478bd9Sstevel@tonic-gate int	ptcpthreshold;
1097c478bd9Sstevel@tonic-gate /*
1107c478bd9Sstevel@tonic-gate  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
11183f9b804Skchow  * Enabled by default via pgcplimitsearch.
11283f9b804Skchow  *
11383f9b804Skchow  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
11483f9b804Skchow  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
11583f9b804Skchow  * bound. This upper bound range guarantees:
11683f9b804Skchow  *    - all large page 'slots' will be searched over time
11783f9b804Skchow  *    - the minimum (1) large page candidates considered on each pgcp call
11883f9b804Skchow  *    - count doesn't wrap around to 0
1197c478bd9Sstevel@tonic-gate  */
12083f9b804Skchow pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
1217c478bd9Sstevel@tonic-gate int	pgcplimitsearch = 1;
12383f9b804Skchow #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
12483f9b804Skchow #define	SETPGCPFAILCNT(szc)						\
12583f9b804Skchow 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
12683f9b804Skchow 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
1287c478bd9Sstevel@tonic-gate #ifdef VM_STATS
1297c478bd9Sstevel@tonic-gate struct vmm_vmstats_str  vmm_vmstats;
1317c478bd9Sstevel@tonic-gate #endif /* VM_STATS */
1337c478bd9Sstevel@tonic-gate #if defined(__sparc)
1347c478bd9Sstevel@tonic-gate #define	LPGCREATE	0
1357c478bd9Sstevel@tonic-gate #else
1367c478bd9Sstevel@tonic-gate /* enable page_get_contig_pages */
1377c478bd9Sstevel@tonic-gate #define	LPGCREATE	1
1387c478bd9Sstevel@tonic-gate #endif
1407c478bd9Sstevel@tonic-gate int pg_contig_disable;
1417c478bd9Sstevel@tonic-gate int pg_lpgcreate_nocage = LPGCREATE;
1437c478bd9Sstevel@tonic-gate /*
14419397407SSherry Moore  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
1457c478bd9Sstevel@tonic-gate  */
1467c478bd9Sstevel@tonic-gate #define	PFNNULL		0
1487c478bd9Sstevel@tonic-gate /* Flags involved in promotion and demotion routines */
1497c478bd9Sstevel@tonic-gate #define	PC_FREE		0x1	/* put page on freelist */
1507c478bd9Sstevel@tonic-gate #define	PC_ALLOC	0x2	/* return page for allocation */
1527c478bd9Sstevel@tonic-gate /*
1537c478bd9Sstevel@tonic-gate  * Flag for page_demote to be used with PC_FREE to denote that we don't care
1547c478bd9Sstevel@tonic-gate  * what the color is as the color parameter to the function is ignored.
1557c478bd9Sstevel@tonic-gate  */
1567c478bd9Sstevel@tonic-gate #define	PC_NO_COLOR	(-1)
1585d07b933Sdp /* mtype value for page_promote to use when mtype does not matter */
1595d07b933Sdp #define	PC_MTYPE_ANY	(-1)
1617c478bd9Sstevel@tonic-gate /*
1627c478bd9Sstevel@tonic-gate  * page counters candidates info
1637c478bd9Sstevel@tonic-gate  * See page_ctrs_cands comment below for more details.
1647c478bd9Sstevel@tonic-gate  * fields are as follows:
1657c478bd9Sstevel@tonic-gate  *	pcc_pages_free:		# pages which freelist coalesce can create
1667c478bd9Sstevel@tonic-gate  *	pcc_color_free:		pointer to page free counts per color
1677c478bd9Sstevel@tonic-gate  */
1687c478bd9Sstevel@tonic-gate typedef struct pcc_info {
1697c478bd9Sstevel@tonic-gate 	pgcnt_t	pcc_pages_free;
1707c478bd9Sstevel@tonic-gate 	pgcnt_t	*pcc_color_free;
17106fb6a36Sdv 	uint_t	pad[12];
1727c478bd9Sstevel@tonic-gate } pcc_info_t;
1747c478bd9Sstevel@tonic-gate /*
1757c478bd9Sstevel@tonic-gate  * On big machines it can take a long time to check page_counters
1767c478bd9Sstevel@tonic-gate  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
1777c478bd9Sstevel@tonic-gate  * updated sum of all elements of the corresponding page_counters arrays.
1787c478bd9Sstevel@tonic-gate  * page_freelist_coalesce() searches page_counters only if an appropriate
1797c478bd9Sstevel@tonic-gate  * element of page_ctrs_cands array is greater than 0.
1807c478bd9Sstevel@tonic-gate  *
1815d07b933Sdp  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
1827c478bd9Sstevel@tonic-gate  */
1835d07b933Sdp pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
1857c478bd9Sstevel@tonic-gate /*
1867c478bd9Sstevel@tonic-gate  * Return in val the total number of free pages which can be created
1875d07b933Sdp  * for the given mnode (m), mrange (g), and region size (r)
1887c478bd9Sstevel@tonic-gate  */
1895d07b933Sdp #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
1907c478bd9Sstevel@tonic-gate 	int i;								\
1917c478bd9Sstevel@tonic-gate 	val = 0;							\
1927c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {				\
1935d07b933Sdp 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
1947c478bd9Sstevel@tonic-gate 	}								\
1957c478bd9Sstevel@tonic-gate }
1977c478bd9Sstevel@tonic-gate /*
1987c478bd9Sstevel@tonic-gate  * Return in val the total number of free pages which can be created
1995d07b933Sdp  * for the given mnode (m), mrange (g), region size (r), and color (c)
2007c478bd9Sstevel@tonic-gate  */
2015d07b933Sdp #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
2027c478bd9Sstevel@tonic-gate 	int i;								\
2037c478bd9Sstevel@tonic-gate 	val = 0;							\
2045d07b933Sdp 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
2057c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {				\
2065d07b933Sdp 	    val +=							\
2075d07b933Sdp 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
2087c478bd9Sstevel@tonic-gate 	}								\
2097c478bd9Sstevel@tonic-gate }
2117c478bd9Sstevel@tonic-gate /*
2127c478bd9Sstevel@tonic-gate  * We can only allow a single thread to update a counter within the physical
2137c478bd9Sstevel@tonic-gate  * range of the largest supported page size. That is the finest granularity
2147c478bd9Sstevel@tonic-gate  * possible since the counter values are dependent on each other
2157c478bd9Sstevel@tonic-gate  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
2167c478bd9Sstevel@tonic-gate  * ctr_mutex lock index for a particular physical range.
2177c478bd9Sstevel@tonic-gate  */
2187c478bd9Sstevel@tonic-gate static kmutex_t	*ctr_mutex[NPC_MUTEX];
2207c478bd9Sstevel@tonic-gate #define	PP_CTR_LOCK_INDX(pp)						\
2215d07b933Sdp 	(((pp)->p_pagenum >>						\
2227c478bd9Sstevel@tonic-gate 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
2245d07b933Sdp #define	INVALID_COLOR 0xffffffff
2255d07b933Sdp #define	INVALID_MASK  0xffffffff
2277c478bd9Sstevel@tonic-gate /*
2287c478bd9Sstevel@tonic-gate  * Local functions prototypes.
2297c478bd9Sstevel@tonic-gate  */
231affbd3ccSkchow void page_ctr_add(int, int, page_t *, int);
232affbd3ccSkchow void page_ctr_add_internal(int, int, page_t *, int);
233affbd3ccSkchow void page_ctr_sub(int, int, page_t *, int);
2345d07b933Sdp void page_ctr_sub_internal(int, int, page_t *, int);
2357c478bd9Sstevel@tonic-gate void page_freelist_lock(int);
2367c478bd9Sstevel@tonic-gate void page_freelist_unlock(int);
2375d07b933Sdp page_t *page_promote(int, pfn_t, uchar_t, int, int);
23819397407SSherry Moore page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
2395d07b933Sdp page_t *page_freelist_split(uchar_t,
24019397407SSherry Moore     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
2417c478bd9Sstevel@tonic-gate page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
2427c478bd9Sstevel@tonic-gate static int page_trylock_cons(page_t *pp, se_t se);
2447c478bd9Sstevel@tonic-gate /*
2457c478bd9Sstevel@tonic-gate  * The page_counters array below is used to keep track of free contiguous
2467c478bd9Sstevel@tonic-gate  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
2477c478bd9Sstevel@tonic-gate  * This contains an array of counters, the size of the array, a shift value
2487c478bd9Sstevel@tonic-gate  * used to convert a pagenum into a counter array index or vice versa, as
2497c478bd9Sstevel@tonic-gate  * well as a cache of the last successful index to be promoted to a larger
2507c478bd9Sstevel@tonic-gate  * page size.  As an optimization, we keep track of the last successful index
2517c478bd9Sstevel@tonic-gate  * to be promoted per page color for the given size region, and this is
2527c478bd9Sstevel@tonic-gate  * allocated dynamically based upon the number of colors for a given
2537c478bd9Sstevel@tonic-gate  * region size.
2547c478bd9Sstevel@tonic-gate  *
2557c478bd9Sstevel@tonic-gate  * Conceptually, the page counters are represented as:
2567c478bd9Sstevel@tonic-gate  *
2577c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode]
2587c478bd9Sstevel@tonic-gate  *
2597c478bd9Sstevel@tonic-gate  *	region_size:	size code of a candidate larger page made up
2607c478bd9Sstevel@tonic-gate  *			of contiguous free smaller pages.
2617c478bd9Sstevel@tonic-gate  *
2627c478bd9Sstevel@tonic-gate  *	page_counters[region_size][mnode].hpm_counters[index]:
2637c478bd9Sstevel@tonic-gate  *		represents how many (region_size - 1) pages either
2647c478bd9Sstevel@tonic-gate  *		exist or can be created within the given index range.
2657c478bd9Sstevel@tonic-gate  *
2667c478bd9Sstevel@tonic-gate  * Let's look at a sparc example:
2677c478bd9Sstevel@tonic-gate  *	If we want to create a free 512k page, we look at region_size 2
2687c478bd9Sstevel@tonic-gate  *	for the mnode we want.  We calculate the index and look at a specific
2697c478bd9Sstevel@tonic-gate  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
2707c478bd9Sstevel@tonic-gate  *	this location, it means that 8 64k pages either exist or can be created
2717c478bd9Sstevel@tonic-gate  *	from 8K pages in order to make a single free 512k page at the given
2727c478bd9Sstevel@tonic-gate  *	index.  Note that when a region is full, it will contribute to the
2737c478bd9Sstevel@tonic-gate  *	counts in the region above it.  Thus we will not know what page
2747c478bd9Sstevel@tonic-gate  *	size the free pages will be which can be promoted to this new free
2757c478bd9Sstevel@tonic-gate  *	page unless we look at all regions below the current region.
2767c478bd9Sstevel@tonic-gate  */
2787c478bd9Sstevel@tonic-gate /*
2797c478bd9Sstevel@tonic-gate  * Note: hpmctr_t is defined in platform vm_dep.h
2807c478bd9Sstevel@tonic-gate  * hw_page_map_t contains all the information needed for the page_counters
2817c478bd9Sstevel@tonic-gate  * logic. The fields are as follows:
2827c478bd9Sstevel@tonic-gate  *
2837c478bd9Sstevel@tonic-gate  *	hpm_counters:	dynamically allocated array to hold counter data
2847c478bd9Sstevel@tonic-gate  *	hpm_entries:	entries in hpm_counters
2857c478bd9Sstevel@tonic-gate  *	hpm_shift:	shift for pnum/array index conv
2867c478bd9Sstevel@tonic-gate  *	hpm_base:	PFN mapped to counter index 0
2877c478bd9Sstevel@tonic-gate  *	hpm_color_current:	last index in counter array for this color at
2887c478bd9Sstevel@tonic-gate  *				which we successfully created a large page
2897c478bd9Sstevel@tonic-gate  */
2907c478bd9Sstevel@tonic-gate typedef struct hw_page_map {
2917c478bd9Sstevel@tonic-gate 	hpmctr_t	*hpm_counters;
2927c478bd9Sstevel@tonic-gate 	size_t		hpm_entries;
2937c478bd9Sstevel@tonic-gate 	int		hpm_shift;
2947c478bd9Sstevel@tonic-gate 	pfn_t		hpm_base;
2955d07b933Sdp 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
29606fb6a36Sdv #if defined(__sparc)
29706fb6a36Sdv 	uint_t		pad[4];
29806fb6a36Sdv #endif
2997c478bd9Sstevel@tonic-gate } hw_page_map_t;
3017c478bd9Sstevel@tonic-gate /*
3027c478bd9Sstevel@tonic-gate  * Element zero is not used, but is allocated for convenience.
3037c478bd9Sstevel@tonic-gate  */
3047c478bd9Sstevel@tonic-gate static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
3065d07b933Sdp /*
3075d07b933Sdp  * Cached value of MNODE_RANGE_CNT(mnode).
3085d07b933Sdp  * This is a function call in x86.
3095d07b933Sdp  */
3105d07b933Sdp static int mnode_nranges[MAX_MEM_NODES];
3115d07b933Sdp static int mnode_maxmrange[MAX_MEM_NODES];
3137c478bd9Sstevel@tonic-gate /*
3147c478bd9Sstevel@tonic-gate  * The following macros are convenient ways to get access to the individual
3157c478bd9Sstevel@tonic-gate  * elements of the page_counters arrays.  They can be used on both
3167c478bd9Sstevel@tonic-gate  * the left side and right side of equations.
3177c478bd9Sstevel@tonic-gate  */
3187c478bd9Sstevel@tonic-gate #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
3197c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
321*584b574aSToomas Soome #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
3227c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
324*584b574aSToomas Soome #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
3257c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
327*584b574aSToomas Soome #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
3287c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
330*584b574aSToomas Soome #define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
3317c478bd9Sstevel@tonic-gate 	(page_counters[(rg_szc)][(mnode)].hpm_base)
3335d07b933Sdp #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
3345d07b933Sdp 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
3365d07b933Sdp #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
3375d07b933Sdp 	(page_counters[(rg_szc)][(mnode)].				\
3385d07b933Sdp 	hpm_color_current[(mrange)][(color)])
3407c478bd9Sstevel@tonic-gate #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
3417c478bd9Sstevel@tonic-gate 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
3427c478bd9Sstevel@tonic-gate 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
344*584b574aSToomas Soome #define	IDX_TO_PNUM(mnode, rg_szc, index)			\
3457c478bd9Sstevel@tonic-gate 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
3467c478bd9Sstevel@tonic-gate 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
3487c478bd9Sstevel@tonic-gate /*
3497c478bd9Sstevel@tonic-gate  * Protects the hpm_counters and hpm_color_current memory from changing while
3507c478bd9Sstevel@tonic-gate  * looking at page counters information.
3517c478bd9Sstevel@tonic-gate  * Grab the write lock to modify what these fields point at.
3527c478bd9Sstevel@tonic-gate  * Grab the read lock to prevent any pointers from changing.
3537c478bd9Sstevel@tonic-gate  * The write lock can not be held during memory allocation due to a possible
3547c478bd9Sstevel@tonic-gate  * recursion deadlock with trying to grab the read lock while the
3557c478bd9Sstevel@tonic-gate  * write lock is already held.
3567c478bd9Sstevel@tonic-gate  */
3577c478bd9Sstevel@tonic-gate krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		/*
		 * Boot CPU: kmem is not initialized yet, so use the
		 * statically allocated, pragma-aligned vm_cpu_data0 buffer.
		 */
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		/*
		 * kmem_zalloc() does not guarantee L2-cache alignment, so
		 * over-allocate by 'align' bytes and round the pointer up
		 * by hand.  The raw allocation pointer and size are stashed
		 * in the structure itself so cpu_vm_data_destroy() can
		 * free the original buffer later.
		 */
		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}
382affbd3ccSkchow /*
383affbd3ccSkchow  * free cpu_vm_data
384affbd3ccSkchow  */
385affbd3ccSkchow void
cpu_vm_data_destroy(struct cpu * cp)386affbd3ccSkchow cpu_vm_data_destroy(struct cpu *cp)
387affbd3ccSkchow {
388affbd3ccSkchow 	if (cp->cpu_seqid && cp->cpu_vm_data) {
389affbd3ccSkchow 		ASSERT(cp != CPU0);
390affbd3ccSkchow 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
3916061ce8aSkchow 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
392affbd3ccSkchow 	}
393affbd3ccSkchow 	cp->cpu_vm_data = NULL;
394affbd3ccSkchow }
3977c478bd9Sstevel@tonic-gate /*
3987c478bd9Sstevel@tonic-gate  * page size to page size code
3997c478bd9Sstevel@tonic-gate  */
4007c478bd9Sstevel@tonic-gate int
page_szc(size_t pagesize)4017c478bd9Sstevel@tonic-gate page_szc(size_t pagesize)
4027c478bd9Sstevel@tonic-gate {
4037c478bd9Sstevel@tonic-gate 	int	i = 0;
4057c478bd9Sstevel@tonic-gate 	while (hw_page_array[i].hp_size) {
4067c478bd9Sstevel@tonic-gate 		if (pagesize == hw_page_array[i].hp_size)
4077c478bd9Sstevel@tonic-gate 			return (i);
4087c478bd9Sstevel@tonic-gate 		i++;
4097c478bd9Sstevel@tonic-gate 	}
4107c478bd9Sstevel@tonic-gate 	return (-1);
4117c478bd9Sstevel@tonic-gate }
4137c478bd9Sstevel@tonic-gate /*
4144abce959Smec  * page size to page size code with the restriction that it be a supported
4154abce959Smec  * user page size.  If it's not a supported user page size, -1 will be returned.
4167c478bd9Sstevel@tonic-gate  */
4177c478bd9Sstevel@tonic-gate int
page_szc_user_filtered(size_t pagesize)4184abce959Smec page_szc_user_filtered(size_t pagesize)
4197c478bd9Sstevel@tonic-gate {
4207c478bd9Sstevel@tonic-gate 	int szc = page_szc(pagesize);
4214abce959Smec 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
4224abce959Smec 		return (szc);
4234abce959Smec 	}
4247c478bd9Sstevel@tonic-gate 	return (-1);
4257c478bd9Sstevel@tonic-gate }
4277c478bd9Sstevel@tonic-gate /*
4287c478bd9Sstevel@tonic-gate  * Return how many page sizes are available for the user to use.  This is
4297c478bd9Sstevel@tonic-gate  * what the hardware supports and not based upon how the OS implements the
4307c478bd9Sstevel@tonic-gate  * support of different page sizes.
43102bc52beSkchow  *
43202bc52beSkchow  * If legacy is non-zero, return the number of pagesizes available to legacy
43302bc52beSkchow  * applications. The number of legacy page sizes might be less than the
43402bc52beSkchow  * exported user page sizes. This is to prevent legacy applications that
43502bc52beSkchow  * use the largest page size returned from getpagesizes(3c) from inadvertently
43602bc52beSkchow  * using the 'new' large pagesizes.
4377c478bd9Sstevel@tonic-gate  */
4387c478bd9Sstevel@tonic-gate uint_t
page_num_user_pagesizes(int legacy)43902bc52beSkchow page_num_user_pagesizes(int legacy)
4407c478bd9Sstevel@tonic-gate {
44102bc52beSkchow 	if (legacy)
44202bc52beSkchow 		return (mmu_legacy_page_sizes);
4437c478bd9Sstevel@tonic-gate 	return (mmu_exported_page_sizes);
4447c478bd9Sstevel@tonic-gate }
/*
 * Return the total number of page sizes the MMU supports.
 */
uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}
4527c478bd9Sstevel@tonic-gate /*
4537c478bd9Sstevel@tonic-gate  * returns the count of the number of base pagesize pages associated with szc
4547c478bd9Sstevel@tonic-gate  */
4557c478bd9Sstevel@tonic-gate pgcnt_t
page_get_pagecnt(uint_t szc)4567c478bd9Sstevel@tonic-gate page_get_pagecnt(uint_t szc)
4577c478bd9Sstevel@tonic-gate {
4587c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4597c478bd9Sstevel@tonic-gate 		panic("page_get_pagecnt: out of range %d", szc);
4607c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_pgcnt);
4617c478bd9Sstevel@tonic-gate }
4637c478bd9Sstevel@tonic-gate size_t
page_get_pagesize(uint_t szc)4647c478bd9Sstevel@tonic-gate page_get_pagesize(uint_t szc)
4657c478bd9Sstevel@tonic-gate {
4667c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4677c478bd9Sstevel@tonic-gate 		panic("page_get_pagesize: out of range %d", szc);
4687c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4697c478bd9Sstevel@tonic-gate }
4717c478bd9Sstevel@tonic-gate /*
4727c478bd9Sstevel@tonic-gate  * Return the size of a page based upon the index passed in.  An index of
4737c478bd9Sstevel@tonic-gate  * zero refers to the smallest page size in the system, and as index increases
4747c478bd9Sstevel@tonic-gate  * it refers to the next larger supported page size in the system.
4757c478bd9Sstevel@tonic-gate  * Note that szc and userszc may not be the same due to unsupported szc's on
4767c478bd9Sstevel@tonic-gate  * some systems.
4777c478bd9Sstevel@tonic-gate  */
4787c478bd9Sstevel@tonic-gate size_t
page_get_user_pagesize(uint_t userszc)4797c478bd9Sstevel@tonic-gate page_get_user_pagesize(uint_t userszc)
4807c478bd9Sstevel@tonic-gate {
4817c478bd9Sstevel@tonic-gate 	uint_t szc = USERSZC_2_SZC(userszc);
4837c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4847c478bd9Sstevel@tonic-gate 		panic("page_get_user_pagesize: out of range %d", szc);
4857c478bd9Sstevel@tonic-gate 	return (hw_page_array[szc].hp_size);
4867c478bd9Sstevel@tonic-gate }
4887c478bd9Sstevel@tonic-gate uint_t
page_get_shift(uint_t szc)4897c478bd9Sstevel@tonic-gate page_get_shift(uint_t szc)
4907c478bd9Sstevel@tonic-gate {
4917c478bd9Sstevel@tonic-gate 	if (szc >= mmu_page_sizes)
4927c478bd9Sstevel@tonic-gate 		panic("page_get_shift: out of range %d", szc);
4935d07b933Sdp 	return (PAGE_GET_SHIFT(szc));
4947c478bd9Sstevel@tonic-gate }
4967c478bd9Sstevel@tonic-gate uint_t
page_get_pagecolors(uint_t szc)4977c478bd9Sstevel@tonic-gate page_get_pagecolors(uint_t szc)
4987c478bd9Sstevel@tonic-gate {
4995d07b933Sdp 	if (szc >= mmu_page_sizes)
5005d07b933Sdp 		panic("page_get_pagecolors: out of range %d", szc);
5015d07b933Sdp 	return (PAGE_GET_PAGECOLORS(szc));
5025d07b933Sdp }
/*
 * this assigns the desired equivalent color after a split
 *
 * Given a page of size code nszc with color ncolor being split down to
 * size code szc, pick the szc-level color: the color bits covered by
 * ceq_mask come from the requested color, while the bits outside the
 * equivalency mask are inherited from the parent page's color after it
 * has been converted to the szc color space.
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	/* keep only the equivalency-significant bits of the requested color */
	color &= ceq_mask;
	/* map the parent's color into the smaller size's color space */
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	/* requested bits inside the mask, inherited bits outside it */
	return (color | (ncolor & ~ceq_mask));
}
521ce8eb11aSdp /*
522ce8eb11aSdp  * The interleaved_mnodes flag is set when mnodes overlap in
523ce8eb11aSdp  * the physbase..physmax range, but have disjoint slices.
524ce8eb11aSdp  * In this case hpm_counters is shared by all mnodes.
525ce8eb11aSdp  * This flag is set dynamically by the platform.
526ce8eb11aSdp  */
527ce8eb11aSdp int interleaved_mnodes = 0;
/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 *
 * Returns the number of bytes page_ctrs_alloc() will need, including
 * per-mnode L2-cache alignment slop.  Also caches MNODE_RANGE_CNT()
 * and MNODE_MAX_MRANGE() in mnode_nranges[]/mnode_maxmrange[] as a
 * side effect.
 *
 * NOTE(review): ctrs_sz is declared uint_t while the function returns
 * size_t — presumably safe for supported configurations, but verify it
 * cannot overflow on very large memory systems.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			/*
			 * hpm_counters are shared when mnodes interleave;
			 * only the first existing mnode contributes space.
			 */
			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	/* one hw_page_map_t per mnode per region size (size code >= 1) */
	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
6387c478bd9Sstevel@tonic-gate caddr_t
page_ctrs_alloc(caddr_t alloc_base)6397c478bd9Sstevel@tonic-gate page_ctrs_alloc(caddr_t alloc_base)
6407c478bd9Sstevel@tonic-gate {
6417c478bd9Sstevel@tonic-gate 	int	mnode;
6425d07b933Sdp 	int	mrange, nranges;
6437c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
6447c478bd9Sstevel@tonic-gate 	int	i;
645ce8eb11aSdp 	int	firstmn;	/* first mnode that exists */
646ce8eb11aSdp 	pfn_t	physbase;
647ce8eb11aSdp 	pfn_t	physmax;
6487c478bd9Sstevel@tonic-gate 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
6507c478bd9Sstevel@tonic-gate 	/*
6517c478bd9Sstevel@tonic-gate 	 * We need to determine how many page colors there are for each
6527c478bd9Sstevel@tonic-gate 	 * page size in order to allocate memory for any color specific
6537c478bd9Sstevel@tonic-gate 	 * arrays.
6547c478bd9Sstevel@tonic-gate 	 */
6555d07b933Sdp 	for (i = 0; i < mmu_page_sizes; i++) {
6565d07b933Sdp 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
6577c478bd9Sstevel@tonic-gate 	}
6597c478bd9Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
6607c478bd9Sstevel@tonic-gate 		page_counters[r] = (hw_page_map_t *)alloc_base;
6617c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
6627c478bd9Sstevel@tonic-gate 	}
6645d07b933Sdp 	/* page_ctrs_cands and pcc_color_free array */
6655d07b933Sdp 	for (i = 0; i < NPC_MUTEX; i++) {
6665d07b933Sdp 		for (r = 1; r < mmu_page_sizes; r++) {
6685d07b933Sdp 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
6695d07b933Sdp 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
6717c478bd9Sstevel@tonic-gate 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
6725d07b933Sdp 				pcc_info_t *pi;
6745d07b933Sdp 				if (mem_node_config[mnode].exists == 0)
6755d07b933Sdp 					continue;
6775d07b933Sdp 				nranges = mnode_nranges[mnode];
6795d07b933Sdp 				pi = (pcc_info_t *)alloc_base;
6805d07b933Sdp 				alloc_base += sizeof (pcc_info_t) * nranges;
6815d07b933Sdp 				page_ctrs_cands[i][r][mnode] = pi;
6835d07b933Sdp 				for (mrange = 0; mrange < nranges; mrange++) {
6845d07b933Sdp 					pi->pcc_color_free =
6855d07b933Sdp 					    (pgcnt_t *)alloc_base;
6865d07b933Sdp 					alloc_base += sizeof (pgcnt_t) *
6875d07b933Sdp 					    colors_per_szc[r];
6885d07b933Sdp 					pi++;
6895d07b933Sdp 				}
6907c478bd9Sstevel@tonic-gate 			}
6917c478bd9Sstevel@tonic-gate 		}
6927c478bd9Sstevel@tonic-gate 	}
6947c478bd9Sstevel@tonic-gate 	/* ctr_mutex */
6957c478bd9Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
6967c478bd9Sstevel@tonic-gate 		ctr_mutex[i] = (kmutex_t *)alloc_base;
6977c478bd9Sstevel@tonic-gate 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
6987c478bd9Sstevel@tonic-gate 	}
7007c478bd9Sstevel@tonic-gate 	/* initialize page list counts */
7017c478bd9Sstevel@tonic-gate 	PLCNT_INIT(alloc_base);
703ce8eb11aSdp 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
7057c478bd9Sstevel@tonic-gate 		pgcnt_t r_pgcnt;
7067c478bd9Sstevel@tonic-gate 		pfn_t	r_base;
7077c478bd9Sstevel@tonic-gate 		pgcnt_t r_align;
7087c478bd9Sstevel@tonic-gate 		int	r_shift;
7095d07b933Sdp 		int	nranges = mnode_nranges[mnode];
7117c478bd9Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
7127c478bd9Sstevel@tonic-gate 			continue;
714ce8eb11aSdp 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
7167c478bd9Sstevel@tonic-gate 		for (r = 1; r < mmu_page_sizes; r++) {
7177c478bd9Sstevel@tonic-gate 			/*
7187c478bd9Sstevel@tonic-gate 			 * the page_counters base has to be aligned to the
7197c478bd9Sstevel@tonic-gate 			 * page count of page size code r otherwise the counts
7207c478bd9Sstevel@tonic-gate 			 * will cross large page boundaries.
7217c478bd9Sstevel@tonic-gate 			 */
7227c478bd9Sstevel@tonic-gate 			r_align = page_get_pagecnt(r);
723ce8eb11aSdp 			r_base = physbase;
7247c478bd9Sstevel@tonic-gate 			/* base needs to be aligned - lower to aligned value */
7257c478bd9Sstevel@tonic-gate 			r_base &= ~(r_align - 1);
726ce8eb11aSdp 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
7277c478bd9Sstevel@tonic-gate 			r_shift = PAGE_BSZS_SHIFT(r);
7297c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
7307c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
7317c478bd9Sstevel@tonic-gate 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
7325d07b933Sdp 			for (mrange = 0; mrange < nranges; mrange++) {
7345d07b933Sdp 				    r, mrange) = (size_t *)alloc_base;
7355d07b933Sdp 				alloc_base += sizeof (size_t) *
7365d07b933Sdp 				    colors_per_szc[r];
7375d07b933Sdp 			}
7387c478bd9Sstevel@tonic-gate 			for (i = 0; i < colors_per_szc[r]; i++) {
7395d07b933Sdp 				uint_t color_mask = colors_per_szc[r] - 1;
7405d07b933Sdp 				pfn_t  pfnum = r_base;
7415d07b933Sdp 				size_t idx;
7425d07b933Sdp 				int mrange;
743ce8eb11aSdp 				MEM_NODE_ITERATOR_DECL(it);
745b779d3e0Sdp 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
746b779d3e0Sdp 				if (pfnum == (pfn_t)-1) {
747b779d3e0Sdp 					idx = 0;
748b779d3e0Sdp 				} else {
749b779d3e0Sdp 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
750b779d3e0Sdp 					    color_mask, color_mask, &it);
751b779d3e0Sdp 					idx = PNUM_TO_IDX(mnode, r, pfnum);
752b779d3e0Sdp 					idx = (idx >= r_pgcnt) ? 0 : idx;
753b779d3e0Sdp 				}
7545d07b933Sdp 				for (mrange = 0; mrange < nranges; mrange++) {
7555d07b933Sdp 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
7565d07b933Sdp 					    r, i, mrange) = idx;
7575d07b933Sdp 				}
7587c478bd9Sstevel@tonic-gate 			}
760ce8eb11aSdp 			/* hpm_counters may be shared by all mnodes */
761ce8eb11aSdp 			if (firstmn == mnode) {
762ce8eb11aSdp 				PAGE_COUNTERS_COUNTERS(mnode, r) =
763ce8eb11aSdp 				    (hpmctr_t *)alloc_base;
764ce8eb11aSdp 				alloc_base +=
765ce8eb11aSdp 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
766ce8eb11aSdp 				    sizeof (hpmctr_t *));
767ce8eb11aSdp 			} else {
768ce8eb11aSdp 				PAGE_COUNTERS_COUNTERS(mnode, r) =
769ce8eb11aSdp 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
770ce8eb11aSdp 			}
7727c478bd9Sstevel@tonic-gate 			/*
7737c478bd9Sstevel@tonic-gate 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
7747c478bd9Sstevel@tonic-gate 			 * satisfy the identity requirement.
7757c478bd9Sstevel@tonic-gate 			 * We should be able to go from one to the other
7767c478bd9Sstevel@tonic-gate 			 * and get consistent values.
7777c478bd9Sstevel@tonic-gate 			 */
7787c478bd9Sstevel@tonic-gate 			ASSERT(PNUM_TO_IDX(mnode, r,
7797c478bd9Sstevel@tonic-gate 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
7807c478bd9Sstevel@tonic-gate 			ASSERT(IDX_TO_PNUM(mnode, r,
7817c478bd9Sstevel@tonic-gate 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
7827c478bd9Sstevel@tonic-gate 		}
7837c478bd9Sstevel@tonic-gate 		/*
7847c478bd9Sstevel@tonic-gate 		 * Roundup the start address of the page_counters to
7857c478bd9Sstevel@tonic-gate 		 * cache aligned boundary for every memory node.
7867c478bd9Sstevel@tonic-gate 		 * page_ctrs_sz() has added some slop for these roundups.
7877c478bd9Sstevel@tonic-gate 		 */
7887c478bd9Sstevel@tonic-gate 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
789ce8eb11aSdp 		    L2CACHE_ALIGN);
7907c478bd9Sstevel@tonic-gate 	}
7927c478bd9Sstevel@tonic-gate 	/* Initialize other page counter specific data structures. */
7937c478bd9Sstevel@tonic-gate 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
7947c478bd9Sstevel@tonic-gate 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
7957c478bd9Sstevel@tonic-gate 	}
7977c478bd9Sstevel@tonic-gate 	return (alloc_base);
7987c478bd9Sstevel@tonic-gate }
/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	/* pp must already belong to the given mnode/mtype */
	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	/* bump the page list counts for this page's own size class */
	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	/* start at the region one size code above the page's own size */
	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		/*
		 * If this increment did not just fill the region, no
		 * larger region changed state; stop cascading upward.
		 */
		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			/*
			 * Region at size r just became completely free:
			 * record it as a coalescing candidate under the
			 * mrange of the region's leading page.
			 */
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}
8557c478bd9Sstevel@tonic-gate void
page_ctr_add(int mnode,int mtype,page_t * pp,int flags)856affbd3ccSkchow page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
8577c478bd9Sstevel@tonic-gate {
8587c478bd9Sstevel@tonic-gate 	int		lckidx = PP_CTR_LOCK_INDX(pp);
8597c478bd9Sstevel@tonic-gate 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
8617c478bd9Sstevel@tonic-gate 	mutex_enter(lock);
862affbd3ccSkchow 	page_ctr_add_internal(mnode, mtype, pp, flags);
8637c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
8647c478bd9Sstevel@tonic-gate }
/*
 * Mirror of page_ctr_add_internal(): adjust region counters downward
 * when a page leaves a free list. Caller is responsible for the
 * ctr_mutex lock if necessary.
 */
void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	/* pp must already belong to the given mnode/mtype */
	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	/* drop the page list counts for this page's own size class */
	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	/* start at the region one size code above the page's own size */
	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		/*
		 * If the region was not full before this decrement, no
		 * larger region changed state; stop cascading upward.
		 */
		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			/*
			 * Region at size r was completely free and no
			 * longer is: remove it from the coalescing
			 * candidate accounting.
			 */
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}
9185d07b933Sdp void
page_ctr_sub(int mnode,int mtype,page_t * pp,int flags)9195d07b933Sdp page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
9205d07b933Sdp {
9215d07b933Sdp 	int		lckidx = PP_CTR_LOCK_INDX(pp);
9225d07b933Sdp 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
9245d07b933Sdp 	mutex_enter(lock);
9255d07b933Sdp 	page_ctr_sub_internal(mnode, mtype, pp, flags);
9267c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
9277c478bd9Sstevel@tonic-gate }
9297c478bd9Sstevel@tonic-gate /*
9307c478bd9Sstevel@tonic-gate  * Adjust page counters following a memory attach, since typically the
9317c478bd9Sstevel@tonic-gate  * size of the array needs to change, and the PFN to counter index
9327c478bd9Sstevel@tonic-gate  * mapping needs to change.
9335d07b933Sdp  *
9345d07b933Sdp  * It is possible this mnode did not exist at startup. In that case
9355d07b933Sdp  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
9365d07b933Sdp  * to change (a theoretical possibility on x86), which means pcc_color_free
9375d07b933Sdp  * arrays must be extended.
9387c478bd9Sstevel@tonic-gate  */
9397c478bd9Sstevel@tonic-gate uint_t
page_ctrs_adjust(int mnode)9407c478bd9Sstevel@tonic-gate page_ctrs_adjust(int mnode)
9417c478bd9Sstevel@tonic-gate {
9427c478bd9Sstevel@tonic-gate 	pgcnt_t npgs;
9437c478bd9Sstevel@tonic-gate 	int	r;		/* region size */
9447c478bd9Sstevel@tonic-gate 	int	i;
9457c478bd9Sstevel@tonic-gate 	size_t	pcsz, old_csz;
9467c478bd9Sstevel@tonic-gate 	hpmctr_t *new_ctr, *old_ctr;
9477c478bd9Sstevel@tonic-gate 	pfn_t	oldbase, newbase;
948ce8eb11aSdp 	pfn_t	physbase, physmax;
9497c478bd9Sstevel@tonic-gate 	size_t	old_npgs;
9507c478bd9Sstevel@tonic-gate 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
9517c478bd9Sstevel@tonic-gate 	size_t	size_cache[MMU_PAGE_SIZES];
9525d07b933Sdp 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
9535d07b933Sdp 	size_t	*old_color_array[MAX_MNODE_MRANGES];
9547c478bd9Sstevel@tonic-gate 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
9555d07b933Sdp 	pcc_info_t **cands_cache;
9565d07b933Sdp 	pcc_info_t *old_pi, *pi;
9575d07b933Sdp 	pgcnt_t *pgcntp;
9585d07b933Sdp 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
9595d07b933Sdp 	int cands_cache_nranges;
9605d07b933Sdp 	int old_maxmrange, new_maxmrange;
9615d07b933Sdp 	int rc = 0;
9629853d9e8SJason Beloro 	int oldmnode;
9645d07b933Sdp 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
9655d07b933Sdp 	    MMU_PAGE_SIZES, KM_NOSLEEP);
9665d07b933Sdp 	if (cands_cache == NULL)
9675d07b933Sdp 		return (ENOMEM);
969ce8eb11aSdp 	i = -1;
970ce8eb11aSdp 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
972ce8eb11aSdp 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
973ce8eb11aSdp 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
9755d07b933Sdp 	/* prepare to free non-null pointers on the way out */
9765d07b933Sdp 	cands_cache_nranges = nranges;