/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 * Copyright 2019, Joyent, Inc.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/swap.h>
#include <sys/dumphdr.h>
#include <sys/random.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/secflags.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>

#ifdef __xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>

/*
 * domain 0 pages usable for DMA are pre-allocated and kept in
 * distinct lists, ordered by increasing mfn.
 */
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* used to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
int ioalloc_dbg = 0;

#endif /* __xpv */

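/*
 * x86 has no virtually-addressed cache aliasing to manage, so a single
 * VAC color suffices.
 */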
uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * Combined memory ranges from mnode and memranges[], used to manage a
 * single mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
	int	mnr_next;		/* next lower PA mnoderange */
	int	mnr_exists;
	/* maintain page list stats */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
	pgcnt_t	mnr_mt_totcnt;		/* sum of cache and free lists */
#ifdef DEBUG
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t	*mnr_mtsc_pgcnt;
	}	*mnr_mts;
#endif
} mnoderange_t;

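/*
 * Map a memory range index (mtype) to its bounding pfns.  Lower indices
 * cover higher physical addresses; range 0 extends up to physmax.
 */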
#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
/* Indices into the memory range (arch_memranges) array. */
#define	MRI_4G		0
#define	MRI_2G		1
#define	MRI_16M		2
#define	MRI_0		3
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    PFN_4GIG,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    PFN_16MEG,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
static mnoderange_t *mnoderanges;
static int mnoderangecnt;
static int mtype4g;
static int mtype16m;
static int mtypetop;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 1/16 (about 6%; desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(mtype16m)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
	(mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
	    ((freemem >= (FREEMEM16M)) || \
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

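/*
 * Hashed arrays of locks protecting the page freelists (fpc_mutex) and
 * cachelists (cpc_mutex).
 */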
kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/* Lock to protect mnoderanges array for memory DR operations. */
static kmutex_t mnoderange_lock;

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

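/*
 * Map pgcnt pages, starting at page frame pf, into kernel virtual address
 * space with the given protections, and return the base address of the
 * mapping.
 */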
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
	caddr_t addr;
	caddr_t addr1;
	page_t *pp;

	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);

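	/*
	 * Load a translation for each pfn: pfns with no page_t (device
	 * memory) go through hat_devload(), ordinary memory pages through
	 * hat_memload().
	 */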
	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
		pp = page_numtopp_nolock(pf);
		if (pp == NULL) {
			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		} else {
			hat_memload(kas.a_hat, addr, pp,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		}
	}

	return (addr1);
}

/*
 * This routine is like page_numtopp, but accepts only free pages, which
 * it allocates (unfrees) and returns with the exclusive lock held.
 * It is used by machdep.c/dma_init() to find contiguous free pages.
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
	page_t *pp;

retry:
	pp = page_numtopp_nolock(pfnum);
	if (pp == NULL) {
		return (NULL);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		return (NULL);
	}

	if (page_pptonum(pp) != pfnum) {
		page_unlock(pp);
		goto retry;
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}
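	/*
	 * If this is a constituent page of a large free page, demote the
	 * large page to base pages first, then retry on the now-demoted
	 * page.
	 */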
	if (pp->p_szc) {
		page_demote_free_pages(pp);
		page_unlock(pp);
		goto retry;
	}

	/* If associated with a vnode, destroy mappings */

	if (pp->p_vnode) {

		page_destroy_free(pp);

		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
			return (NULL);
		}

		if (page_pptonum(pp) != pfnum) {
			page_unlock(pp);
			goto retry;
		}
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}

	if (!page_reclaim(pp, (kmutex_t *)NULL))
		return (NULL);

	return (pp);
}

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
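		/*
		 * Pick the size cap for this segment type; an explicit
		 * memcntl request is bounded by mcntl0_lpsize instead.
		 */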
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.umax_page_level; l > 0; --l) {