/*
 * /illumos-gate/usr/src/uts/i86pc/vm/vm_machdep.c
 * (revision ae115bc77f6fcde83175c75b4206dc2e50747966)
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/* 4g memory management */
pgcnt_t		maxmem4g;
pgcnt_t		freemem4g;
int		physmax4g;
int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int		lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t		desfree16m = 0x380;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * Use the page size that best fits len.
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	/*
	 * For ISM, use the first large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (pgsz);
}
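
/*
 * Illustrative note (not part of the original source): with 2M large pages
 * available (mmu.max_page_level == 1 on a typical amd64 MMU) and
 * max_uheap_lpsize patched to 2M via /etc/system, a MAPPGSZ_HEAP request
 * for a 5M heap finds LEVEL_SIZE(1) == 2M <= max_lpsize and len >= 2M on
 * the first loop iteration and returns 2M; a 1M heap falls through the
 * loop and returns the 4K base page size.
 */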

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t	pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}
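
/*
 * Illustrative sketch (not part of the original source): the szcvec
 * expression above builds a bit mask of usable size codes with bit 0 (the
 * always-legal base page size) cleared.  The guarded userland snippet
 * below demonstrates the same expression.
 */
#if 0	/* example only; never compiled into the kernel */
#include <stdio.h>

int
main(void)
{
	int i;

	for (i = 1; i <= 3; i++) {
		/* same expression as in map_szcvec() above */
		unsigned int szcvec = ((1U << (i + 1)) - 1) & ~1U;
		printf("i=%d -> szcvec=0x%x\n", i, szcvec); /* 0x2, 0x6, 0xe */
	}
	return (0);
}
#endif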

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}
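
/*
 * Summary (not part of the original source) of the dispatch above:
 *
 *	mapping type		tunable			physmem floor
 *	MAP_TEXT		max_utext_lpsize	shm_lpg_min_physmem
 *	MAP_INITDATA		max_uidata_lpsize	privm_lpg_min_physmem
 *	MAPPGSZC_SHM		max_shm_lpsize		shm_lpg_min_physmem
 *	MAPPGSZC_HEAP		max_uheap_lpsize	privm_lpg_min_physmem
 *	MAPPGSZC_STACK		max_ustack_lpsize	privm_lpg_min_physmem
 *	other (private)		max_privmap_lpsize	privm_lpg_min_physmem
 *
 * memcntl(2) callers bypass the per-type tunable and use mcntl0_lpsize.
 */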

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X binary
	 * compatibility.  This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately, making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
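
/*
 * Worked example (not part of the original source; all values
 * illustrative) of the round-down-then-fix-up arithmetic in
 * map_addr_proc().  The extra align_amount added to len before as_gap()
 * guarantees the fixed-up address still fits in the hole.
 */
#if 0	/* example only; never compiled into the kernel */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uintptr_t align_amount = 0x400000;	/* 4M alignment */
	uintptr_t off = 0x3000;			/* mapping offset phase */
	uintptr_t as_addr = 0x10500000;		/* top of hole from as_gap() */
	uintptr_t addr = as_addr;

	addr &= ~(align_amount - 1);		/* round down: 0x10400000 */
	addr += off & (align_amount - 1);	/* add phase:  0x10403000 */
	if (addr < as_addr)
		addr += align_amount;		/* fix up:     0x10803000 */
	printf("addr=0x%lx\n", (unsigned long)addr);
	return (0);
}
#endif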

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
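
/*
 * Illustrative note (not part of the original source): on amd64 the
 * [hole_start, hole_end) region is the non-canonical VA hole.  A request
 * such as [hole_start - 8M, hole_start + 8M) with minlen == 4M lands in
 * the lo < hole_start, hi < hole_end case above and is clipped to
 * [hole_start - 8M, hole_start): the hole is simply trimmed off the top,
 * and the remaining 8M still satisfies minlen.
 */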

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	if (pfn_is_foreign(pf))
		return (0);
	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    0x100000,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    0x01000,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 0;
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}
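
/*
 * Illustrative note (not part of the original source): memranges[] is
 * ordered from high to low base pfn, so memrange_num() returns the first
 * index whose base the pfn reaches.  For example, pfn 0x90000 (a physical
 * address of 2.25G with 4K pages) fails pfn >= memranges[0] (0x100000)
 * but satisfies pfn >= memranges[1] (0x80000), yielding index 1, the
 * 2G-4G range.
 */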

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
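
/*
 * Illustrative sketch (not part of the original source; values
 * illustrative): the segment-boundary test in is_contigpage_free() works
 * because pfnseg is a power-of-two mask of the pfn bits within one DMA
 * segment.
 */
#if 0	/* example only; never compiled into the kernel */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pfnseg = 0xf;			/* 64K segment: 16 4K pages */
	uint64_t pfn = 14, minctg = 4;

	/* same test as in is_contigpage_free() above */
	if (((pfn + minctg - 1) & pfnseg) < (pfn & pfnseg)) {
		/* a run of 4 starting at pfn 14 crosses into the next segment */
		printf("restart at pfn %llu\n",
		    (unsigned long long)((pfn + pfnseg) & ~pfnseg)); /* 16 */
	}
	return (0);
}
#endif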

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
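
/*
 * Illustrative note (not part of the original source): minctg is
 * recomputed as howmany(*pgcnt, sgllen) after every successful chunk.
 * A request for 10 pages with dma_attr_sgllen == 3 first needs a run of
 * howmany(10, 3) == 4 contiguous pages; if exactly 4 are found, the
 * remaining 6 pages spread over the 2 remaining cookies need runs of 3
 * pages each.
 */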

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt(int mnode)
{
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}
10957c478bd9Sstevel@tonic-gate 
10967c478bd9Sstevel@tonic-gate /*
10977c478bd9Sstevel@tonic-gate  * Determine if the mnode range specified in mtype contains memory belonging
10987c478bd9Sstevel@tonic-gate  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
109907ad560dSkchow  * the range of indices from high pfn to 0, 16m or 4g.
11007c478bd9Sstevel@tonic-gate  *
11017c478bd9Sstevel@tonic-gate  * Return first mnode range type index found otherwise return -1 if none found.
11027c478bd9Sstevel@tonic-gate  */
11037c478bd9Sstevel@tonic-gate int
11047c478bd9Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags)
11057c478bd9Sstevel@tonic-gate {
11067c478bd9Sstevel@tonic-gate 	if (flags & PGI_MT_RANGE) {
110707ad560dSkchow 		int	mtlim;
11087c478bd9Sstevel@tonic-gate 
11097c478bd9Sstevel@tonic-gate 		if (flags & PGI_MT_NEXT)
11107c478bd9Sstevel@tonic-gate 			mtype--;
111107ad560dSkchow 		if (flags & PGI_MT_RANGE0)
111207ad560dSkchow 			mtlim = 0;
111307ad560dSkchow 		else if (flags & PGI_MT_RANGE4G)
111407ad560dSkchow 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
111507ad560dSkchow 		else if (flags & PGI_MT_RANGE16M)
111607ad560dSkchow 			mtlim = 1;		/* exclude 0-16m range */
11177c478bd9Sstevel@tonic-gate 		while (mtype >= mtlim) {
11187c478bd9Sstevel@tonic-gate 			if (mnoderanges[mtype].mnr_mnode == mnode)
11197c478bd9Sstevel@tonic-gate 				return (mtype);
11207c478bd9Sstevel@tonic-gate 			mtype--;
11217c478bd9Sstevel@tonic-gate 		}
11227c478bd9Sstevel@tonic-gate 	} else {
11237c478bd9Sstevel@tonic-gate 		if (mnoderanges[mtype].mnr_mnode == mnode)
11247c478bd9Sstevel@tonic-gate 			return (mtype);
11257c478bd9Sstevel@tonic-gate 	}
11267c478bd9Sstevel@tonic-gate 	return (-1);
11277c478bd9Sstevel@tonic-gate }
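
/*
 * A minimal usage sketch, mirroring mnode_pgcnt() below: visit every
 * mnoderange belonging to a given mnode, highest pfn range first.
 *
 *	mtype = mtype_func(mnode, mnoderangecnt - 1, PGI_MT_RANGE0);
 *	while (mtype != -1) {
 *		... use mnoderanges[mtype] ...
 *		mtype = mtype_func(mnode, mtype,
 *		    PGI_MT_RANGE0 | PGI_MT_NEXT);
 *	}
 */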
11287c478bd9Sstevel@tonic-gate 
1129e21bae1bSkchow /*
1130e21bae1bSkchow  * Update the page list max counts with the pfn range specified by the
1131e21bae1bSkchow  * input parameters.  Called from add_physmem() when physical memory with
1132e21bae1bSkchow  * page_t's are initially added to the page lists.
1133e21bae1bSkchow  */
1134e21bae1bSkchow void
1135e21bae1bSkchow mtype_modify_max(pfn_t startpfn, long cnt)
1136e21bae1bSkchow {
1137e21bae1bSkchow 	int	mtype = 0;
1138e21bae1bSkchow 	pfn_t	endpfn = startpfn + cnt, pfn;
1139e21bae1bSkchow 	pgcnt_t	inc;
1140e21bae1bSkchow 
1141e21bae1bSkchow 	ASSERT(cnt > 0);
1142e21bae1bSkchow 
1143e21bae1bSkchow 	for (pfn = startpfn; pfn < endpfn; ) {
1144e21bae1bSkchow 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
1145e21bae1bSkchow 			if (endpfn <= mnoderanges[mtype].mnr_pfnhi) {
1146e21bae1bSkchow 				inc = endpfn - pfn;
1147e21bae1bSkchow 			} else {
1148e21bae1bSkchow 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
1149e21bae1bSkchow 			}
1150e21bae1bSkchow 			mnoderanges[mtype].mnr_mt_pgmax += inc;
1151e21bae1bSkchow 			if (physmax4g && mtype <= mtype4g)
1152e21bae1bSkchow 				maxmem4g += inc;
1153e21bae1bSkchow 			pfn += inc;
1154e21bae1bSkchow 		}
1155e21bae1bSkchow 		mtype++;
1156e21bae1bSkchow 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
1157e21bae1bSkchow 	}
1158e21bae1bSkchow }
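
/*
 * Worked numbers for the loop above, reusing the hypothetical layout
 * from the earlier examples (16m boundary at pfn 0x1000): adding pfns
 * 0xe00 through 0x17ff (startpfn == 0xe00, cnt == 0xa00) first lands in
 * mtype 0 with mnr_pfnhi == 0xfff, so inc == 0xfff - 0xe00 + 1 == 0x200
 * pages go to the mnr_mt_pgmax of mtype 0; pfn then advances to 0x1000,
 * mtype to 1, and the remaining inc == endpfn - pfn == 0x800 pages are
 * credited to mtype 1.
 */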
1159e21bae1bSkchow 
1160affbd3ccSkchow /*
1161affbd3ccSkchow  * Returns the free page count for mnode
1162affbd3ccSkchow  */
1163affbd3ccSkchow int
1164affbd3ccSkchow mnode_pgcnt(int mnode)
1165affbd3ccSkchow {
1166affbd3ccSkchow 	int	mtype = mnoderangecnt - 1;
1167affbd3ccSkchow 	int	flags = PGI_MT_RANGE0;
1168affbd3ccSkchow 	pgcnt_t	pgcnt = 0;
1169affbd3ccSkchow 
1170affbd3ccSkchow 	mtype = mtype_func(mnode, mtype, flags);
1171affbd3ccSkchow 
1172affbd3ccSkchow 	while (mtype != -1) {
117307ad560dSkchow 		pgcnt += MTYPE_FREEMEM(mtype);
1174affbd3ccSkchow 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1175affbd3ccSkchow 	}
1176affbd3ccSkchow 	return (pgcnt);
1177affbd3ccSkchow }
1178affbd3ccSkchow 
11797c478bd9Sstevel@tonic-gate /*
11807c478bd9Sstevel@tonic-gate  * Initialize page coloring variables based on the l2 cache parameters.
11817c478bd9Sstevel@tonic-gate  * Calculate and return memory needed for page coloring data structures.
11827c478bd9Sstevel@tonic-gate  */
11837c478bd9Sstevel@tonic-gate size_t
11847c478bd9Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
11857c478bd9Sstevel@tonic-gate {
11867c478bd9Sstevel@tonic-gate 	size_t	colorsz = 0;
11877c478bd9Sstevel@tonic-gate 	int	i;
11887c478bd9Sstevel@tonic-gate 	int	colors;
11897c478bd9Sstevel@tonic-gate 
11907c478bd9Sstevel@tonic-gate 	/*
11917c478bd9Sstevel@tonic-gate 	 * Reduce the memory range lists if we don't have large amounts
11927c478bd9Sstevel@tonic-gate 	 * of memory. This avoids searching known empty free lists.
11937c478bd9Sstevel@tonic-gate 	 */
11947c478bd9Sstevel@tonic-gate 	i = memrange_num(physmax);
11957c478bd9Sstevel@tonic-gate 	memranges += i;
11967c478bd9Sstevel@tonic-gate 	nranges -= i;
11977c478bd9Sstevel@tonic-gate #if defined(__i386)
11987c478bd9Sstevel@tonic-gate 	if (i > 0)
11997c478bd9Sstevel@tonic-gate 		restricted_kmemalloc = 0;
12007c478bd9Sstevel@tonic-gate #endif
12017c478bd9Sstevel@tonic-gate 	/* physmax greater than 4g */
12027c478bd9Sstevel@tonic-gate 	if (i == 0)
12037c478bd9Sstevel@tonic-gate 		physmax4g = 1;
12047c478bd9Sstevel@tonic-gate 
12057c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(l2_sz));
12067c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(l2_linesz));
12077c478bd9Sstevel@tonic-gate 	ASSERT(l2_sz > MMU_PAGESIZE);
12087c478bd9Sstevel@tonic-gate 
12097c478bd9Sstevel@tonic-gate 	/* l2_assoc is 0 for fully associative l2 cache */
12107c478bd9Sstevel@tonic-gate 	if (l2_assoc)
12117c478bd9Sstevel@tonic-gate 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
12127c478bd9Sstevel@tonic-gate 	else
12137c478bd9Sstevel@tonic-gate 		l2_colors = 1;
12147c478bd9Sstevel@tonic-gate 
12157c478bd9Sstevel@tonic-gate 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
12167c478bd9Sstevel@tonic-gate 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
12177c478bd9Sstevel@tonic-gate 
12187c478bd9Sstevel@tonic-gate 	/*
12197c478bd9Sstevel@tonic-gate 	 * cpu_page_colors is non-zero when a page color may be spread across
12207c478bd9Sstevel@tonic-gate 	 * multiple bins.
12217c478bd9Sstevel@tonic-gate 	 */
12227c478bd9Sstevel@tonic-gate 	if (l2_colors < page_colors)
12237c478bd9Sstevel@tonic-gate 		cpu_page_colors = l2_colors;
12247c478bd9Sstevel@tonic-gate 
12257c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(page_colors));
12267c478bd9Sstevel@tonic-gate 
12277c478bd9Sstevel@tonic-gate 	page_colors_mask = page_colors - 1;
12287c478bd9Sstevel@tonic-gate 
12297c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(CPUSETSIZE()));
12307c478bd9Sstevel@tonic-gate 	page_coloring_shift = lowbit(CPUSETSIZE());
12317c478bd9Sstevel@tonic-gate 
12325d07b933Sdp 	/* initialize number of colors per page size */
12335d07b933Sdp 	for (i = 0; i <= mmu.max_page_level; i++) {
12345d07b933Sdp 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
12355d07b933Sdp 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
12365d07b933Sdp 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
12375d07b933Sdp 		hw_page_array[i].hp_colors = (page_colors_mask >>
12385d07b933Sdp 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
12395d07b933Sdp 		    + 1;
12405d07b933Sdp 	}
12415d07b933Sdp 
12425d07b933Sdp 	/*
12435d07b933Sdp 	 * The value of cpu_page_colors determines if additional color bins
12445d07b933Sdp 	 * need to be checked for a particular color in the page_get routines.
12455d07b933Sdp 	 */
12465d07b933Sdp 	if (cpu_page_colors != 0) {
12475d07b933Sdp 
12485d07b933Sdp 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
12495d07b933Sdp 		ASSERT(a > 0);
12505d07b933Sdp 		ASSERT(a < 16);
12515d07b933Sdp 
12525d07b933Sdp 		for (i = 0; i <= mmu.max_page_level; i++) {
12535d07b933Sdp 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
12545d07b933Sdp 				colorequivszc[i] = 0;
12555d07b933Sdp 				continue;
12565d07b933Sdp 			}
12575d07b933Sdp 			while ((colors >> a) == 0)
12585d07b933Sdp 				a--;
12595d07b933Sdp 			ASSERT(a >= 0);
12605d07b933Sdp 
12615d07b933Sdp 			/* higher 4 bits encodes color equiv mask */
12625d07b933Sdp 			colorequivszc[i] = (a << 4);
12635d07b933Sdp 		}
12645d07b933Sdp 	}
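
	/*
	 * Encoding example with hypothetical numbers: page_colors == 64
	 * and cpu_page_colors == 16 give a == lowbit(64) - lowbit(16) == 2,
	 * so colorequivszc[0] == (2 << 4) == 0x20 -- each base page color
	 * is equivalent to a class of 2^2 == 4 neighboring bins.
	 */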
12655d07b933Sdp 
12665d07b933Sdp 	/* factor in colorequiv to check additional 'equivalent' bins. */
12675d07b933Sdp 	if (colorequiv > 1) {
12685d07b933Sdp 
12695d07b933Sdp 		int a = lowbit(colorequiv) - 1;
12705d07b933Sdp 		if (a > 15)
12715d07b933Sdp 			a = 15;
12725d07b933Sdp 
12735d07b933Sdp 		for (i = 0; i <= mmu.max_page_level; i++) {
12745d07b933Sdp 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
12755d07b933Sdp 				continue;
12765d07b933Sdp 			}
12775d07b933Sdp 			while ((colors >> a) == 0)
12785d07b933Sdp 				a--;
12795d07b933Sdp 			if ((a << 4) > colorequivszc[i]) {
12805d07b933Sdp 				colorequivszc[i] = (a << 4);
12815d07b933Sdp 			}
12825d07b933Sdp 		}
12835d07b933Sdp 	}
12845d07b933Sdp 
12857c478bd9Sstevel@tonic-gate 	/* size for mnoderanges */
12865d07b933Sdp 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
12875d07b933Sdp 		mnoderangecnt += mnode_range_cnt(i);
12887c478bd9Sstevel@tonic-gate 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
12897c478bd9Sstevel@tonic-gate 
12907c478bd9Sstevel@tonic-gate 	/* size for fpc_mutex and cpc_mutex */
12917c478bd9Sstevel@tonic-gate 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
12927c478bd9Sstevel@tonic-gate 
12937c478bd9Sstevel@tonic-gate 	/* size of page_freelists */
12947c478bd9Sstevel@tonic-gate 	colorsz += mnoderangecnt * sizeof (page_t ***);
12957c478bd9Sstevel@tonic-gate 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
12967c478bd9Sstevel@tonic-gate 
12977c478bd9Sstevel@tonic-gate 	for (i = 0; i < mmu_page_sizes; i++) {
12987c478bd9Sstevel@tonic-gate 		colors = page_get_pagecolors(i);
12997c478bd9Sstevel@tonic-gate 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
13007c478bd9Sstevel@tonic-gate 	}
13017c478bd9Sstevel@tonic-gate 
13027c478bd9Sstevel@tonic-gate 	/* size of page_cachelists */
13037c478bd9Sstevel@tonic-gate 	colorsz += mnoderangecnt * sizeof (page_t **);
13047c478bd9Sstevel@tonic-gate 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
13057c478bd9Sstevel@tonic-gate 
13067c478bd9Sstevel@tonic-gate 	return (colorsz);
13077c478bd9Sstevel@tonic-gate }
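
/*
 * Worked color arithmetic for page_coloring_init(), with hypothetical
 * cache parameters: l2_sz == 2m, l2_linesz == 64 and l2_assoc == 8 give
 * l2_colors == MAX(1, 2m / (8 * 4k)) == 64; assuming PAGE_COLORS_MIN is
 * no larger than that, page_colors == 64 and page_colors_mask == 0x3f.
 * Since l2_colors is not less than page_colors here, cpu_page_colors
 * stays 0 and no extra equivalence bins need to be searched.
 */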
13087c478bd9Sstevel@tonic-gate 
13097c478bd9Sstevel@tonic-gate /*
13107c478bd9Sstevel@tonic-gate  * Called once at startup to configure page_coloring data structures and
13117c478bd9Sstevel@tonic-gate  * must be called before the 1st page_free()/page_freelist_add().
13127c478bd9Sstevel@tonic-gate  */
13137c478bd9Sstevel@tonic-gate void
13147c478bd9Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr)
13157c478bd9Sstevel@tonic-gate {
13167c478bd9Sstevel@tonic-gate 	int	i;
13177c478bd9Sstevel@tonic-gate 	int	j;
13187c478bd9Sstevel@tonic-gate 	int	k;
13197c478bd9Sstevel@tonic-gate 	caddr_t	addr;
13207c478bd9Sstevel@tonic-gate 	int	colors;
13217c478bd9Sstevel@tonic-gate 
13227c478bd9Sstevel@tonic-gate 	/*
13237c478bd9Sstevel@tonic-gate 	 * do page coloring setup
13247c478bd9Sstevel@tonic-gate 	 */
13257c478bd9Sstevel@tonic-gate 	addr = pcmemaddr;
13267c478bd9Sstevel@tonic-gate 
13277c478bd9Sstevel@tonic-gate 	mnoderanges = (mnoderange_t *)addr;
13287c478bd9Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (mnoderange_t));
13297c478bd9Sstevel@tonic-gate 
13307c478bd9Sstevel@tonic-gate 	mnode_range_setup(mnoderanges);
13317c478bd9Sstevel@tonic-gate 
13327c478bd9Sstevel@tonic-gate 	if (physmax4g)
13337c478bd9Sstevel@tonic-gate 		mtype4g = pfn_2_mtype(0xfffff);
13347c478bd9Sstevel@tonic-gate 
13357c478bd9Sstevel@tonic-gate 	for (k = 0; k < NPC_MUTEX; k++) {
13367c478bd9Sstevel@tonic-gate 		fpc_mutex[k] = (kmutex_t *)addr;
13377c478bd9Sstevel@tonic-gate 		addr += (max_mem_nodes * sizeof (kmutex_t));
13387c478bd9Sstevel@tonic-gate 	}
13397c478bd9Sstevel@tonic-gate 	for (k = 0; k < NPC_MUTEX; k++) {
13407c478bd9Sstevel@tonic-gate 		cpc_mutex[k] = (kmutex_t *)addr;
13417c478bd9Sstevel@tonic-gate 		addr += (max_mem_nodes * sizeof (kmutex_t));
13427c478bd9Sstevel@tonic-gate 	}
13437c478bd9Sstevel@tonic-gate 	page_freelists = (page_t ****)addr;
13447c478bd9Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (page_t ***));
13457c478bd9Sstevel@tonic-gate 
13467c478bd9Sstevel@tonic-gate 	page_cachelists = (page_t ***)addr;
13477c478bd9Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (page_t **));
13487c478bd9Sstevel@tonic-gate 
13497c478bd9Sstevel@tonic-gate 	for (i = 0; i < mnoderangecnt; i++) {
13507c478bd9Sstevel@tonic-gate 		page_freelists[i] = (page_t ***)addr;
13517c478bd9Sstevel@tonic-gate 		addr += (mmu_page_sizes * sizeof (page_t **));
13527c478bd9Sstevel@tonic-gate 
13537c478bd9Sstevel@tonic-gate 		for (j = 0; j < mmu_page_sizes; j++) {
13547c478bd9Sstevel@tonic-gate 			colors = page_get_pagecolors(j);
13557c478bd9Sstevel@tonic-gate 			page_freelists[i][j] = (page_t **)addr;
13567c478bd9Sstevel@tonic-gate 			addr += (colors * sizeof (page_t *));
13577c478bd9Sstevel@tonic-gate 		}
13587c478bd9Sstevel@tonic-gate 		page_cachelists[i] = (page_t **)addr;
13597c478bd9Sstevel@tonic-gate 		addr += (page_colors * sizeof (page_t *));
13607c478bd9Sstevel@tonic-gate 	}
13617c478bd9Sstevel@tonic-gate }
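
/*
 * Note that the carve-up above must consume pcmemaddr in exactly the
 * order and amounts that page_coloring_init() summed into colorsz:
 * mnoderanges first, then the fpc/cpc mutex arrays, then the freelist
 * and cachelist pointer tables.  A change to either function has to be
 * mirrored in the other, or the later arrays would run past the end of
 * the allocation.
 */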
13627c478bd9Sstevel@tonic-gate 
13637c478bd9Sstevel@tonic-gate /*ARGSUSED*/
13647c478bd9Sstevel@tonic-gate int
13657c478bd9Sstevel@tonic-gate bp_color(struct buf *bp)
13667c478bd9Sstevel@tonic-gate {
13677c478bd9Sstevel@tonic-gate 	return (0);
13687c478bd9Sstevel@tonic-gate }
13697c478bd9Sstevel@tonic-gate 
13707c478bd9Sstevel@tonic-gate /*
13717c478bd9Sstevel@tonic-gate  * get a page from any list with the given mnode
13727c478bd9Sstevel@tonic-gate  */
13737c478bd9Sstevel@tonic-gate page_t *
13747c478bd9Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
13757c478bd9Sstevel@tonic-gate     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
13767c478bd9Sstevel@tonic-gate {
13775d07b933Sdp 	kmutex_t		*pcm;
13785d07b933Sdp 	int			i;
13795d07b933Sdp 	page_t			*pp;
13805d07b933Sdp 	page_t			*first_pp;
13815d07b933Sdp 	uint64_t		pgaddr;
13825d07b933Sdp 	ulong_t			bin;
13835d07b933Sdp 	int			mtypestart;
13845d07b933Sdp 	int			plw_initialized;
13855d07b933Sdp 	page_list_walker_t	plw;
13867c478bd9Sstevel@tonic-gate 
13877c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
13887c478bd9Sstevel@tonic-gate 
13897c478bd9Sstevel@tonic-gate 	ASSERT((flags & PG_MATCH_COLOR) == 0);
13907c478bd9Sstevel@tonic-gate 	ASSERT(szc == 0);
13917c478bd9Sstevel@tonic-gate 	ASSERT(dma_attr != NULL);
13927c478bd9Sstevel@tonic-gate 
13937c478bd9Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
13947c478bd9Sstevel@tonic-gate 	if (mtype < 0) {
13957c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
13967c478bd9Sstevel@tonic-gate 		return (NULL);
13977c478bd9Sstevel@tonic-gate 	}
13987c478bd9Sstevel@tonic-gate 
13997c478bd9Sstevel@tonic-gate 	mtypestart = mtype;
14007c478bd9Sstevel@tonic-gate 
14017c478bd9Sstevel@tonic-gate 	bin = origbin;
14027c478bd9Sstevel@tonic-gate 
14037c478bd9Sstevel@tonic-gate 	/*
14047c478bd9Sstevel@tonic-gate 	 * check up to page_colors + 1 bins - origbin may be checked twice
14057c478bd9Sstevel@tonic-gate 	 * because of BIN_STEP skip
14067c478bd9Sstevel@tonic-gate 	 */
14077c478bd9Sstevel@tonic-gate 	do {
14085d07b933Sdp 		plw_initialized = 0;
14095d07b933Sdp 
14105d07b933Sdp 		for (plw.plw_count = 0;
14115d07b933Sdp 		    plw.plw_count < page_colors; plw.plw_count++) {
14125d07b933Sdp 
14137c478bd9Sstevel@tonic-gate 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
14147c478bd9Sstevel@tonic-gate 				goto nextfreebin;
14157c478bd9Sstevel@tonic-gate 
14167c478bd9Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
14177c478bd9Sstevel@tonic-gate 			mutex_enter(pcm);
14187c478bd9Sstevel@tonic-gate 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
14197c478bd9Sstevel@tonic-gate 			first_pp = pp;
14207c478bd9Sstevel@tonic-gate 			while (pp != NULL) {
14217c478bd9Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
14227c478bd9Sstevel@tonic-gate 					pp = pp->p_next;
14237c478bd9Sstevel@tonic-gate 					if (pp == first_pp) {
14247c478bd9Sstevel@tonic-gate 						pp = NULL;
14257c478bd9Sstevel@tonic-gate 					}
14267c478bd9Sstevel@tonic-gate 					continue;
14277c478bd9Sstevel@tonic-gate 				}
14287c478bd9Sstevel@tonic-gate 
14297c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
14307c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
14317c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
14327c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
14337c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
14347c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
14357c478bd9Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
14367c478bd9Sstevel@tonic-gate 				/* check if page within DMA attributes */
1437*ae115bc7Smrj 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
14387c478bd9Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
14397c478bd9Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
14407c478bd9Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
14417c478bd9Sstevel@tonic-gate 					break;
14427c478bd9Sstevel@tonic-gate 				}
14437c478bd9Sstevel@tonic-gate 
14447c478bd9Sstevel@tonic-gate 				/* continue looking */
14457c478bd9Sstevel@tonic-gate 				page_unlock(pp);
14467c478bd9Sstevel@tonic-gate 				pp = pp->p_next;
14477c478bd9Sstevel@tonic-gate 				if (pp == first_pp)
14487c478bd9Sstevel@tonic-gate 					pp = NULL;
14497c478bd9Sstevel@tonic-gate 
14507c478bd9Sstevel@tonic-gate 			}
14517c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
14527c478bd9Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
14537c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
14547c478bd9Sstevel@tonic-gate 
14557c478bd9Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
14567c478bd9Sstevel@tonic-gate 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
14577c478bd9Sstevel@tonic-gate 				    mtype), pp);
1458affbd3ccSkchow 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
14597c478bd9Sstevel@tonic-gate 
14607c478bd9Sstevel@tonic-gate 				if ((PP_ISFREE(pp) == 0) ||
14617c478bd9Sstevel@tonic-gate 				    (PP_ISAGED(pp) == 0)) {
14627c478bd9Sstevel@tonic-gate 					cmn_err(CE_PANIC, "page %p is not free",
14637c478bd9Sstevel@tonic-gate 					    (void *)pp);
14647c478bd9Sstevel@tonic-gate 				}
14657c478bd9Sstevel@tonic-gate 
14667c478bd9Sstevel@tonic-gate 				mutex_exit(pcm);
14677c478bd9Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
14687c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
14697c478bd9Sstevel@tonic-gate 				return (pp);
14707c478bd9Sstevel@tonic-gate 			}
14717c478bd9Sstevel@tonic-gate 			mutex_exit(pcm);
14727c478bd9Sstevel@tonic-gate nextfreebin:
14735d07b933Sdp 			if (plw_initialized == 0) {
14745d07b933Sdp 				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
14755d07b933Sdp 				ASSERT(plw.plw_ceq_dif == page_colors);
14765d07b933Sdp 				plw_initialized = 1;
14775d07b933Sdp 			}
14787c478bd9Sstevel@tonic-gate 
14795d07b933Sdp 			if (plw.plw_do_split) {
14805d07b933Sdp 				pp = page_freelist_split(szc, bin, mnode,
14815d07b933Sdp 				    mtype,
14825d07b933Sdp 				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
14835d07b933Sdp 				    &plw);
14845d07b933Sdp 				if (pp != NULL)
14855d07b933Sdp 					return (pp);
14865d07b933Sdp 			}
14875d07b933Sdp 
14885d07b933Sdp 			bin = page_list_walk_next_bin(szc, bin, &plw);
14897c478bd9Sstevel@tonic-gate 		}
14905d07b933Sdp 
1491affbd3ccSkchow 		MTYPE_NEXT(mnode, mtype, flags);
1492affbd3ccSkchow 	} while (mtype >= 0);
14937c478bd9Sstevel@tonic-gate 
14947c478bd9Sstevel@tonic-gate 	/* failed to find a page in the freelist; try it in the cachelist */
14957c478bd9Sstevel@tonic-gate 
14967c478bd9Sstevel@tonic-gate 	/* reset mtype start for cachelist search */
14977c478bd9Sstevel@tonic-gate 	mtype = mtypestart;
14987c478bd9Sstevel@tonic-gate 	ASSERT(mtype >= 0);
14997c478bd9Sstevel@tonic-gate 
15007c478bd9Sstevel@tonic-gate 	/* start with the bin of matching color */
15017c478bd9Sstevel@tonic-gate 	bin = origbin;
15027c478bd9Sstevel@tonic-gate 
15037c478bd9Sstevel@tonic-gate 	do {
15047c478bd9Sstevel@tonic-gate 		for (i = 0; i <= page_colors; i++) {
15057c478bd9Sstevel@tonic-gate 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
15067c478bd9Sstevel@tonic-gate 				goto nextcachebin;
15077c478bd9Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
15087c478bd9Sstevel@tonic-gate 			mutex_enter(pcm);
15097c478bd9Sstevel@tonic-gate 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
15107c478bd9Sstevel@tonic-gate 			first_pp = pp;
15117c478bd9Sstevel@tonic-gate 			while (pp != NULL) {
15127c478bd9Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
15137c478bd9Sstevel@tonic-gate 					pp = pp->p_next;
15147c478bd9Sstevel@tonic-gate 					if (pp == first_pp)
15157c478bd9Sstevel@tonic-gate 						pp = NULL;
15167c478bd9Sstevel@tonic-gate 					continue;
15177c478bd9Sstevel@tonic-gate 				}
15187c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
15197c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
15207c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
15217c478bd9Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
15227c478bd9Sstevel@tonic-gate 
15237c478bd9Sstevel@tonic-gate 				/* check if page within DMA attributes */
15247c478bd9Sstevel@tonic-gate 
1525*ae115bc7Smrj 				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
15267c478bd9Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
15277c478bd9Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
15287c478bd9Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
15297c478bd9Sstevel@tonic-gate 					break;
15307c478bd9Sstevel@tonic-gate 				}
15317c478bd9Sstevel@tonic-gate 
15327c478bd9Sstevel@tonic-gate 				/* continue looking */
15337c478bd9Sstevel@tonic-gate 				page_unlock(pp);
15347c478bd9Sstevel@tonic-gate 				pp = pp->p_next;
15357c478bd9Sstevel@tonic-gate 				if (pp == first_pp)
15367c478bd9Sstevel@tonic-gate 					pp = NULL;
15377c478bd9Sstevel@tonic-gate 			}
15387c478bd9Sstevel@tonic-gate 
15397c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
15407c478bd9Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
15417c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
15427c478bd9Sstevel@tonic-gate 
15437c478bd9Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
15447c478bd9Sstevel@tonic-gate 				page_sub(&PAGE_CACHELISTS(mnode, bin,
15457c478bd9Sstevel@tonic-gate 				    mtype), pp);
1546affbd3ccSkchow 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
15477c478bd9Sstevel@tonic-gate 
15487c478bd9Sstevel@tonic-gate 				mutex_exit(pcm);
15497c478bd9Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
15507c478bd9Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
15517c478bd9Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
15527c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
15537c478bd9Sstevel@tonic-gate 				return (pp);
15547c478bd9Sstevel@tonic-gate 			}
15557c478bd9Sstevel@tonic-gate 			mutex_exit(pcm);
15567c478bd9Sstevel@tonic-gate nextcachebin:
15577c478bd9Sstevel@tonic-gate 			bin += (i == 0) ? BIN_STEP : 1;
15587c478bd9Sstevel@tonic-gate 			bin &= page_colors_mask;
15597c478bd9Sstevel@tonic-gate 		}
1560affbd3ccSkchow 		MTYPE_NEXT(mnode, mtype, flags);
1561affbd3ccSkchow 	} while (mtype >= 0);
15627c478bd9Sstevel@tonic-gate 
15637c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
15647c478bd9Sstevel@tonic-gate 	return (NULL);
15657c478bd9Sstevel@tonic-gate }
15667c478bd9Sstevel@tonic-gate 
15677c478bd9Sstevel@tonic-gate /*
15687c478bd9Sstevel@tonic-gate  * This function is similar to page_get_freelist()/page_get_cachelist()
15697c478bd9Sstevel@tonic-gate  * but it searches both the lists to find a page with the specified
15707c478bd9Sstevel@tonic-gate  * color (or no color) and DMA attributes. The search is done in the
15717c478bd9Sstevel@tonic-gate  * freelist first and then in the cache list within the highest memory
15727c478bd9Sstevel@tonic-gate  * range (based on DMA attributes) before searching in the lower
15737c478bd9Sstevel@tonic-gate  * memory ranges.
15747c478bd9Sstevel@tonic-gate  *
15757c478bd9Sstevel@tonic-gate  * Note: This function is called only by page_create_io().
15767c478bd9Sstevel@tonic-gate  */
15777c478bd9Sstevel@tonic-gate /*ARGSUSED*/
15787c478bd9Sstevel@tonic-gate page_t *
15797c478bd9Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
15807c478bd9Sstevel@tonic-gate     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
15817c478bd9Sstevel@tonic-gate {
15827c478bd9Sstevel@tonic-gate 	uint_t		bin;
15837c478bd9Sstevel@tonic-gate 	int		mtype;
15847c478bd9Sstevel@tonic-gate 	page_t		*pp;
15857c478bd9Sstevel@tonic-gate 	int		n;
15867c478bd9Sstevel@tonic-gate 	int		m;
15877c478bd9Sstevel@tonic-gate 	int		szc;
15887c478bd9Sstevel@tonic-gate 	int		fullrange;
15897c478bd9Sstevel@tonic-gate 	int		mnode;
15907c478bd9Sstevel@tonic-gate 	int		local_failed_stat = 0;
15917c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
15927c478bd9Sstevel@tonic-gate 
15937c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_alloc);
15947c478bd9Sstevel@tonic-gate 
15957c478bd9Sstevel@tonic-gate 	/* only base pagesize currently supported */
15967c478bd9Sstevel@tonic-gate 	if (size != MMU_PAGESIZE)
15977c478bd9Sstevel@tonic-gate 		return (NULL);
15987c478bd9Sstevel@tonic-gate 
15997c478bd9Sstevel@tonic-gate 	/*
16007c478bd9Sstevel@tonic-gate 	 * If we're passed a specific lgroup, we use it.  Otherwise,
16017c478bd9Sstevel@tonic-gate 	 * assume first-touch placement is desired.
16027c478bd9Sstevel@tonic-gate 	 */
16037c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
16047c478bd9Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
16057c478bd9Sstevel@tonic-gate 
16067c478bd9Sstevel@tonic-gate 	/* LINTED */
16075d07b933Sdp 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
16087c478bd9Sstevel@tonic-gate 
16097c478bd9Sstevel@tonic-gate 	/*
16107c478bd9Sstevel@tonic-gate 	 * Only hold one freelist or cachelist lock at a time, that way we
16117c478bd9Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
16127c478bd9Sstevel@tonic-gate 	 * ordering.
16137c478bd9Sstevel@tonic-gate 	 */
16147c478bd9Sstevel@tonic-gate 	if (dma_attr == NULL) {
16157c478bd9Sstevel@tonic-gate 		n = 0;
16167c478bd9Sstevel@tonic-gate 		m = mnoderangecnt - 1;
16177c478bd9Sstevel@tonic-gate 		fullrange = 1;
16187c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
16197c478bd9Sstevel@tonic-gate 	} else {
16207c478bd9Sstevel@tonic-gate 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
16217c478bd9Sstevel@tonic-gate 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
16227c478bd9Sstevel@tonic-gate 
16237c478bd9Sstevel@tonic-gate 		/*
16247c478bd9Sstevel@tonic-gate 		 * We can only guarantee alignment up to a page boundary.
16257c478bd9Sstevel@tonic-gate 		 */
16267c478bd9Sstevel@tonic-gate 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
16277c478bd9Sstevel@tonic-gate 			return (NULL);
16287c478bd9Sstevel@tonic-gate 
16297c478bd9Sstevel@tonic-gate 		n = pfn_2_mtype(pfnlo);
16307c478bd9Sstevel@tonic-gate 		m = pfn_2_mtype(pfnhi);
16317c478bd9Sstevel@tonic-gate 
16327c478bd9Sstevel@tonic-gate 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
16337c478bd9Sstevel@tonic-gate 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
16347c478bd9Sstevel@tonic-gate 	}
16357c478bd9Sstevel@tonic-gate 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
16367c478bd9Sstevel@tonic-gate 
16377c478bd9Sstevel@tonic-gate 	if (n > m)
16387c478bd9Sstevel@tonic-gate 		return (NULL);
16397c478bd9Sstevel@tonic-gate 
16407c478bd9Sstevel@tonic-gate 	szc = 0;
16417c478bd9Sstevel@tonic-gate 
16427c478bd9Sstevel@tonic-gate 	/* cycling through mtype handled by RANGE0 if n == 0 */
16437c478bd9Sstevel@tonic-gate 	if (n == 0) {
16447c478bd9Sstevel@tonic-gate 		flags |= PGI_MT_RANGE0;
16457c478bd9Sstevel@tonic-gate 		n = m;
16467c478bd9Sstevel@tonic-gate 	}
16477c478bd9Sstevel@tonic-gate 
16487c478bd9Sstevel@tonic-gate 	/*
16497c478bd9Sstevel@tonic-gate 	 * Try local memory node first, but try remote if we can't
16507c478bd9Sstevel@tonic-gate 	 * get a page of the right color.
16517c478bd9Sstevel@tonic-gate 	 */
16527c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
16537c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
16547c478bd9Sstevel@tonic-gate 		/*
16557c478bd9Sstevel@tonic-gate 		 * allocate pages from high pfn to low.
16567c478bd9Sstevel@tonic-gate 		 */
16577c478bd9Sstevel@tonic-gate 		for (mtype = m; mtype >= n; mtype--) {
16587c478bd9Sstevel@tonic-gate 			if (fullrange != 0) {
16597c478bd9Sstevel@tonic-gate 				pp = page_get_mnode_freelist(mnode,
16607c478bd9Sstevel@tonic-gate 				    bin, mtype, szc, flags);
16617c478bd9Sstevel@tonic-gate 				if (pp == NULL) {
16627c478bd9Sstevel@tonic-gate 					pp = page_get_mnode_cachelist(
16637c478bd9Sstevel@tonic-gate 				    bin, flags, mnode, mtype);
16647c478bd9Sstevel@tonic-gate 				}
16657c478bd9Sstevel@tonic-gate 			} else {
16667c478bd9Sstevel@tonic-gate 				pp = page_get_mnode_anylist(bin, szc,
16677c478bd9Sstevel@tonic-gate 				    flags, mnode, mtype, dma_attr);
16687c478bd9Sstevel@tonic-gate 			}
16697c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
16707c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pga_allocok);
16717c478bd9Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
16727c478bd9Sstevel@tonic-gate 				return (pp);
16737c478bd9Sstevel@tonic-gate 			}
16747c478bd9Sstevel@tonic-gate 		}
16757c478bd9Sstevel@tonic-gate 		if (!local_failed_stat) {
16767c478bd9Sstevel@tonic-gate 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
16777c478bd9Sstevel@tonic-gate 			local_failed_stat = 1;
16787c478bd9Sstevel@tonic-gate 		}
16797c478bd9Sstevel@tonic-gate 	}
16807c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
16817c478bd9Sstevel@tonic-gate 
16827c478bd9Sstevel@tonic-gate 	return (NULL);
16837c478bd9Sstevel@tonic-gate }
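
/*
 * A minimal caller sketch -- not part of the original file, and the
 * function name and attribute values are hypothetical.  It shows how the
 * address window in a ddi_dma_attr_t steers page_get_anylist(); here the
 * allocation is restricted to a single page below 16m.
 */
static page_t *
example_get_low_page(struct vnode *vp, u_offset_t off, caddr_t vaddr)
{
	ddi_dma_attr_t attr;

	bzero(&attr, sizeof (attr));
	attr.dma_attr_addr_lo = 0;
	attr.dma_attr_addr_hi = 0xffffffULL;	/* top of the 0-16m range */
	attr.dma_attr_align = MMU_PAGESIZE;	/* anything larger fails */

	/* no PG_MATCH_COLOR, as page_get_mnode_anylist() asserts */
	return (page_get_anylist(vp, off, &kas, vaddr, MMU_PAGESIZE,
	    0, &attr, NULL));
}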
16847c478bd9Sstevel@tonic-gate 
16857c478bd9Sstevel@tonic-gate /*
16867c478bd9Sstevel@tonic-gate  * page_create_io()
16877c478bd9Sstevel@tonic-gate  *
16887c478bd9Sstevel@tonic-gate  * This function is a copy of page_create_va() with an additional
16897c478bd9Sstevel@tonic-gate  * argument 'mattr' that specifies DMA memory requirements to
16907c478bd9Sstevel@tonic-gate  * the page list functions. This function is used by the segkmem
16917c478bd9Sstevel@tonic-gate  * allocator, so it is only used to create new pages (i.e. PG_EXCL is
16927c478bd9Sstevel@tonic-gate  * set).
16937c478bd9Sstevel@tonic-gate  *
16947c478bd9Sstevel@tonic-gate  * Note: This interface is currently used by x86 PSM only and is
16957c478bd9Sstevel@tonic-gate  *	 not fully specified so the commitment level is only for
16967c478bd9Sstevel@tonic-gate  *	 private interface specific to x86. This interface uses PSM
16977c478bd9Sstevel@tonic-gate  *	 specific page_get_anylist() interface.
16987c478bd9Sstevel@tonic-gate  */
16997c478bd9Sstevel@tonic-gate 
17007c478bd9Sstevel@tonic-gate #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
17017c478bd9Sstevel@tonic-gate 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
17027c478bd9Sstevel@tonic-gate 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
17037c478bd9Sstevel@tonic-gate 			break; \
17047c478bd9Sstevel@tonic-gate 	} \
17057c478bd9Sstevel@tonic-gate }
17067c478bd9Sstevel@tonic-gate 
17077c478bd9Sstevel@tonic-gate 
17087c478bd9Sstevel@tonic-gate page_t *
17097c478bd9Sstevel@tonic-gate page_create_io(
17107c478bd9Sstevel@tonic-gate 	struct vnode	*vp,
17117c478bd9Sstevel@tonic-gate 	u_offset_t	off,
17127c478bd9Sstevel@tonic-gate 	uint_t		bytes,
17137c478bd9Sstevel@tonic-gate 	uint_t		flags,
17147c478bd9Sstevel@tonic-gate 	struct as	*as,
17157c478bd9Sstevel@tonic-gate 	caddr_t		vaddr,
17167c478bd9Sstevel@tonic-gate 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
17177c478bd9Sstevel@tonic-gate {
17187c478bd9Sstevel@tonic-gate 	page_t		*plist = NULL;
17197c478bd9Sstevel@tonic-gate 	uint_t		plist_len = 0;
17207c478bd9Sstevel@tonic-gate 	pgcnt_t		npages;
17217c478bd9Sstevel@tonic-gate 	page_t		*npp = NULL;
17227c478bd9Sstevel@tonic-gate 	uint_t		pages_req;
17237c478bd9Sstevel@tonic-gate 	page_t		*pp;
17247c478bd9Sstevel@tonic-gate 	kmutex_t	*phm = NULL;
17257c478bd9Sstevel@tonic-gate 	uint_t		index;
17267c478bd9Sstevel@tonic-gate 
17277c478bd9Sstevel@tonic-gate 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
17287c478bd9Sstevel@tonic-gate 		"page_create_start:vp %p off %llx bytes %u flags %x",
17297c478bd9Sstevel@tonic-gate 		vp, off, bytes, flags);
17307c478bd9Sstevel@tonic-gate 
17317c478bd9Sstevel@tonic-gate 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
17327c478bd9Sstevel@tonic-gate 
17337c478bd9Sstevel@tonic-gate 	pages_req = npages = mmu_btopr(bytes);
17347c478bd9Sstevel@tonic-gate 
17357c478bd9Sstevel@tonic-gate 	/*
17367c478bd9Sstevel@tonic-gate 	 * Do the freemem and pcf accounting.
17377c478bd9Sstevel@tonic-gate 	 */
17387c478bd9Sstevel@tonic-gate 	if (!page_create_wait(npages, flags)) {
17397c478bd9Sstevel@tonic-gate 		return (NULL);
17407c478bd9Sstevel@tonic-gate 	}
17417c478bd9Sstevel@tonic-gate 
17427c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
17437c478bd9Sstevel@tonic-gate 		"page_create_success:vp %p off %llx",
17447c478bd9Sstevel@tonic-gate 		vp, off);
17457c478bd9Sstevel@tonic-gate 
17467c478bd9Sstevel@tonic-gate 	/*
17477c478bd9Sstevel@tonic-gate 	 * If satisfying this request has left us with too little
17487c478bd9Sstevel@tonic-gate 	 * memory, start the wheels turning to get some back.  The
17497c478bd9Sstevel@tonic-gate 	 * first clause of the test prevents waking up the pageout
17507c478bd9Sstevel@tonic-gate 	 * daemon in situations where it would decide that there's
17517c478bd9Sstevel@tonic-gate 	 * nothing to do.
17527c478bd9Sstevel@tonic-gate 	 */
17537c478bd9Sstevel@tonic-gate 	if (nscan < desscan && freemem < minfree) {
17547c478bd9Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
17557c478bd9Sstevel@tonic-gate 			"pageout_cv_signal:freemem %ld", freemem);
17567c478bd9Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
17577c478bd9Sstevel@tonic-gate 	}
17587c478bd9Sstevel@tonic-gate 
17597c478bd9Sstevel@tonic-gate 	if (flags & PG_PHYSCONTIG) {
17607c478bd9Sstevel@tonic-gate 
17617c478bd9Sstevel@tonic-gate 		plist = page_get_contigpage(&npages, mattr, 1);
17627c478bd9Sstevel@tonic-gate 		if (plist == NULL) {
17637c478bd9Sstevel@tonic-gate 			page_create_putback(npages);
17647c478bd9Sstevel@tonic-gate 			return (NULL);
17657c478bd9Sstevel@tonic-gate 		}
17667c478bd9Sstevel@tonic-gate 
17677c478bd9Sstevel@tonic-gate 		pp = plist;
17687c478bd9Sstevel@tonic-gate 
17697c478bd9Sstevel@tonic-gate 		do {
17707c478bd9Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, NULL)) {
17717c478bd9Sstevel@tonic-gate 				panic("page_create_io: hashin failed %p %p %llx",
17727c478bd9Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off);
17737c478bd9Sstevel@tonic-gate 			}
17747c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
17757c478bd9Sstevel@tonic-gate 			off += MMU_PAGESIZE;
17767c478bd9Sstevel@tonic-gate 			PP_CLRFREE(pp);
17777c478bd9Sstevel@tonic-gate 			PP_CLRAGED(pp);
17787c478bd9Sstevel@tonic-gate 			page_set_props(pp, P_REF);
17797c478bd9Sstevel@tonic-gate 			pp = pp->p_next;
17807c478bd9Sstevel@tonic-gate 		} while (pp != plist);
17817c478bd9Sstevel@tonic-gate 
17827c478bd9Sstevel@tonic-gate 		if (!npages) {
17837c478bd9Sstevel@tonic-gate 			check_dma(mattr, plist, pages_req);
17847c478bd9Sstevel@tonic-gate 			return (plist);
17857c478bd9Sstevel@tonic-gate 		} else {
17867c478bd9Sstevel@tonic-gate 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
17877c478bd9Sstevel@tonic-gate 		}
17887c478bd9Sstevel@tonic-gate 
17897c478bd9Sstevel@tonic-gate 		/*
17907c478bd9Sstevel@tonic-gate 		 * fall-thru:
17917c478bd9Sstevel@tonic-gate 		 *
17927c478bd9Sstevel@tonic-gate 		 * page_get_contigpage returns when npages <= sgllen.
17937c478bd9Sstevel@tonic-gate 		 * Grab the rest of the non-contig pages below from anylist.
17947c478bd9Sstevel@tonic-gate 		 */
17957c478bd9Sstevel@tonic-gate 	}
17967c478bd9Sstevel@tonic-gate 
17977c478bd9Sstevel@tonic-gate 	/*
17987c478bd9Sstevel@tonic-gate 	 * Loop around collecting the requested number of pages.
17997c478bd9Sstevel@tonic-gate 	 * Most of the time, we have to `create' a new page. With
18007c478bd9Sstevel@tonic-gate 	 * this in mind, pull the page off the free list before
18017c478bd9Sstevel@tonic-gate 	 * getting the hash lock.  This will minimize the hash
18027c478bd9Sstevel@tonic-gate 	 * lock hold time, nesting, and the like.  If it turns
18037c478bd9Sstevel@tonic-gate 	 * out we don't need the page, we put it back at the end.
18047c478bd9Sstevel@tonic-gate 	 */
18057c478bd9Sstevel@tonic-gate 	while (npages--) {
18067c478bd9Sstevel@tonic-gate 		phm = NULL;
18077c478bd9Sstevel@tonic-gate 
18087c478bd9Sstevel@tonic-gate 		index = PAGE_HASH_FUNC(vp, off);
18097c478bd9Sstevel@tonic-gate top:
18107c478bd9Sstevel@tonic-gate 		ASSERT(phm == NULL);
18117c478bd9Sstevel@tonic-gate 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
18127c478bd9Sstevel@tonic-gate 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
18137c478bd9Sstevel@tonic-gate 
18147c478bd9Sstevel@tonic-gate 		if (npp == NULL) {
18157c478bd9Sstevel@tonic-gate 			/*
18167c478bd9Sstevel@tonic-gate 			 * Try to get the page of any color either from
18177c478bd9Sstevel@tonic-gate 			 * the freelist or from the cache list.
18187c478bd9Sstevel@tonic-gate 			 */
18197c478bd9Sstevel@tonic-gate 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
18207c478bd9Sstevel@tonic-gate 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
18217c478bd9Sstevel@tonic-gate 			if (npp == NULL) {
18227c478bd9Sstevel@tonic-gate 				if (mattr == NULL) {
18237c478bd9Sstevel@tonic-gate 					/*
18247c478bd9Sstevel@tonic-gate 					 * Not looking for a special page;
18257c478bd9Sstevel@tonic-gate 					 * panic!
18267c478bd9Sstevel@tonic-gate 					 */
18277c478bd9Sstevel@tonic-gate 					panic("no page found %d", (int)npages);
18287c478bd9Sstevel@tonic-gate 				}
18297c478bd9Sstevel@tonic-gate 				/*
18307c478bd9Sstevel@tonic-gate 				 * No page found! This can happen
18317c478bd9Sstevel@tonic-gate 				 * if we are looking for a page
18327c478bd9Sstevel@tonic-gate 				 * within a specific memory range
18337c478bd9Sstevel@tonic-gate 				 * for DMA purposes. If PG_WAIT is
18347c478bd9Sstevel@tonic-gate 				 * specified then we wait for a
18357c478bd9Sstevel@tonic-gate 				 * while and then try again. The
18367c478bd9Sstevel@tonic-gate 				 * wait could be forever if we
18377c478bd9Sstevel@tonic-gate 				 * don't get the page(s) we need.
18387c478bd9Sstevel@tonic-gate 				 *
18397c478bd9Sstevel@tonic-gate 				 * Note: XXX We really need a mechanism
18407c478bd9Sstevel@tonic-gate 				 * to wait for pages in the desired
18417c478bd9Sstevel@tonic-gate 				 * range. For now, we wait for any
18427c478bd9Sstevel@tonic-gate 				 * pages and see if we can use it.
18437c478bd9Sstevel@tonic-gate 				 */
18447c478bd9Sstevel@tonic-gate 
18457c478bd9Sstevel@tonic-gate 				if ((mattr != NULL) && (flags & PG_WAIT)) {
18467c478bd9Sstevel@tonic-gate 					delay(10);
18477c478bd9Sstevel@tonic-gate 					goto top;
18487c478bd9Sstevel@tonic-gate 				}
18497c478bd9Sstevel@tonic-gate 				goto fail; /* undo accounting stuff */
18507c478bd9Sstevel@tonic-gate 			}
18517c478bd9Sstevel@tonic-gate 
18527c478bd9Sstevel@tonic-gate 			if (PP_ISAGED(npp) == 0) {
18537c478bd9Sstevel@tonic-gate 				/*
18547c478bd9Sstevel@tonic-gate 				 * Since this page came from the
18557c478bd9Sstevel@tonic-gate 				 * cachelist, we must destroy the
18567c478bd9Sstevel@tonic-gate 				 * old vnode association.
18577c478bd9Sstevel@tonic-gate 				 */
18587c478bd9Sstevel@tonic-gate 				page_hashout(npp, (kmutex_t *)NULL);
18597c478bd9Sstevel@tonic-gate 			}
18607c478bd9Sstevel@tonic-gate 		}
18617c478bd9Sstevel@tonic-gate 
18627c478bd9Sstevel@tonic-gate 		/*
18637c478bd9Sstevel@tonic-gate 		 * We own this page!
18647c478bd9Sstevel@tonic-gate 		 */
18657c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(npp));
18667c478bd9Sstevel@tonic-gate 		ASSERT(npp->p_vnode == NULL);
18677c478bd9Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(npp));
18687c478bd9Sstevel@tonic-gate 		PP_CLRFREE(npp);
18697c478bd9Sstevel@tonic-gate 		PP_CLRAGED(npp);
18707c478bd9Sstevel@tonic-gate 
18717c478bd9Sstevel@tonic-gate 		/*
18727c478bd9Sstevel@tonic-gate 		 * Here we have a page in our hot little mitts and are
18737c478bd9Sstevel@tonic-gate 		 * just waiting to stuff it on the appropriate lists.
18747c478bd9Sstevel@tonic-gate 		 * Get the mutex and check to see if it really does
18757c478bd9Sstevel@tonic-gate 		 * not exist.
18767c478bd9Sstevel@tonic-gate 		 */
18777c478bd9Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
18787c478bd9Sstevel@tonic-gate 		mutex_enter(phm);
18797c478bd9Sstevel@tonic-gate 		PAGE_HASH_SEARCH(index, pp, vp, off);
18807c478bd9Sstevel@tonic-gate 		if (pp == NULL) {
18817c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
18827c478bd9Sstevel@tonic-gate 			pp = npp;
18837c478bd9Sstevel@tonic-gate 			npp = NULL;
18847c478bd9Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, phm)) {
18857c478bd9Sstevel@tonic-gate 				/*
18867c478bd9Sstevel@tonic-gate 				 * Since we hold the page hash mutex and
18877c478bd9Sstevel@tonic-gate 				 * just searched for this page, page_hashin
18887c478bd9Sstevel@tonic-gate 				 * had better not fail.  If it does, that
18897c478bd9Sstevel@tonic-gate 				 * means some thread did not follow the
18907c478bd9Sstevel@tonic-gate 				 * page hash mutex rules.  Panic now and
18917c478bd9Sstevel@tonic-gate 				 * get it over with.  As usual, go down
18927c478bd9Sstevel@tonic-gate 				 * holding all the locks.
18937c478bd9Sstevel@tonic-gate 				 */
18947c478bd9Sstevel@tonic-gate 				ASSERT(MUTEX_HELD(phm));
18957c478bd9Sstevel@tonic-gate 				panic("page_create: hashin fail %p %p %llx %p",
18967c478bd9Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off, (void *)phm);
18977c478bd9Sstevel@tonic-gate 
18987c478bd9Sstevel@tonic-gate 			}
18997c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
19007c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
19017c478bd9Sstevel@tonic-gate 			phm = NULL;
19027c478bd9Sstevel@tonic-gate 
19037c478bd9Sstevel@tonic-gate 			/*
19047c478bd9Sstevel@tonic-gate 			 * Hat layer locking need not be done to set
19057c478bd9Sstevel@tonic-gate 			 * the following bits since the page is not hashed
19067c478bd9Sstevel@tonic-gate 			 * and was on the free list (i.e., had no mappings).
19077c478bd9Sstevel@tonic-gate 			 *
19087c478bd9Sstevel@tonic-gate 			 * Set the reference bit to protect
19097c478bd9Sstevel@tonic-gate 			 * against immediate pageout
19107c478bd9Sstevel@tonic-gate 			 *
19117c478bd9Sstevel@tonic-gate 			 * XXXmh modify freelist code to set reference
19127c478bd9Sstevel@tonic-gate 			 * bit so we don't have to do it here.
19137c478bd9Sstevel@tonic-gate 			 */
19147c478bd9Sstevel@tonic-gate 			page_set_props(pp, P_REF);
19157c478bd9Sstevel@tonic-gate 		} else {
19167c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
19177c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
19187c478bd9Sstevel@tonic-gate 			phm = NULL;
19197c478bd9Sstevel@tonic-gate 			/*
19207c478bd9Sstevel@tonic-gate 			 * NOTE: This should not happen for pages associated
19217c478bd9Sstevel@tonic-gate 			 *	 with kernel vnode 'kvp'.
19227c478bd9Sstevel@tonic-gate 			 */
19237c478bd9Sstevel@tonic-gate 			/* XX64 - to debug why this happens! */
1924ad23a2dbSjohansen 			ASSERT(!VN_ISKAS(vp));
1925ad23a2dbSjohansen 			if (VN_ISKAS(vp))
19267c478bd9Sstevel@tonic-gate 				cmn_err(CE_NOTE,
19277c478bd9Sstevel@tonic-gate 				    "page_create: page not expected "
19287c478bd9Sstevel@tonic-gate 				    "in hash list for kernel vnode - pp 0x%p",
19297c478bd9Sstevel@tonic-gate 				    (void *)pp);
19307c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_create_exists);
19317c478bd9Sstevel@tonic-gate 			goto fail;
19327c478bd9Sstevel@tonic-gate 		}
19337c478bd9Sstevel@tonic-gate 
19347c478bd9Sstevel@tonic-gate 		/*
19357c478bd9Sstevel@tonic-gate 		 * Got a page!  It is locked.  Acquire the i/o
19367c478bd9Sstevel@tonic-gate 		 * lock since we are going to use the p_next and
19377c478bd9Sstevel@tonic-gate 		 * p_prev fields to link the requested pages together.
19387c478bd9Sstevel@tonic-gate 		 */
19397c478bd9Sstevel@tonic-gate 		page_io_lock(pp);
19407c478bd9Sstevel@tonic-gate 		page_add(&plist, pp);
19417c478bd9Sstevel@tonic-gate 		plist = plist->p_next;
19427c478bd9Sstevel@tonic-gate 		off += MMU_PAGESIZE;
19437c478bd9Sstevel@tonic-gate 		vaddr += MMU_PAGESIZE;
19447c478bd9Sstevel@tonic-gate 	}
19457c478bd9Sstevel@tonic-gate 
19467c478bd9Sstevel@tonic-gate 	check_dma(mattr, plist, pages_req);
19477c478bd9Sstevel@tonic-gate 	return (plist);
19487c478bd9Sstevel@tonic-gate 
19497c478bd9Sstevel@tonic-gate fail:
19507c478bd9Sstevel@tonic-gate 	if (npp != NULL) {
19517c478bd9Sstevel@tonic-gate 		/*
19527c478bd9Sstevel@tonic-gate 		 * Did not need this page after all.
19537c478bd9Sstevel@tonic-gate 		 * Put it back on the free list.
19547c478bd9Sstevel@tonic-gate 		 */
19557c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(page_create_putbacks);
19567c478bd9Sstevel@tonic-gate 		PP_SETFREE(npp);
19577c478bd9Sstevel@tonic-gate 		PP_SETAGED(npp);
19587c478bd9Sstevel@tonic-gate 		npp->p_offset = (u_offset_t)-1;
19597c478bd9Sstevel@tonic-gate 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
19607c478bd9Sstevel@tonic-gate 		page_unlock(npp);
19617c478bd9Sstevel@tonic-gate 	}
19627c478bd9Sstevel@tonic-gate 
19637c478bd9Sstevel@tonic-gate 	/*
19647c478bd9Sstevel@tonic-gate 	 * Give up the pages we already got.
19657c478bd9Sstevel@tonic-gate 	 */
19667c478bd9Sstevel@tonic-gate 	while (plist != NULL) {
19677c478bd9Sstevel@tonic-gate 		pp = plist;
19687c478bd9Sstevel@tonic-gate 		page_sub(&plist, pp);
19697c478bd9Sstevel@tonic-gate 		page_io_unlock(pp);
19707c478bd9Sstevel@tonic-gate 		plist_len++;
19717c478bd9Sstevel@tonic-gate 		/*LINTED: constant in conditional ctx*/
19727c478bd9Sstevel@tonic-gate 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
19737c478bd9Sstevel@tonic-gate 	}
19747c478bd9Sstevel@tonic-gate 
19757c478bd9Sstevel@tonic-gate 	/*
19767c478bd9Sstevel@tonic-gate 	 * VN_DISPOSE does freemem accounting for the pages in plist
19777c478bd9Sstevel@tonic-gate 	 * by calling page_free. So, we need to undo the pcf accounting
19787c478bd9Sstevel@tonic-gate 	 * for only the remaining pages.
19797c478bd9Sstevel@tonic-gate 	 */
19807c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(page_create_putbacks);
19817c478bd9Sstevel@tonic-gate 	page_create_putback(pages_req - plist_len);
19827c478bd9Sstevel@tonic-gate 
19837c478bd9Sstevel@tonic-gate 	return (NULL);
19847c478bd9Sstevel@tonic-gate }
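
/*
 * A hypothetical invocation (the names here are illustrative, not from
 * the original file): allocating an exclusively locked, physically
 * contiguous buffer for a legacy DMA engine, waiting rather than failing
 * if memory is tight.
 *
 *	pp = page_create_io(&kvp, io_off, io_len,
 *	    PG_EXCL | PG_WAIT | PG_PHYSCONTIG, &kas, io_vaddr, &attr);
 *
 * On success the pages come back i/o-locked and linked through
 * p_next/p_prev; on failure the freemem/pcf accounting has already been
 * unwound, so the caller only has to cope with the NULL return.
 */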
19857c478bd9Sstevel@tonic-gate 
19867c478bd9Sstevel@tonic-gate 
19877c478bd9Sstevel@tonic-gate /*
19887c478bd9Sstevel@tonic-gate  * Copy the data from the physical page represented by "frompp" to
19897c478bd9Sstevel@tonic-gate  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
19907c478bd9Sstevel@tonic-gate  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
19917c478bd9Sstevel@tonic-gate  * level and no one sleeps with an active mapping there.
19927c478bd9Sstevel@tonic-gate  *
19937c478bd9Sstevel@tonic-gate  * Note that the ref/mod bits in the page_t's are not affected by
19947c478bd9Sstevel@tonic-gate  * this operation, hence it is up to the caller to update them appropriately.
19957c478bd9Sstevel@tonic-gate  */
19968b464eb8Smec int
19977c478bd9Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp)
19987c478bd9Sstevel@tonic-gate {
19997c478bd9Sstevel@tonic-gate 	caddr_t		pp_addr1;
20007c478bd9Sstevel@tonic-gate 	caddr_t		pp_addr2;
2001*ae115bc7Smrj 	hat_mempte_t	pte1;
2002*ae115bc7Smrj 	hat_mempte_t	pte2;
20037c478bd9Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
20048b464eb8Smec 	label_t		ljb;
20058b464eb8Smec 	int		ret = 1;
20067c478bd9Sstevel@tonic-gate 
20077c478bd9Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
20087c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(frompp));
20097c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(topp));
20107c478bd9Sstevel@tonic-gate 
20117c478bd9Sstevel@tonic-gate 	if (kpm_enable) {
20127c478bd9Sstevel@tonic-gate 		pp_addr1 = hat_kpm_page2va(frompp, 0);
20137c478bd9Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(topp, 0);
20147c478bd9Sstevel@tonic-gate 		kpreempt_disable();
20157c478bd9Sstevel@tonic-gate 	} else {
20167c478bd9Sstevel@tonic-gate 		/*
20177c478bd9Sstevel@tonic-gate 		 * disable preemption so that we can't migrate to another CPU
20187c478bd9Sstevel@tonic-gate 		 */
20197c478bd9Sstevel@tonic-gate 		kpreempt_disable();
20207c478bd9Sstevel@tonic-gate 
20217c478bd9Sstevel@tonic-gate 		pp_addr1 = CPU->cpu_caddr1;
20227c478bd9Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
2023*ae115bc7Smrj 		pte1 = CPU->cpu_caddr1pte;
2024*ae115bc7Smrj 		pte2 = CPU->cpu_caddr2pte;
20257c478bd9Sstevel@tonic-gate 
20267c478bd9Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
20277c478bd9Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
20287c478bd9Sstevel@tonic-gate 
20297c478bd9Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
20307c478bd9Sstevel@tonic-gate 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
20317c478bd9Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
20327c478bd9Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
20337c478bd9Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
20347c478bd9Sstevel@tonic-gate 	}
20357c478bd9Sstevel@tonic-gate 
20368b464eb8Smec 	if (on_fault(&ljb)) {
20378b464eb8Smec 		ret = 0;
20388b464eb8Smec 		goto faulted;
20398b464eb8Smec 	}
20407c478bd9Sstevel@tonic-gate 	if (use_sse_pagecopy)
20417c478bd9Sstevel@tonic-gate 		hwblkpagecopy(pp_addr1, pp_addr2);
20427c478bd9Sstevel@tonic-gate 	else
20437c478bd9Sstevel@tonic-gate 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
20447c478bd9Sstevel@tonic-gate 
20458b464eb8Smec 	no_fault();
20468b464eb8Smec faulted:
2047*ae115bc7Smrj 	if (!kpm_enable) {
20487c478bd9Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
2049*ae115bc7Smrj 	}
20507c478bd9Sstevel@tonic-gate 	kpreempt_enable();
20518b464eb8Smec 	return (ret);
20527c478bd9Sstevel@tonic-gate }
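
/*
 * Callers of ppcopy() must now check its return value: since the copy
 * runs under on_fault(), a return of 0 means a fault (typically an
 * uncorrectable error on the source page) was taken mid-copy and the
 * destination contents are undefined.  A sketch:
 *
 *	if (ppcopy(frompp, topp) == 0) {
 *		... source page is bad; do not use topp's contents ...
 *	}
 */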
20537c478bd9Sstevel@tonic-gate 
20547c478bd9Sstevel@tonic-gate /*
20557c478bd9Sstevel@tonic-gate  * Zero the physical page from off to off + len given by `pp'
20567c478bd9Sstevel@tonic-gate  * without changing the reference and modified bits of page.
20577c478bd9Sstevel@tonic-gate  *
20587c478bd9Sstevel@tonic-gate  * We do this using CPU private page address #2; see ppcopy() for more info.
20597c478bd9Sstevel@tonic-gate  * pagezero() must not be called at interrupt level.
20607c478bd9Sstevel@tonic-gate  */
20617c478bd9Sstevel@tonic-gate void
20627c478bd9Sstevel@tonic-gate pagezero(page_t *pp, uint_t off, uint_t len)
20637c478bd9Sstevel@tonic-gate {
20647c478bd9Sstevel@tonic-gate 	caddr_t		pp_addr2;
2065*ae115bc7Smrj 	hat_mempte_t	pte2;
20667c478bd9Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
20677c478bd9Sstevel@tonic-gate 
20687c478bd9Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
20697c478bd9Sstevel@tonic-gate 	ASSERT(len <= MMU_PAGESIZE);
20707c478bd9Sstevel@tonic-gate 	ASSERT(off <= MMU_PAGESIZE);
20717c478bd9Sstevel@tonic-gate 	ASSERT(off + len <= MMU_PAGESIZE);
20727c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
20737c478bd9Sstevel@tonic-gate 
20747c478bd9Sstevel@tonic-gate 	if (kpm_enable) {
20757c478bd9Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(pp, 0);
20767c478bd9Sstevel@tonic-gate 		kpreempt_disable();
20777c478bd9Sstevel@tonic-gate 	} else {
20787c478bd9Sstevel@tonic-gate 		kpreempt_disable();
20797c478bd9Sstevel@tonic-gate 
20807c478bd9Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
2081*ae115bc7Smrj 		pte2 = CPU->cpu_caddr2pte;
20827c478bd9Sstevel@tonic-gate 
20837c478bd9Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
20847c478bd9Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
20857c478bd9Sstevel@tonic-gate 
20867c478bd9Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
20877c478bd9Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
20887c478bd9Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
20897c478bd9Sstevel@tonic-gate 	}
20907c478bd9Sstevel@tonic-gate 
2091*ae115bc7Smrj 	if (use_sse_pagezero) {
20927c478bd9Sstevel@tonic-gate 		hwblkclr(pp_addr2 + off, len);
2093*ae115bc7Smrj 	} else {
20947c478bd9Sstevel@tonic-gate 		bzero(pp_addr2 + off, len);
2095*ae115bc7Smrj 	}
20967c478bd9Sstevel@tonic-gate 
20977c478bd9Sstevel@tonic-gate 	if (!kpm_enable)
20987c478bd9Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
20997c478bd9Sstevel@tonic-gate 	kpreempt_enable();
21007c478bd9Sstevel@tonic-gate }
21017c478bd9Sstevel@tonic-gate 
21027c478bd9Sstevel@tonic-gate /*
21037c478bd9Sstevel@tonic-gate  * Platform-dependent page scrub call.
21047c478bd9Sstevel@tonic-gate  */
21057c478bd9Sstevel@tonic-gate void
21067c478bd9Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len)
21077c478bd9Sstevel@tonic-gate {
21087c478bd9Sstevel@tonic-gate 	/*
21097c478bd9Sstevel@tonic-gate 	 * For now, we rely on the fact that pagezero() will
21107c478bd9Sstevel@tonic-gate 	 * always clear UEs.
21117c478bd9Sstevel@tonic-gate 	 */
21127c478bd9Sstevel@tonic-gate 	pagezero(pp, off, len);
21137c478bd9Sstevel@tonic-gate }
21147c478bd9Sstevel@tonic-gate 
21157c478bd9Sstevel@tonic-gate /*
21167c478bd9Sstevel@tonic-gate  * set up two private addresses for use on a given CPU for use in ppcopy()
21177c478bd9Sstevel@tonic-gate  */
21187c478bd9Sstevel@tonic-gate void
21197c478bd9Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup)
21207c478bd9Sstevel@tonic-gate {
21217c478bd9Sstevel@tonic-gate 	void *addr;
2122*ae115bc7Smrj 	hat_mempte_t pte_pa;
21237c478bd9Sstevel@tonic-gate 
21247c478bd9Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2125*ae115bc7Smrj 	pte_pa = hat_mempte_setup(addr);
21267c478bd9Sstevel@tonic-gate 	cpup->cpu_caddr1 = addr;
2127*ae115bc7Smrj 	cpup->cpu_caddr1pte = pte_pa;
21287c478bd9Sstevel@tonic-gate 
21297c478bd9Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
2130*ae115bc7Smrj 	pte_pa = hat_mempte_setup(addr);
21317c478bd9Sstevel@tonic-gate 	cpup->cpu_caddr2 = addr;
2132*ae115bc7Smrj 	cpup->cpu_caddr2pte = pte_pa;
21337c478bd9Sstevel@tonic-gate 
21347c478bd9Sstevel@tonic-gate 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
21357c478bd9Sstevel@tonic-gate }
21367c478bd9Sstevel@tonic-gate 
2137*ae115bc7Smrj /*
2138*ae115bc7Smrj  * Undo setup_vaddr_for_ppcopy
2139*ae115bc7Smrj  */
2140*ae115bc7Smrj void
2141*ae115bc7Smrj teardown_vaddr_for_ppcopy(struct cpu *cpup)
2142*ae115bc7Smrj {
2143*ae115bc7Smrj 	mutex_destroy(&cpup->cpu_ppaddr_mutex);
2144*ae115bc7Smrj 
2145*ae115bc7Smrj 	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
2146*ae115bc7Smrj 	cpup->cpu_caddr2pte = 0;
2147*ae115bc7Smrj 	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
2148*ae115bc7Smrj 	cpup->cpu_caddr2 = 0;
2149*ae115bc7Smrj 
2150*ae115bc7Smrj 	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
2151*ae115bc7Smrj 	cpup->cpu_caddr1pte = 0;
2152*ae115bc7Smrj 	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
2153*ae115bc7Smrj 	cpup->cpu_caddr1 = 0;
2154*ae115bc7Smrj }
21557c478bd9Sstevel@tonic-gate 
21567c478bd9Sstevel@tonic-gate /*
21577c478bd9Sstevel@tonic-gate  * Create the pageout scanner thread. The thread starts executing
21587c478bd9Sstevel@tonic-gate  * 'procedure' within process 'pp' at priority 'pri'.
21597c478bd9Sstevel@tonic-gate  */
21607c478bd9Sstevel@tonic-gate void
21617c478bd9Sstevel@tonic-gate pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
21627c478bd9Sstevel@tonic-gate {
21637c478bd9Sstevel@tonic-gate 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
21647c478bd9Sstevel@tonic-gate }
21657c478bd9Sstevel@tonic-gate 
21667c478bd9Sstevel@tonic-gate /*
21677c478bd9Sstevel@tonic-gate  * Function for flushing D-cache when performing module relocations
21687c478bd9Sstevel@tonic-gate  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
21697c478bd9Sstevel@tonic-gate  */
21707c478bd9Sstevel@tonic-gate void
21717c478bd9Sstevel@tonic-gate dcache_flushall()
21727c478bd9Sstevel@tonic-gate {}
2173102033aaSdp 
2174102033aaSdp size_t
2175102033aaSdp exec_get_spslew(void)
2176102033aaSdp {
2177102033aaSdp 	return (0);
2178102033aaSdp }
2179*ae115bc7Smrj 
2180*ae115bc7Smrj /*
2181*ae115bc7Smrj  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
2182*ae115bc7Smrj  * number to vary where the pages come from.  This is quite a hacked up
2183*ae115bc7Smrj  * method -- it works for now, but really needs to be fixed up a bit.
2184*ae115bc7Smrj  *
2185*ae115bc7Smrj  * We currently use page_create_va() on the kvp with fake offsets,
2186*ae115bc7Smrj  * segments and virt address.  This is pretty bogus, but was copied from the
2187*ae115bc7Smrj  * old hat_i86.c code.  A better approach would be to specify either mnode
2188*ae115bc7Smrj  * random or mnode local and take a page from whatever color has the MOST
2189*ae115bc7Smrj  * available - this would have a minimal impact on page coloring.
2190*ae115bc7Smrj  */
2191*ae115bc7Smrj page_t *
2192*ae115bc7Smrj page_get_physical(uintptr_t seed)
2193*ae115bc7Smrj {
2194*ae115bc7Smrj 	page_t *pp;
2195*ae115bc7Smrj 	u_offset_t offset;
2196*ae115bc7Smrj 	static struct seg tmpseg;
2197*ae115bc7Smrj 	static uintptr_t ctr = 0;
2198*ae115bc7Smrj 
2199*ae115bc7Smrj 	/*
2200*ae115bc7Smrj 	 * This code is gross, we really need a simpler page allocator.
2201*ae115bc7Smrj 	 *
2202*ae115bc7Smrj 	 * We need to assign an offset for the page to call page_create_va().
2203*ae115bc7Smrj 	 * To avoid conflicts with other pages, we get creative with the offset.
2204*ae115bc7Smrj 	 * For 32 bits, we pick an offset > 4Gig
2205*ae115bc7Smrj 	 * For 64 bits, pick an offset somewhere in the VA hole.
2206*ae115bc7Smrj 	 */
2207*ae115bc7Smrj 	offset = seed;
2208*ae115bc7Smrj 	if (offset > kernelbase)
2209*ae115bc7Smrj 		offset -= kernelbase;
2210*ae115bc7Smrj 	offset <<= MMU_PAGESHIFT;
2211*ae115bc7Smrj #if defined(__amd64)
2212*ae115bc7Smrj 	offset += mmu.hole_start;	/* something in VA hole */
2213*ae115bc7Smrj #else
2214*ae115bc7Smrj 	offset += 1ULL << 40;		/* something > 4 Gig */
2215*ae115bc7Smrj #endif
2216*ae115bc7Smrj 
2217*ae115bc7Smrj 	if (page_resv(1, KM_NOSLEEP) == 0)
2218*ae115bc7Smrj 		return (NULL);
2219*ae115bc7Smrj 
2220*ae115bc7Smrj #ifdef	DEBUG
2221*ae115bc7Smrj 	pp = page_exists(&kvp, offset);
2222*ae115bc7Smrj 	if (pp != NULL)
2223*ae115bc7Smrj 		panic("page already exists %p", pp);
2224*ae115bc7Smrj #endif
2225*ae115bc7Smrj 
2226*ae115bc7Smrj 	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL | PG_NORELOC,
2227*ae115bc7Smrj 	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
2228*ae115bc7Smrj 	if (pp == NULL)
2229*ae115bc7Smrj 		return (NULL);
2230*ae115bc7Smrj 	page_io_unlock(pp);
2231*ae115bc7Smrj 	page_hashout(pp, NULL);
2232*ae115bc7Smrj 	return (pp);
2233*ae115bc7Smrj }
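
/*
 * A sketch of the matching release path -- an assumption modeled on how
 * such pages are freed elsewhere, not a function from this file.  The
 * page returned above is EXCL locked and hashed out, so it can go
 * straight back to the freelist, after which the reservation must be
 * undone.
 */
static void
example_put_physical(page_t *pp)
{
	page_free(pp, 1);	/* "don't need" hint; back to the freelist */
	page_unresv(1);		/* undo the page_resv() taken above */
}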
2234