xref: /illumos-gate/usr/src/uts/i86pc/vm/vm_machdep.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved   */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * UNIX machine dependent virtual memory support.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_vn.h>
65 #include <vm/page.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/vm_dep.h>
69 
70 #include <sys/cpu.h>
71 #include <sys/vm_machparam.h>
72 #include <sys/memlist.h>
73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
74 #include <vm/hat_i86.h>
75 #include <sys/x86_archext.h>
76 #include <sys/elf_386.h>
77 #include <sys/cmn_err.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 
81 #include <sys/vtrace.h>
82 #include <sys/ddidmareq.h>
83 #include <sys/promif.h>
84 #include <sys/memnode.h>
85 #include <sys/stack.h>
86 
87 uint_t vac_colors = 0;
88 
89 int largepagesupport = 0;
90 extern uint_t page_create_new;
91 extern uint_t page_create_exists;
92 extern uint_t page_create_putbacks;
93 extern uint_t page_create_putbacks;
94 extern uintptr_t eprom_kernelbase;
95 extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
96 
97 /* 4g memory management */
98 pgcnt_t		maxmem4g;
99 pgcnt_t		freemem4g;
100 int		physmax4g;
101 int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
102 int		lotsfree4gshift = 3;
103 
104 #ifdef VM_STATS
105 struct {
106 	ulong_t	pga_alloc;
107 	ulong_t	pga_notfullrange;
108 	ulong_t	pga_nulldmaattr;
109 	ulong_t	pga_allocok;
110 	ulong_t	pga_allocfailed;
111 	ulong_t	pgma_alloc;
112 	ulong_t	pgma_allocok;
113 	ulong_t	pgma_allocfailed;
114 	ulong_t	pgma_allocempty;
115 } pga_vmstats;
116 #endif
117 
118 uint_t mmu_page_sizes;
119 
120 /* How many page sizes the users can see */
121 uint_t mmu_exported_page_sizes;
122 
123 size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
124 
125 /*
126  * Return the optimum page size for a given mapping
127  */
128 /*ARGSUSED*/
129 size_t
130 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
131 {
132 	level_t l;
133 
134 	if (remap)
135 		*remap = 0;
136 
137 	switch (maptype) {
138 
139 	case MAPPGSZ_STK:
140 	case MAPPGSZ_HEAP:
141 	case MAPPGSZ_VA:
142 		/*
143 		 * use the pages size that best fits len
144 		 */
145 		for (l = mmu.max_page_level; l > 0; --l) {
146 			if (len < LEVEL_SIZE(l))
147 				continue;
148 			break;
149 		}
150 		return (LEVEL_SIZE(l));
151 
152 	/*
153 	 * for ISM use the 1st large page size.
154 	 */
155 	case MAPPGSZ_ISM:
156 		if (mmu.max_page_level == 0)
157 			return (MMU_PAGESIZE);
158 		return (LEVEL_SIZE(1));
159 	}
160 	return (0);
161 }
162 
163 /*
164  * This can be patched via /etc/system to allow large pages
165  * to be used for mapping application and libraries text segments.
166  */
167 int	use_text_largepages = 0;
168 
169 /*
170  * Return a bit vector of large page size codes that
171  * can be used to map [addr, addr + len) region.
172  */
173 
174 /*ARGSUSED*/
175 uint_t
176 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
177 {
178 	size_t	pgsz;
179 	caddr_t a;
180 
181 	if (!text || !use_text_largepages ||
182 	    mmu.max_page_level == 0)
183 		return (0);
184 
185 	pgsz = LEVEL_SIZE(1);
186 	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
187 	if (a < addr || a >= addr + len) {
188 		return (0);
189 	}
190 	len -= (a - addr);
191 	if (len < pgsz) {
192 		return (0);
193 	}
194 	return (1 << 1);
195 }
196 
197 /*
198  * Handle a pagefault.
199  */
200 faultcode_t
201 pagefault(
202 	caddr_t addr,
203 	enum fault_type type,
204 	enum seg_rw rw,
205 	int iskernel)
206 {
207 	struct as *as;
208 	struct hat *hat;
209 	struct proc *p;
210 	kthread_t *t;
211 	faultcode_t res;
212 	caddr_t base;
213 	size_t len;
214 	int err;
215 	int mapped_red;
216 	uintptr_t ea;
217 
218 	ASSERT_STACK_ALIGNED();
219 
220 	if (INVALID_VADDR(addr))
221 		return (FC_NOMAP);
222 
223 	mapped_red = segkp_map_red();
224 
225 	if (iskernel) {
226 		as = &kas;
227 		hat = as->a_hat;
228 	} else {
229 		t = curthread;
230 		p = ttoproc(t);
231 		as = p->p_as;
232 		hat = as->a_hat;
233 	}
234 
235 	/*
236 	 * Dispatch pagefault.
237 	 */
238 	res = as_fault(hat, as, addr, 1, type, rw);
239 
240 	/*
241 	 * If this isn't a potential unmapped hole in the user's
242 	 * UNIX data or stack segments, just return status info.
243 	 */
244 	if (res != FC_NOMAP || iskernel)
245 		goto out;
246 
247 	/*
248 	 * Check to see if we happened to faulted on a currently unmapped
249 	 * part of the UNIX data or stack segments.  If so, create a zfod
250 	 * mapping there and then try calling the fault routine again.
251 	 */
252 	base = p->p_brkbase;
253 	len = p->p_brksize;
254 
255 	if (addr < base || addr >= base + len) {		/* data seg? */
256 		base = (caddr_t)p->p_usrstack - p->p_stksize;
257 		len = p->p_stksize;
258 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
259 			/* not in either UNIX data or stack segments */
260 			res = FC_NOMAP;
261 			goto out;
262 		}
263 	}
264 
265 	/*
266 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
267 	 * This code is probably not needed anymore
268 	 */
269 	if (p->p_model == DATAMODEL_ILP32) {
270 
271 		/* expand the gap to the page boundaries on each side */
272 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
273 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
274 		len = ea - (uintptr_t)base;
275 
276 		as_rangelock(as);
277 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
278 		    0) {
279 			err = as_map(as, base, len, segvn_create, zfod_argsp);
280 			as_rangeunlock(as);
281 			if (err) {
282 				res = FC_MAKE_ERR(err);
283 				goto out;
284 			}
285 		} else {
286 			/*
287 			 * This page is already mapped by another thread after
288 			 * we returned from as_fault() above.  We just fall
289 			 * through as_fault() below.
290 			 */
291 			as_rangeunlock(as);
292 		}
293 
294 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
295 	}
296 
297 out:
298 	if (mapped_red)
299 		segkp_unmap_red();
300 
301 	return (res);
302 }
303 
304 void
305 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
306 {
307 	struct proc *p = curproc;
308 	caddr_t userlimit = (flags & _MAP_LOW32) ?
309 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
310 
311 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
312 }
313 
314 /*ARGSUSED*/
315 int
316 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
317 {
318 	return (0);
319 }
320 
321 /*
322  * map_addr_proc() is the routine called when the system is to
323  * choose an address for the user.  We will pick an address
324  * range which is the highest available below kernelbase.
325  *
326  * addrp is a value/result parameter.
327  *	On input it is a hint from the user to be used in a completely
328  *	machine dependent fashion.  We decide to completely ignore this hint.
329  *
330  *	On output it is NULL if no address can be found in the current
331  *	processes address space or else an address that is currently
332  *	not mapped for len bytes with a page of red zone on either side.
333  *
334  *	align is not needed on x86 (it's for viturally addressed caches)
335  */
336 /*ARGSUSED*/
337 void
338 map_addr_proc(
339 	caddr_t *addrp,
340 	size_t len,
341 	offset_t off,
342 	int vacalign,
343 	caddr_t userlimit,
344 	struct proc *p,
345 	uint_t flags)
346 {
347 	struct as *as = p->p_as;
348 	caddr_t addr;
349 	caddr_t base;
350 	size_t slen;
351 	size_t align_amount;
352 
353 	ASSERT32(userlimit == as->a_userlimit);
354 
355 	base = p->p_brkbase;
356 #if defined(__amd64)
357 	/*
358 	 * XX64 Yes, this needs more work.
359 	 */
360 	if (p->p_model == DATAMODEL_NATIVE) {
361 		if (userlimit < as->a_userlimit) {
362 			/*
363 			 * This happens when a program wants to map
364 			 * something in a range that's accessible to a
365 			 * program in a smaller address space.  For example,
366 			 * a 64-bit program calling mmap32(2) to guarantee
367 			 * that the returned address is below 4Gbytes.
368 			 */
369 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
370 
371 			if (userlimit > base)
372 				slen = userlimit - base;
373 			else {
374 				*addrp = NULL;
375 				return;
376 			}
377 		} else {
378 			/*
379 			 * XX64 This layout is probably wrong .. but in
380 			 * the event we make the amd64 address space look
381 			 * like sparcv9 i.e. with the stack -above- the
382 			 * heap, this bit of code might even be correct.
383 			 */
384 			slen = p->p_usrstack - base -
385 			    (((size_t)rctl_enforced_value(
386 			    rctlproc_legacy[RLIMIT_STACK],
387 			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
388 		}
389 	} else
390 #endif
391 		slen = userlimit - base;
392 
393 	len = (len + PAGEOFFSET) & PAGEMASK;
394 
395 	/*
396 	 * Redzone for each side of the request. This is done to leave
397 	 * one page unmapped between segments. This is not required, but
398 	 * it's useful for the user because if their program strays across
399 	 * a segment boundary, it will catch a fault immediately making
400 	 * debugging a little easier.
401 	 */
402 	len += 2 * MMU_PAGESIZE;
403 
404 	/*
405 	 * figure out what the alignment should be
406 	 *
407 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
408 	 */
409 	if (len <= ELF_386_MAXPGSZ) {
410 		/*
411 		 * Align virtual addresses to ensure that ELF shared libraries
412 		 * are mapped with the appropriate alignment constraints by
413 		 * the run-time linker.
414 		 */
415 		align_amount = ELF_386_MAXPGSZ;
416 	} else {
417 		int l = mmu.max_page_level;
418 
419 		while (l && len < LEVEL_SIZE(l))
420 			--l;
421 
422 		align_amount = LEVEL_SIZE(l);
423 	}
424 
425 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
426 		align_amount = (uintptr_t)*addrp;
427 
428 	len += align_amount;
429 
430 	/*
431 	 * Look for a large enough hole starting below userlimit.
432 	 * After finding it, use the upper part.  Addition of PAGESIZE
433 	 * is for the redzone as described above.
434 	 */
435 	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
436 		caddr_t as_addr;
437 
438 		addr = base + slen - len + MMU_PAGESIZE;
439 		as_addr = addr;
440 		/*
441 		 * Round address DOWN to the alignment amount,
442 		 * add the offset, and if this address is less
443 		 * than the original address, add alignment amount.
444 		 */
445 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
446 		addr += (uintptr_t)(off & (align_amount - 1));
447 		if (addr < as_addr)
448 			addr += align_amount;
449 
450 		ASSERT(addr <= (as_addr + align_amount));
451 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
452 		    ((uintptr_t)(off & (align_amount - 1))));
453 		*addrp = addr;
454 	} else {
455 		*addrp = NULL;	/* no more virtual space */
456 	}
457 }
458 
459 /*
460  * Determine whether [base, base+len] contains a valid range of
461  * addresses at least minlen long. base and len are adjusted if
462  * required to provide a valid range.
463  */
464 /*ARGSUSED3*/
465 int
466 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
467 {
468 	uintptr_t hi, lo;
469 
470 	lo = (uintptr_t)*basep;
471 	hi = lo + *lenp;
472 
473 	/*
474 	 * If hi rolled over the top, try cutting back.
475 	 */
476 	if (hi < lo) {
477 		if (0 - lo + hi < minlen)
478 			return (0);
479 		if (0 - lo < minlen)
480 			return (0);
481 		*lenp = 0 - lo;
482 	} else if (hi - lo < minlen) {
483 		return (0);
484 	}
485 #if defined(__amd64)
486 	/*
487 	 * Deal with a possible hole in the address range between
488 	 * hole_start and hole_end that should never be mapped.
489 	 */
490 	if (lo < hole_start) {
491 		if (hi > hole_start) {
492 			if (hi < hole_end) {
493 				hi = hole_start;
494 			} else {
495 				/* lo < hole_start && hi >= hole_end */
496 				if (dir == AH_LO) {
497 					/*
498 					 * prefer lowest range
499 					 */
500 					if (hole_start - lo >= minlen)
501 						hi = hole_start;
502 					else if (hi - hole_end >= minlen)
503 						lo = hole_end;
504 					else
505 						return (0);
506 				} else {
507 					/*
508 					 * prefer highest range
509 					 */
510 					if (hi - hole_end >= minlen)
511 						lo = hole_end;
512 					else if (hole_start - lo >= minlen)
513 						hi = hole_start;
514 					else
515 						return (0);
516 				}
517 			}
518 		}
519 	} else {
520 		/* lo >= hole_start */
521 		if (hi < hole_end)
522 			return (0);
523 		if (lo < hole_end)
524 			lo = hole_end;
525 	}
526 
527 	if (hi - lo < minlen)
528 		return (0);
529 
530 	*basep = (caddr_t)lo;
531 	*lenp = hi - lo;
532 #endif
533 	return (1);
534 }
535 
536 /*
537  * Determine whether [addr, addr+len] are valid user addresses.
538  */
539 /*ARGSUSED*/
540 int
541 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
542     caddr_t userlimit)
543 {
544 	caddr_t eaddr = addr + len;
545 
546 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
547 		return (RANGE_BADADDR);
548 
549 #if defined(__amd64)
550 	/*
551 	 * Check for the VA hole
552 	 */
553 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
554 		return (RANGE_BADADDR);
555 #endif
556 
557 	return (RANGE_OKAY);
558 }
559 
560 /*
561  * Return 1 if the page frame is onboard memory, else 0.
562  */
563 int
564 pf_is_memory(pfn_t pf)
565 {
566 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
567 }
568 
569 
570 /*
571  * initialized by page_coloring_init().
572  */
573 uint_t	page_colors;
574 uint_t	page_colors_mask;
575 uint_t	page_coloring_shift;
576 int	cpu_page_colors;
577 static uint_t	l2_colors;
578 
579 /*
580  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
581  * and page_colors are calculated from the l2 cache n-way set size.  Within a
582  * mnode range, the page freelist and cachelist are hashed into bins based on
583  * color. This makes it easier to search for a page within a specific memory
584  * range.
585  */
586 #define	PAGE_COLORS_MIN	16
587 
588 page_t ****page_freelists;
589 page_t ***page_cachelists;
590 
591 /*
592  * As the PC architecture evolved memory up was clumped into several
593  * ranges for various historical I/O devices to do DMA.
594  * < 16Meg - ISA bus
595  * < 2Gig - ???
596  * < 4Gig - PCI bus or drivers that don't understand PAE mode
597  */
598 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
599     0x100000,	/* pfn range for 4G and above */
600     0x80000,	/* pfn range for 2G-4G */
601     0x01000,	/* pfn range for 16M-2G */
602     0x00000,	/* pfn range for 0-16M */
603 };
604 
605 /*
606  * These are changed during startup if the machine has limited memory.
607  */
608 pfn_t *memranges = &arch_memranges[0];
609 int nranges = NUM_MEM_RANGES;
610 
611 /*
612  * Used by page layer to know about page sizes
613  */
614 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
615 
616 /*
617  * This can be patched via /etc/system to allow old non-PAE aware device
618  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
619  */
620 #if defined(__i386)
621 int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
622 #elif defined(__amd64)
623 int restricted_kmemalloc = 0;
624 #endif
625 
626 kmutex_t	*fpc_mutex[NPC_MUTEX];
627 kmutex_t	*cpc_mutex[NPC_MUTEX];
628 
629 
630 /*
631  * return the memrange containing pfn
632  */
633 int
634 memrange_num(pfn_t pfn)
635 {
636 	int n;
637 
638 	for (n = 0; n < nranges - 1; ++n) {
639 		if (pfn >= memranges[n])
640 			break;
641 	}
642 	return (n);
643 }
644 
645 /*
646  * return the mnoderange containing pfn
647  */
648 int
649 pfn_2_mtype(pfn_t pfn)
650 {
651 	int	n;
652 
653 	for (n = mnoderangecnt - 1; n >= 0; n--) {
654 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
655 			break;
656 		}
657 	}
658 	return (n);
659 }
660 
661 /*
662  * is_contigpage_free:
663  *	returns a page list of contiguous pages. It minimally has to return
664  *	minctg pages. Caller determines minctg based on the scatter-gather
665  *	list length.
666  *
667  *	pfnp is set to the next page frame to search on return.
668  */
669 static page_t *
670 is_contigpage_free(
671 	pfn_t *pfnp,
672 	pgcnt_t *pgcnt,
673 	pgcnt_t minctg,
674 	uint64_t pfnseg,
675 	int iolock)
676 {
677 	int	i = 0;
678 	pfn_t	pfn = *pfnp;
679 	page_t	*pp;
680 	page_t	*plist = NULL;
681 
682 	/*
683 	 * fail if pfn + minctg crosses a segment boundary.
684 	 * Adjust for next starting pfn to begin at segment boundary.
685 	 */
686 
687 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
688 		*pfnp = roundup(*pfnp, pfnseg + 1);
689 		return (NULL);
690 	}
691 
692 	do {
693 retry:
694 		pp = page_numtopp_nolock(pfn + i);
695 		if ((pp == NULL) ||
696 		    (page_trylock(pp, SE_EXCL) == 0)) {
697 			(*pfnp)++;
698 			break;
699 		}
700 		if (page_pptonum(pp) != pfn + i) {
701 			page_unlock(pp);
702 			goto retry;
703 		}
704 
705 		if (!(PP_ISFREE(pp))) {
706 			page_unlock(pp);
707 			(*pfnp)++;
708 			break;
709 		}
710 
711 		if (!PP_ISAGED(pp)) {
712 			page_list_sub(pp, PG_CACHE_LIST);
713 			page_hashout(pp, (kmutex_t *)NULL);
714 		} else {
715 			page_list_sub(pp, PG_FREE_LIST);
716 		}
717 
718 		if (iolock)
719 			page_io_lock(pp);
720 		page_list_concat(&plist, &pp);
721 
722 		/*
723 		 * exit loop when pgcnt satisfied or segment boundary reached.
724 		 */
725 
726 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
727 
728 	*pfnp += i;		/* set to next pfn to search */
729 
730 	if (i >= minctg) {
731 		*pgcnt -= i;
732 		return (plist);
733 	}
734 
735 	/*
736 	 * failure: minctg not satisfied.
737 	 *
738 	 * if next request crosses segment boundary, set next pfn
739 	 * to search from the segment boundary.
740 	 */
741 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
742 		*pfnp = roundup(*pfnp, pfnseg + 1);
743 
744 	/* clean up any pages already allocated */
745 
746 	while (plist) {
747 		pp = plist;
748 		page_sub(&plist, pp);
749 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
750 		if (iolock)
751 			page_io_unlock(pp);
752 		page_unlock(pp);
753 	}
754 
755 	return (NULL);
756 }
757 
/*
 * Verify that pages being returned from the allocator satisfy the DMA
 * attributes.  Walks cnt pages along the p_next chain starting at pp and
 * panics if a page's physical address is below dma_attr_addr_lo or not
 * below dma_attr_addr_hi.  A NULL dma_attr disables the check, and the
 * whole thing compiles to (0) on non-DEBUG kernels.
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	uint64_t	paddr;
	int		left;

	if (dma_attr == NULL)
		return;

	for (left = cnt; left > 0; left--, pp = pp->p_next) {
		paddr = mmu_ptob((uint64_t)pp->p_pagenum);
		if (paddr < dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (paddr >= dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
	}
}
#endif
781 
782 static kmutex_t	contig_lock;
783 
784 #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
785 #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
786 
787 #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
788 
789 static page_t *
790 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
791 {
792 	pfn_t		pfn;
793 	int		sgllen;
794 	uint64_t	pfnseg;
795 	pgcnt_t		minctg;
796 	page_t		*pplist = NULL, *plist;
797 	uint64_t	lo, hi;
798 	pgcnt_t		pfnalign = 0;
799 	static pfn_t	startpfn;
800 	static pgcnt_t	lastctgcnt;
801 	uintptr_t	align;
802 
803 	CONTIG_LOCK();
804 
805 	if (mattr) {
806 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
807 		hi = mmu_btop(mattr->dma_attr_addr_hi);
808 		if (hi >= physmax)
809 			hi = physmax - 1;
810 		sgllen = mattr->dma_attr_sgllen;
811 		pfnseg = mmu_btop(mattr->dma_attr_seg);
812 
813 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
814 		if (align > MMU_PAGESIZE)
815 			pfnalign = mmu_btop(align);
816 
817 		/*
818 		 * in order to satisfy the request, must minimally
819 		 * acquire minctg contiguous pages
820 		 */
821 		minctg = howmany(*pgcnt, sgllen);
822 
823 		ASSERT(hi >= lo);
824 
825 		/*
826 		 * start from where last searched if the minctg >= lastctgcnt
827 		 */
828 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
829 			startpfn = lo;
830 	} else {
831 		hi = physmax - 1;
832 		lo = 0;
833 		sgllen = 1;
834 		pfnseg = mmu.highest_pfn;
835 		minctg = *pgcnt;
836 
837 		if (minctg < lastctgcnt)
838 			startpfn = lo;
839 	}
840 	lastctgcnt = minctg;
841 
842 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
843 
844 	/* conserve 16m memory - start search above 16m when possible */
845 	if (hi > PFN_16M && startpfn < PFN_16M)
846 		startpfn = PFN_16M;
847 
848 	pfn = startpfn;
849 	if (pfnalign)
850 		pfn = P2ROUNDUP(pfn, pfnalign);
851 
852 	while (pfn + minctg - 1 <= hi) {
853 
854 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
855 		if (plist) {
856 			page_list_concat(&pplist, &plist);
857 			sgllen--;
858 			/*
859 			 * return when contig pages no longer needed
860 			 */
861 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
862 				startpfn = pfn;
863 				CONTIG_UNLOCK();
864 				check_dma(mattr, pplist, *pgcnt);
865 				return (pplist);
866 			}
867 			minctg = howmany(*pgcnt, sgllen);
868 		}
869 		if (pfnalign)
870 			pfn = P2ROUNDUP(pfn, pfnalign);
871 	}
872 
873 	/* cannot find contig pages in specified range */
874 	if (startpfn == lo) {
875 		CONTIG_UNLOCK();
876 		return (NULL);
877 	}
878 
879 	/* did not start with lo previously */
880 	pfn = lo;
881 	if (pfnalign)
882 		pfn = P2ROUNDUP(pfn, pfnalign);
883 
884 	/* allow search to go above startpfn */
885 	while (pfn < startpfn) {
886 
887 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
888 		if (plist != NULL) {
889 
890 			page_list_concat(&pplist, &plist);
891 			sgllen--;
892 
893 			/*
894 			 * return when contig pages no longer needed
895 			 */
896 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
897 				startpfn = pfn;
898 				CONTIG_UNLOCK();
899 				check_dma(mattr, pplist, *pgcnt);
900 				return (pplist);
901 			}
902 			minctg = howmany(*pgcnt, sgllen);
903 		}
904 		if (pfnalign)
905 			pfn = P2ROUNDUP(pfn, pfnalign);
906 	}
907 	CONTIG_UNLOCK();
908 	return (NULL);
909 }
910 
911 /*
912  * combine mem_node_config and memrange memory ranges into one data
913  * structure to be used for page list management.
914  *
915  * mnode_range_cnt() calculates the number of memory ranges for mnode and
916  * memranges[]. Used to determine the size of page lists and mnoderanges.
917  *
918  * mnode_range_setup() initializes mnoderanges.
919  */
920 mnoderange_t	*mnoderanges;
921 int		mnoderangecnt;
922 int		mtype4g;
923 
924 int
925 mnode_range_cnt()
926 {
927 	int	mri;
928 	int	mnrcnt = 0;
929 	int	mnode;
930 
931 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
932 		if (mem_node_config[mnode].exists == 0)
933 			continue;
934 
935 		mri = nranges - 1;
936 
937 		/* find the memranges index below contained in mnode range */
938 
939 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
940 			mri--;
941 
942 		/*
943 		 * increment mnode range counter when memranges or mnode
944 		 * boundary is reached.
945 		 */
946 		while (mri >= 0 &&
947 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
948 			mnrcnt++;
949 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
950 				mri--;
951 			else
952 				break;
953 		}
954 	}
955 	return (mnrcnt);
956 }
957 
958 void
959 mnode_range_setup(mnoderange_t *mnoderanges)
960 {
961 	int	mnode, mri;
962 
963 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
964 		if (mem_node_config[mnode].exists == 0)
965 			continue;
966 
967 		mri = nranges - 1;
968 
969 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
970 			mri--;
971 
972 		while (mri >= 0 && mem_node_config[mnode].physmax >=
973 		    MEMRANGELO(mri)) {
974 			mnoderanges->mnr_pfnlo =
975 			    MAX(MEMRANGELO(mri),
976 				mem_node_config[mnode].physbase);
977 			mnoderanges->mnr_pfnhi =
978 			    MIN(MEMRANGEHI(mri),
979 				mem_node_config[mnode].physmax);
980 			mnoderanges->mnr_mnode = mnode;
981 			mnoderanges->mnr_memrange = mri;
982 			mnoderanges++;
983 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
984 				mri--;
985 			else
986 				break;
987 		}
988 	}
989 }
990 
991 /*
992  * Determine if the mnode range specified in mtype contains memory belonging
993  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
994  * the range of indices to 0 or 4g.
995  *
996  * Return first mnode range type index found otherwise return -1 if none found.
997  */
998 int
999 mtype_func(int mnode, int mtype, uint_t flags)
1000 {
1001 	if (flags & PGI_MT_RANGE) {
1002 		int	mtlim = 0;	/* default to PGI_MT_RANGEO */
1003 
1004 		if (flags & PGI_MT_NEXT)
1005 			mtype--;
1006 		if (flags & PGI_MT_RANGE4G)
1007 			mtlim = mtype4g + 1;
1008 		while (mtype >= mtlim) {
1009 			if (mnoderanges[mtype].mnr_mnode == mnode)
1010 				return (mtype);
1011 			mtype--;
1012 		}
1013 	} else {
1014 		if (mnoderanges[mtype].mnr_mnode == mnode)
1015 			return (mtype);
1016 	}
1017 	return (-1);
1018 }
1019 
1020 /*
1021  * Initialize page coloring variables based on the l2 cache parameters.
1022  * Calculate and return memory needed for page coloring data structures.
1023  */
1024 size_t
1025 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1026 {
1027 	size_t	colorsz = 0;
1028 	int	i;
1029 	int	colors;
1030 
1031 	/*
1032 	 * Reduce the memory ranges lists if we don't have large amounts
1033 	 * of memory. This avoids searching known empty free lists.
1034 	 */
1035 	i = memrange_num(physmax);
1036 	memranges += i;
1037 	nranges -= i;
1038 #if defined(__i386)
1039 	if (i > 0)
1040 		restricted_kmemalloc = 0;
1041 #endif
1042 	/* physmax greater than 4g */
1043 	if (i == 0)
1044 		physmax4g = 1;
1045 
1046 	/*
1047 	 * setup pagesize for generic page layer
1048 	 */
1049 	for (i = 0; i <= mmu.max_page_level; ++i) {
1050 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1051 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1052 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1053 	}
1054 
1055 	ASSERT(ISP2(l2_sz));
1056 	ASSERT(ISP2(l2_linesz));
1057 	ASSERT(l2_sz > MMU_PAGESIZE);
1058 
1059 	/* l2_assoc is 0 for fully associative l2 cache */
1060 	if (l2_assoc)
1061 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1062 	else
1063 		l2_colors = 1;
1064 
1065 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1066 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1067 
1068 	/*
1069 	 * cpu_page_colors is non-zero when a page color may be spread across
1070 	 * multiple bins.
1071 	 */
1072 	if (l2_colors < page_colors)
1073 		cpu_page_colors = l2_colors;
1074 
1075 	ASSERT(ISP2(page_colors));
1076 
1077 	page_colors_mask = page_colors - 1;
1078 
1079 	ASSERT(ISP2(CPUSETSIZE()));
1080 	page_coloring_shift = lowbit(CPUSETSIZE());
1081 
1082 	/* size for mnoderanges */
1083 	mnoderangecnt = mnode_range_cnt();
1084 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1085 
1086 	/* size for fpc_mutex and cpc_mutex */
1087 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1088 
1089 	/* size of page_freelists */
1090 	colorsz += mnoderangecnt * sizeof (page_t ***);
1091 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1092 
1093 	for (i = 0; i < mmu_page_sizes; i++) {
1094 		colors = page_get_pagecolors(i);
1095 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1096 	}
1097 
1098 	/* size of page_cachelists */
1099 	colorsz += mnoderangecnt * sizeof (page_t **);
1100 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1101 
1102 	return (colorsz);
1103 }
1104 
1105 /*
1106  * Called once at startup to configure page_coloring data structures and
1107  * does the 1st page_free()/page_freelist_add().
1108  */
1109 void
1110 page_coloring_setup(caddr_t pcmemaddr)
1111 {
1112 	int	i;
1113 	int	j;
1114 	int	k;
1115 	caddr_t	addr;
1116 	int	colors;
1117 
1118 	/*
1119 	 * do page coloring setup
1120 	 */
1121 	addr = pcmemaddr;
1122 
1123 	mnoderanges = (mnoderange_t *)addr;
1124 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1125 
1126 	mnode_range_setup(mnoderanges);
1127 
1128 	if (physmax4g)
1129 		mtype4g = pfn_2_mtype(0xfffff);
1130 
1131 	for (k = 0; k < NPC_MUTEX; k++) {
1132 		fpc_mutex[k] = (kmutex_t *)addr;
1133 		addr += (max_mem_nodes * sizeof (kmutex_t));
1134 	}
1135 	for (k = 0; k < NPC_MUTEX; k++) {
1136 		cpc_mutex[k] = (kmutex_t *)addr;
1137 		addr += (max_mem_nodes * sizeof (kmutex_t));
1138 	}
1139 	page_freelists = (page_t ****)addr;
1140 	addr += (mnoderangecnt * sizeof (page_t ***));
1141 
1142 	page_cachelists = (page_t ***)addr;
1143 	addr += (mnoderangecnt * sizeof (page_t **));
1144 
1145 	for (i = 0; i < mnoderangecnt; i++) {
1146 		page_freelists[i] = (page_t ***)addr;
1147 		addr += (mmu_page_sizes * sizeof (page_t **));
1148 
1149 		for (j = 0; j < mmu_page_sizes; j++) {
1150 			colors = page_get_pagecolors(j);
1151 			page_freelists[i][j] = (page_t **)addr;
1152 			addr += (colors * sizeof (page_t *));
1153 		}
1154 		page_cachelists[i] = (page_t **)addr;
1155 		addr += (page_colors * sizeof (page_t *));
1156 	}
1157 }
1158 
/*
 * Return the color of a buffer.  The buffer is ignored here and the
 * color is always 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}
1165 
1166 /*
1167  * get a page from any list with the given mnode
1168  */
1169 page_t *
1170 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
1171     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
1172 {
1173 	kmutex_t	*pcm;
1174 	int		i;
1175 	page_t		*pp;
1176 	page_t		*first_pp;
1177 	uint64_t	pgaddr;
1178 	ulong_t		bin;
1179 	int		mtypestart;
1180 
1181 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
1182 
1183 	ASSERT((flags & PG_MATCH_COLOR) == 0);
1184 	ASSERT(szc == 0);
1185 	ASSERT(dma_attr != NULL);
1186 
1187 
1188 	MTYPE_START(mnode, mtype, flags);
1189 	if (mtype < 0) {
1190 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
1191 		return (NULL);
1192 	}
1193 
1194 	mtypestart = mtype;
1195 
1196 	bin = origbin;
1197 
1198 	/*
1199 	 * check up to page_colors + 1 bins - origbin may be checked twice
1200 	 * because of BIN_STEP skip
1201 	 */
1202 	do {
1203 		i = 0;
1204 		while (i <= page_colors) {
1205 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
1206 				goto nextfreebin;
1207 
1208 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1209 			mutex_enter(pcm);
1210 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
1211 			first_pp = pp;
1212 			while (pp != NULL) {
1213 				if (page_trylock(pp, SE_EXCL) == 0) {
1214 					pp = pp->p_next;
1215 					if (pp == first_pp) {
1216 						pp = NULL;
1217 					}
1218 					continue;
1219 				}
1220 
1221 				ASSERT(PP_ISFREE(pp));
1222 				ASSERT(PP_ISAGED(pp));
1223 				ASSERT(pp->p_vnode == NULL);
1224 				ASSERT(pp->p_hash == NULL);
1225 				ASSERT(pp->p_offset == (u_offset_t)-1);
1226 				ASSERT(pp->p_szc == szc);
1227 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1228 				/* check if page within DMA attributes */
1229 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
1230 
1231 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1232 				    (pgaddr + MMU_PAGESIZE - 1 <=
1233 				    dma_attr->dma_attr_addr_hi)) {
1234 					break;
1235 				}
1236 
1237 				/* continue looking */
1238 				page_unlock(pp);
1239 				pp = pp->p_next;
1240 				if (pp == first_pp)
1241 					pp = NULL;
1242 
1243 			}
1244 			if (pp != NULL) {
1245 				ASSERT(mtype == PP_2_MTYPE(pp));
1246 				ASSERT(pp->p_szc == 0);
1247 
1248 				/* found a page with specified DMA attributes */
1249 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
1250 				    mtype), pp);
1251 				page_ctr_sub(pp, PG_FREE_LIST);
1252 
1253 				if ((PP_ISFREE(pp) == 0) ||
1254 				    (PP_ISAGED(pp) == 0)) {
1255 					cmn_err(CE_PANIC, "page %p is not free",
1256 					    (void *)pp);
1257 				}
1258 
1259 				mutex_exit(pcm);
1260 				check_dma(dma_attr, pp, 1);
1261 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1262 				return (pp);
1263 			}
1264 			mutex_exit(pcm);
1265 nextfreebin:
1266 			pp = page_freelist_fill(szc, bin, mnode, mtype,
1267 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
1268 			if (pp)
1269 				return (pp);
1270 
1271 			/* try next bin */
1272 			bin += (i == 0) ? BIN_STEP : 1;
1273 			bin &= page_colors_mask;
1274 			i++;
1275 		}
1276 	} while ((flags & PGI_MT_RANGE) &&
1277 	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));
1278 
1279 	/* failed to find a page in the freelist; try it in the cachelist */
1280 
1281 	/* reset mtype start for cachelist search */
1282 	mtype = mtypestart;
1283 	ASSERT(mtype >= 0);
1284 
1285 	/* start with the bin of matching color */
1286 	bin = origbin;
1287 
1288 	do {
1289 		for (i = 0; i <= page_colors; i++) {
1290 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
1291 				goto nextcachebin;
1292 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
1293 			mutex_enter(pcm);
1294 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
1295 			first_pp = pp;
1296 			while (pp != NULL) {
1297 				if (page_trylock(pp, SE_EXCL) == 0) {
1298 					pp = pp->p_next;
1299 					if (pp == first_pp)
1300 						break;
1301 					continue;
1302 				}
1303 				ASSERT(pp->p_vnode);
1304 				ASSERT(PP_ISAGED(pp) == 0);
1305 				ASSERT(pp->p_szc == 0);
1306 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
1307 
1308 				/* check if page within DMA attributes */
1309 
1310 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
1311 
1312 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
1313 				    (pgaddr + MMU_PAGESIZE - 1 <=
1314 				    dma_attr->dma_attr_addr_hi)) {
1315 					break;
1316 				}
1317 
1318 				/* continue looking */
1319 				page_unlock(pp);
1320 				pp = pp->p_next;
1321 				if (pp == first_pp)
1322 					pp = NULL;
1323 			}
1324 
1325 			if (pp != NULL) {
1326 				ASSERT(mtype == PP_2_MTYPE(pp));
1327 				ASSERT(pp->p_szc == 0);
1328 
1329 				/* found a page with specified DMA attributes */
1330 				page_sub(&PAGE_CACHELISTS(mnode, bin,
1331 				    mtype), pp);
1332 				page_ctr_sub(pp, PG_CACHE_LIST);
1333 
1334 				mutex_exit(pcm);
1335 				ASSERT(pp->p_vnode);
1336 				ASSERT(PP_ISAGED(pp) == 0);
1337 				check_dma(dma_attr, pp, 1);
1338 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
1339 				return (pp);
1340 			}
1341 			mutex_exit(pcm);
1342 nextcachebin:
1343 			bin += (i == 0) ? BIN_STEP : 1;
1344 			bin &= page_colors_mask;
1345 		}
1346 	} while ((flags & PGI_MT_RANGE) &&
1347 	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));
1348 
1349 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
1350 	return (NULL);
1351 }
1352 
1353 /*
1354  * This function is similar to page_get_freelist()/page_get_cachelist()
1355  * but it searches both the lists to find a page with the specified
1356  * color (or no color) and DMA attributes. The search is done in the
1357  * freelist first and then in the cache list within the highest memory
1358  * range (based on DMA attributes) before searching in the lower
1359  * memory ranges.
1360  *
1361  * Note: This function is called only by page_create_io().
1362  */
1363 /*ARGSUSED*/
1364 page_t *
1365 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
1366     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
1367 {
1368 	uint_t		bin;
1369 	int		mtype;
1370 	page_t		*pp;
1371 	int		n;
1372 	int		m;
1373 	int		szc;
1374 	int		fullrange;
1375 	int		mnode;
1376 	int		local_failed_stat = 0;
1377 	lgrp_mnode_cookie_t	lgrp_cookie;
1378 
1379 	VM_STAT_ADD(pga_vmstats.pga_alloc);
1380 
1381 	/* only base pagesize currently supported */
1382 	if (size != MMU_PAGESIZE)
1383 		return (NULL);
1384 
1385 	/*
1386 	 * If we're passed a specific lgroup, we use it.  Otherwise,
1387 	 * assume first-touch placement is desired.
1388 	 */
1389 	if (!LGRP_EXISTS(lgrp))
1390 		lgrp = lgrp_home_lgrp();
1391 
1392 	/* LINTED */
1393 	AS_2_BIN(as, seg, vp, vaddr, bin);
1394 
1395 	/*
1396 	 * Only hold one freelist or cachelist lock at a time, that way we
1397 	 * can start anywhere and not have to worry about lock
1398 	 * ordering.
1399 	 */
1400 	if (dma_attr == NULL) {
1401 		n = 0;
1402 		m = mnoderangecnt - 1;
1403 		fullrange = 1;
1404 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
1405 	} else {
1406 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
1407 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
1408 
1409 		/*
1410 		 * We can guarantee alignment only for page boundary.
1411 		 */
1412 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
1413 			return (NULL);
1414 
1415 		n = pfn_2_mtype(pfnlo);
1416 		m = pfn_2_mtype(pfnhi);
1417 
1418 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
1419 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
1420 	}
1421 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
1422 
1423 	if (n > m)
1424 		return (NULL);
1425 
1426 	szc = 0;
1427 
1428 	/* cylcing thru mtype handled by RANGE0 if n == 0 */
1429 	if (n == 0) {
1430 		flags |= PGI_MT_RANGE0;
1431 		n = m;
1432 	}
1433 
1434 	/*
1435 	 * Try local memory node first, but try remote if we can't
1436 	 * get a page of the right color.
1437 	 */
1438 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
1439 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
1440 		/*
1441 		 * allocate pages from high pfn to low.
1442 		 */
1443 		for (mtype = m; mtype >= n; mtype--) {
1444 			if (fullrange != 0) {
1445 				pp = page_get_mnode_freelist(mnode,
1446 				    bin, mtype, szc, flags);
1447 				if (pp == NULL) {
1448 					pp = page_get_mnode_cachelist(
1449 						bin, flags, mnode, mtype);
1450 				}
1451 			} else {
1452 				pp = page_get_mnode_anylist(bin, szc,
1453 				    flags, mnode, mtype, dma_attr);
1454 			}
1455 			if (pp != NULL) {
1456 				VM_STAT_ADD(pga_vmstats.pga_allocok);
1457 				check_dma(dma_attr, pp, 1);
1458 				return (pp);
1459 			}
1460 		}
1461 		if (!local_failed_stat) {
1462 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
1463 			local_failed_stat = 1;
1464 		}
1465 	}
1466 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
1467 
1468 	return (NULL);
1469 }
1470 
/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator, so it is only used to create new pages (i.e., PG_EXCL
 * is set).
 *
 * Note: This interface is currently used by the x86 PSM only and is
 *	 not fully specified, so its commitment level is only that of
 *	 a private interface specific to x86. This interface uses the
 *	 PSM-specific page_get_anylist() interface.
 */
1485 
/*
 * Walk the page_hash chain at 'index' and leave 'pp' pointing at the
 * first page whose identity is <vp, off>, or NULL if the chain holds no
 * such page.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	(pp) = page_hash[(index)]; \
	while ((pp) != NULL && \
	    ((pp)->p_vnode != (vp) || (pp)->p_offset != (off))) { \
		(pp) = (pp)->p_hash; \
	} \
}
1492 
1493 
1494 page_t *
1495 page_create_io(
1496 	struct vnode	*vp,
1497 	u_offset_t	off,
1498 	uint_t		bytes,
1499 	uint_t		flags,
1500 	struct as	*as,
1501 	caddr_t		vaddr,
1502 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
1503 {
1504 	page_t		*plist = NULL;
1505 	uint_t		plist_len = 0;
1506 	pgcnt_t		npages;
1507 	page_t		*npp = NULL;
1508 	uint_t		pages_req;
1509 	page_t		*pp;
1510 	kmutex_t	*phm = NULL;
1511 	uint_t		index;
1512 
1513 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
1514 		"page_create_start:vp %p off %llx bytes %u flags %x",
1515 		vp, off, bytes, flags);
1516 
1517 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
1518 
1519 	pages_req = npages = mmu_btopr(bytes);
1520 
1521 	/*
1522 	 * Do the freemem and pcf accounting.
1523 	 */
1524 	if (!page_create_wait(npages, flags)) {
1525 		return (NULL);
1526 	}
1527 
1528 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
1529 		"page_create_success:vp %p off %llx",
1530 		vp, off);
1531 
1532 	/*
1533 	 * If satisfying this request has left us with too little
1534 	 * memory, start the wheels turning to get some back.  The
1535 	 * first clause of the test prevents waking up the pageout
1536 	 * daemon in situations where it would decide that there's
1537 	 * nothing to do.
1538 	 */
1539 	if (nscan < desscan && freemem < minfree) {
1540 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
1541 			"pageout_cv_signal:freemem %ld", freemem);
1542 		cv_signal(&proc_pageout->p_cv);
1543 	}
1544 
1545 	if (flags & PG_PHYSCONTIG) {
1546 
1547 		plist = page_get_contigpage(&npages, mattr, 1);
1548 		if (plist == NULL) {
1549 			page_create_putback(npages);
1550 			return (NULL);
1551 		}
1552 
1553 		pp = plist;
1554 
1555 		do {
1556 			if (!page_hashin(pp, vp, off, NULL)) {
1557 				panic("pg_creat_io: hashin failed %p %p %llx",
1558 				    (void *)pp, (void *)vp, off);
1559 			}
1560 			VM_STAT_ADD(page_create_new);
1561 			off += MMU_PAGESIZE;
1562 			PP_CLRFREE(pp);
1563 			PP_CLRAGED(pp);
1564 			page_set_props(pp, P_REF);
1565 			pp = pp->p_next;
1566 		} while (pp != plist);
1567 
1568 		if (!npages) {
1569 			check_dma(mattr, plist, pages_req);
1570 			return (plist);
1571 		} else {
1572 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
1573 		}
1574 
1575 		/*
1576 		 * fall-thru:
1577 		 *
1578 		 * page_get_contigpage returns when npages <= sgllen.
1579 		 * Grab the rest of the non-contig pages below from anylist.
1580 		 */
1581 	}
1582 
1583 	/*
1584 	 * Loop around collecting the requested number of pages.
1585 	 * Most of the time, we have to `create' a new page. With
1586 	 * this in mind, pull the page off the free list before
1587 	 * getting the hash lock.  This will minimize the hash
1588 	 * lock hold time, nesting, and the like.  If it turns
1589 	 * out we don't need the page, we put it back at the end.
1590 	 */
1591 	while (npages--) {
1592 		phm = NULL;
1593 
1594 		index = PAGE_HASH_FUNC(vp, off);
1595 top:
1596 		ASSERT(phm == NULL);
1597 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
1598 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1599 
1600 		if (npp == NULL) {
1601 			/*
1602 			 * Try to get the page of any color either from
1603 			 * the freelist or from the cache list.
1604 			 */
1605 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
1606 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
1607 			if (npp == NULL) {
1608 				if (mattr == NULL) {
1609 					/*
1610 					 * Not looking for a special page;
1611 					 * panic!
1612 					 */
1613 					panic("no page found %d", (int)npages);
1614 				}
1615 				/*
1616 				 * No page found! This can happen
1617 				 * if we are looking for a page
1618 				 * within a specific memory range
1619 				 * for DMA purposes. If PG_WAIT is
1620 				 * specified then we wait for a
1621 				 * while and then try again. The
1622 				 * wait could be forever if we
1623 				 * don't get the page(s) we need.
1624 				 *
1625 				 * Note: XXX We really need a mechanism
1626 				 * to wait for pages in the desired
1627 				 * range. For now, we wait for any
1628 				 * pages and see if we can use it.
1629 				 */
1630 
1631 				if ((mattr != NULL) && (flags & PG_WAIT)) {
1632 					delay(10);
1633 					goto top;
1634 				}
1635 
1636 				goto fail; /* undo accounting stuff */
1637 			}
1638 
1639 			if (PP_ISAGED(npp) == 0) {
1640 				/*
1641 				 * Since this page came from the
1642 				 * cachelist, we must destroy the
1643 				 * old vnode association.
1644 				 */
1645 				page_hashout(npp, (kmutex_t *)NULL);
1646 			}
1647 		}
1648 
1649 		/*
1650 		 * We own this page!
1651 		 */
1652 		ASSERT(PAGE_EXCL(npp));
1653 		ASSERT(npp->p_vnode == NULL);
1654 		ASSERT(!hat_page_is_mapped(npp));
1655 		PP_CLRFREE(npp);
1656 		PP_CLRAGED(npp);
1657 
1658 		/*
1659 		 * Here we have a page in our hot little mits and are
1660 		 * just waiting to stuff it on the appropriate lists.
1661 		 * Get the mutex and check to see if it really does
1662 		 * not exist.
1663 		 */
1664 		phm = PAGE_HASH_MUTEX(index);
1665 		mutex_enter(phm);
1666 		PAGE_HASH_SEARCH(index, pp, vp, off);
1667 		if (pp == NULL) {
1668 			VM_STAT_ADD(page_create_new);
1669 			pp = npp;
1670 			npp = NULL;
1671 			if (!page_hashin(pp, vp, off, phm)) {
1672 				/*
1673 				 * Since we hold the page hash mutex and
1674 				 * just searched for this page, page_hashin
1675 				 * had better not fail.  If it does, that
1676 				 * means somethread did not follow the
1677 				 * page hash mutex rules.  Panic now and
1678 				 * get it over with.  As usual, go down
1679 				 * holding all the locks.
1680 				 */
1681 				ASSERT(MUTEX_HELD(phm));
1682 				panic("page_create: hashin fail %p %p %llx %p",
1683 				    (void *)pp, (void *)vp, off, (void *)phm);
1684 
1685 			}
1686 			ASSERT(MUTEX_HELD(phm));
1687 			mutex_exit(phm);
1688 			phm = NULL;
1689 
1690 			/*
1691 			 * Hat layer locking need not be done to set
1692 			 * the following bits since the page is not hashed
1693 			 * and was on the free list (i.e., had no mappings).
1694 			 *
1695 			 * Set the reference bit to protect
1696 			 * against immediate pageout
1697 			 *
1698 			 * XXXmh modify freelist code to set reference
1699 			 * bit so we don't have to do it here.
1700 			 */
1701 			page_set_props(pp, P_REF);
1702 		} else {
1703 			ASSERT(MUTEX_HELD(phm));
1704 			mutex_exit(phm);
1705 			phm = NULL;
1706 			/*
1707 			 * NOTE: This should not happen for pages associated
1708 			 *	 with kernel vnode 'kvp'.
1709 			 */
1710 			/* XX64 - to debug why this happens! */
1711 			ASSERT(vp != &kvp);
1712 			if (vp == &kvp)
1713 				cmn_err(CE_NOTE,
1714 				    "page_create: page not expected "
1715 				    "in hash list for kernel vnode - pp 0x%p",
1716 				    (void *)pp);
1717 			VM_STAT_ADD(page_create_exists);
1718 			goto fail;
1719 		}
1720 
1721 		/*
1722 		 * Got a page!  It is locked.  Acquire the i/o
1723 		 * lock since we are going to use the p_next and
1724 		 * p_prev fields to link the requested pages together.
1725 		 */
1726 		page_io_lock(pp);
1727 		page_add(&plist, pp);
1728 		plist = plist->p_next;
1729 		off += MMU_PAGESIZE;
1730 		vaddr += MMU_PAGESIZE;
1731 	}
1732 
1733 	check_dma(mattr, plist, pages_req);
1734 	return (plist);
1735 
1736 fail:
1737 	if (npp != NULL) {
1738 		/*
1739 		 * Did not need this page after all.
1740 		 * Put it back on the free list.
1741 		 */
1742 		VM_STAT_ADD(page_create_putbacks);
1743 		PP_SETFREE(npp);
1744 		PP_SETAGED(npp);
1745 		npp->p_offset = (u_offset_t)-1;
1746 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
1747 		page_unlock(npp);
1748 	}
1749 
1750 	/*
1751 	 * Give up the pages we already got.
1752 	 */
1753 	while (plist != NULL) {
1754 		pp = plist;
1755 		page_sub(&plist, pp);
1756 		page_io_unlock(pp);
1757 		plist_len++;
1758 		/*LINTED: constant in conditional ctx*/
1759 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
1760 	}
1761 
1762 	/*
1763 	 * VN_DISPOSE does freemem accounting for the pages in plist
1764 	 * by calling page_free. So, we need to undo the pcf accounting
1765 	 * for only the remaining pages.
1766 	 */
1767 	VM_STAT_ADD(page_create_putbacks);
1768 	page_create_putback(pages_req - plist_len);
1769 
1770 	return (NULL);
1771 }
1772 
1773 
1774 /*
1775  * Copy the data from the physical page represented by "frompp" to
1776  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
1777  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
1778  * level and no one sleeps with an active mapping there.
1779  *
1780  * Note that the ref/mod bits in the page_t's are not affected by
1781  * this operation, hence it is up to the caller to update them appropriately.
1782  */
1783 void
1784 ppcopy(page_t *frompp, page_t *topp)
1785 {
1786 	caddr_t		pp_addr1;
1787 	caddr_t		pp_addr2;
1788 	void		*pte1;
1789 	void		*pte2;
1790 	kmutex_t	*ppaddr_mutex;
1791 
1792 	ASSERT_STACK_ALIGNED();
1793 	ASSERT(PAGE_LOCKED(frompp));
1794 	ASSERT(PAGE_LOCKED(topp));
1795 
1796 	if (kpm_enable) {
1797 		pp_addr1 = hat_kpm_page2va(frompp, 0);
1798 		pp_addr2 = hat_kpm_page2va(topp, 0);
1799 		kpreempt_disable();
1800 	} else {
1801 		/*
1802 		 * disable pre-emption so that CPU can't change
1803 		 */
1804 		kpreempt_disable();
1805 
1806 		pp_addr1 = CPU->cpu_caddr1;
1807 		pp_addr2 = CPU->cpu_caddr2;
1808 		pte1 = (void *)CPU->cpu_caddr1pte;
1809 		pte2 = (void *)CPU->cpu_caddr2pte;
1810 
1811 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1812 		mutex_enter(ppaddr_mutex);
1813 
1814 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
1815 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
1816 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
1817 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1818 		    HAT_LOAD_NOCONSIST);
1819 	}
1820 
1821 	if (use_sse_pagecopy)
1822 		hwblkpagecopy(pp_addr1, pp_addr2);
1823 	else
1824 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
1825 
1826 	if (!kpm_enable)
1827 		mutex_exit(ppaddr_mutex);
1828 	kpreempt_enable();
1829 }
1830 
1831 /*
1832  * Zero the physical page from off to off + len given by `pp'
1833  * without changing the reference and modified bits of page.
1834  *
1835  * We use this using CPU private page address #2, see ppcopy() for more info.
1836  * pagezero() must not be called at interrupt level.
1837  */
1838 void
1839 pagezero(page_t *pp, uint_t off, uint_t len)
1840 {
1841 	caddr_t		pp_addr2;
1842 	void		*pte2;
1843 	kmutex_t	*ppaddr_mutex;
1844 
1845 	ASSERT_STACK_ALIGNED();
1846 	ASSERT(len <= MMU_PAGESIZE);
1847 	ASSERT(off <= MMU_PAGESIZE);
1848 	ASSERT(off + len <= MMU_PAGESIZE);
1849 	ASSERT(PAGE_LOCKED(pp));
1850 
1851 	if (kpm_enable) {
1852 		pp_addr2 = hat_kpm_page2va(pp, 0);
1853 		kpreempt_disable();
1854 	} else {
1855 		kpreempt_disable();
1856 
1857 		pp_addr2 = CPU->cpu_caddr2;
1858 		pte2 = (void *)CPU->cpu_caddr2pte;
1859 
1860 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
1861 		mutex_enter(ppaddr_mutex);
1862 
1863 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
1864 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
1865 		    HAT_LOAD_NOCONSIST);
1866 	}
1867 
1868 	if (use_sse_pagezero)
1869 		hwblkclr(pp_addr2 + off, len);
1870 	else
1871 		bzero(pp_addr2 + off, len);
1872 
1873 	if (!kpm_enable)
1874 		mutex_exit(ppaddr_mutex);
1875 	kpreempt_enable();
1876 }
1877 
1878 /*
1879  * Platform-dependent page scrub call.
1880  */
1881 void
1882 pagescrub(page_t *pp, uint_t off, uint_t len)
1883 {
1884 	/*
1885 	 * For now, we rely on the fact that pagezero() will
1886 	 * always clear UEs.
1887 	 */
1888 	pagezero(pp, off, len);
1889 }
1890 
1891 /*
1892  * set up two private addresses for use on a given CPU for use in ppcopy()
1893  */
1894 void
1895 setup_vaddr_for_ppcopy(struct cpu *cpup)
1896 {
1897 	void *addr;
1898 	void *pte;
1899 
1900 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1901 	pte = hat_mempte_setup(addr);
1902 	cpup->cpu_caddr1 = addr;
1903 	cpup->cpu_caddr1pte = (pteptr_t)pte;
1904 
1905 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
1906 	pte = hat_mempte_setup(addr);
1907 	cpup->cpu_caddr2 = addr;
1908 	cpup->cpu_caddr2pte = (pteptr_t)pte;
1909 
1910 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
1911 }
1912 
1913 
1914 /*
1915  * Create the pageout scanner thread. The thread has to
1916  * start at procedure with process pp and priority pri.
1917  */
1918 void
1919 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1920 {
1921 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1922 }
1923 
/*
 * Post-startup MMU initialization hook.  Nothing is done here; it is
 * unclear whether there is any use for it.
 */
void
post_startup_mmu_initialization(void)
{
	/* intentionally empty */
}
1930 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is a no-op.
 *
 * Takes no arguments and returns nothing.  The parameter list is
 * spelled (void) so that the definition is a real prototype, matching
 * post_startup_mmu_initialization(void) in this file.
 */
void
dcache_flushall(void)
{
	/* intentionally empty */
}
1938