17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5aa042c4bSkchow * Common Development and Distribution License (the "License"). 6aa042c4bSkchow * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 22cb15d5d9SPeter Rival * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 237c478bd9Sstevel@tonic-gate */ 24a3114836SGerry Liu /* 25a3114836SGerry Liu * Copyright (c) 2010, Intel Corporation. 26a3114836SGerry Liu * All rights reserved. 27*dddac438SJohn Levon * Copyright 2019, Joyent, Inc. 
28a3114836SGerry Liu */ 297c478bd9Sstevel@tonic-gate 307c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 317c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 327c478bd9Sstevel@tonic-gate 337c478bd9Sstevel@tonic-gate /* 347c478bd9Sstevel@tonic-gate * Portions of this source code were derived from Berkeley 4.3 BSD 357c478bd9Sstevel@tonic-gate * under license from the Regents of the University of California. 367c478bd9Sstevel@tonic-gate */ 377c478bd9Sstevel@tonic-gate 387c478bd9Sstevel@tonic-gate /* 397c478bd9Sstevel@tonic-gate * UNIX machine dependent virtual memory support. 407c478bd9Sstevel@tonic-gate */ 417c478bd9Sstevel@tonic-gate 427c478bd9Sstevel@tonic-gate #include <sys/types.h> 437c478bd9Sstevel@tonic-gate #include <sys/param.h> 447c478bd9Sstevel@tonic-gate #include <sys/systm.h> 457c478bd9Sstevel@tonic-gate #include <sys/user.h> 467c478bd9Sstevel@tonic-gate #include <sys/proc.h> 477c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 487c478bd9Sstevel@tonic-gate #include <sys/vmem.h> 497c478bd9Sstevel@tonic-gate #include <sys/buf.h> 507c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 517c478bd9Sstevel@tonic-gate #include <sys/lgrp.h> 527c478bd9Sstevel@tonic-gate #include <sys/disp.h> 537c478bd9Sstevel@tonic-gate #include <sys/vm.h> 547c478bd9Sstevel@tonic-gate #include <sys/mman.h> 557c478bd9Sstevel@tonic-gate #include <sys/vnode.h> 567c478bd9Sstevel@tonic-gate #include <sys/cred.h> 577c478bd9Sstevel@tonic-gate #include <sys/exec.h> 587c478bd9Sstevel@tonic-gate #include <sys/exechdr.h> 597c478bd9Sstevel@tonic-gate #include <sys/debug.h> 60ec25b48fSsusans #include <sys/vmsystm.h> 61cb15d5d9SPeter Rival #include <sys/swap.h> 621f84c0d7SDave Plauger #include <sys/dumphdr.h> 63d2a70789SRichard Lowe #include <sys/random.h> 647c478bd9Sstevel@tonic-gate 657c478bd9Sstevel@tonic-gate #include <vm/hat.h> 667c478bd9Sstevel@tonic-gate #include <vm/as.h> 677c478bd9Sstevel@tonic-gate #include <vm/seg.h> 687c478bd9Sstevel@tonic-gate #include 
<vm/seg_kp.h> 697c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h> 707c478bd9Sstevel@tonic-gate #include <vm/page.h> 717c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 727c478bd9Sstevel@tonic-gate #include <vm/seg_kpm.h> 737c478bd9Sstevel@tonic-gate #include <vm/vm_dep.h> 747c478bd9Sstevel@tonic-gate 757c478bd9Sstevel@tonic-gate #include <sys/cpu.h> 767c478bd9Sstevel@tonic-gate #include <sys/vm_machparam.h> 777c478bd9Sstevel@tonic-gate #include <sys/memlist.h> 787c478bd9Sstevel@tonic-gate #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 797c478bd9Sstevel@tonic-gate #include <vm/hat_i86.h> 807c478bd9Sstevel@tonic-gate #include <sys/x86_archext.h> 817c478bd9Sstevel@tonic-gate #include <sys/elf_386.h> 827c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 837c478bd9Sstevel@tonic-gate #include <sys/archsystm.h> 847c478bd9Sstevel@tonic-gate #include <sys/machsystm.h> 85d2a70789SRichard Lowe #include <sys/secflags.h> 867c478bd9Sstevel@tonic-gate 877c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 887c478bd9Sstevel@tonic-gate #include <sys/ddidmareq.h> 897c478bd9Sstevel@tonic-gate #include <sys/promif.h> 907c478bd9Sstevel@tonic-gate #include <sys/memnode.h> 917c478bd9Sstevel@tonic-gate #include <sys/stack.h> 92843e1988Sjohnlev #include <util/qsort.h> 93843e1988Sjohnlev #include <sys/taskq.h> 94843e1988Sjohnlev 95843e1988Sjohnlev #ifdef __xpv 96843e1988Sjohnlev 97843e1988Sjohnlev #include <sys/hypervisor.h> 98843e1988Sjohnlev #include <sys/xen_mmu.h> 99843e1988Sjohnlev #include <sys/balloon_impl.h> 100843e1988Sjohnlev 101843e1988Sjohnlev /* 102843e1988Sjohnlev * domain 0 pages usable for DMA are kept pre-allocated and kept in 103843e1988Sjohnlev * distinct lists, ordered by increasing mfn. 
104843e1988Sjohnlev */ 105843e1988Sjohnlev static kmutex_t io_pool_lock; 106b9bc7f78Ssmaybe static kmutex_t contig_list_lock; 107843e1988Sjohnlev static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */ 108843e1988Sjohnlev static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */ 109843e1988Sjohnlev static long io_pool_cnt; 110843e1988Sjohnlev static long io_pool_cnt_max = 0; 111843e1988Sjohnlev #define DEFAULT_IO_POOL_MIN 128 112843e1988Sjohnlev static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN; 113843e1988Sjohnlev static long io_pool_cnt_lowater = 0; 114843e1988Sjohnlev static long io_pool_shrink_attempts; /* how many times did we try to shrink */ 115843e1988Sjohnlev static long io_pool_shrinks; /* how many times did we really shrink */ 116843e1988Sjohnlev static long io_pool_grows; /* how many times did we grow */ 117843e1988Sjohnlev static mfn_t start_mfn = 1; 118843e1988Sjohnlev static caddr_t io_pool_kva; /* use to alloc pages when needed */ 119843e1988Sjohnlev 120843e1988Sjohnlev static int create_contig_pfnlist(uint_t); 121843e1988Sjohnlev 122843e1988Sjohnlev /* 123843e1988Sjohnlev * percentage of phys mem to hold in the i/o pool 124843e1988Sjohnlev */ 125843e1988Sjohnlev #define DEFAULT_IO_POOL_PCT 2 126843e1988Sjohnlev static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT; 127843e1988Sjohnlev static void page_io_pool_sub(page_t **, page_t *, page_t *); 128b9bc7f78Ssmaybe int ioalloc_dbg = 0; 129843e1988Sjohnlev 130843e1988Sjohnlev #endif /* __xpv */ 1317c478bd9Sstevel@tonic-gate 1325d07b933Sdp uint_t vac_colors = 1; 1337c478bd9Sstevel@tonic-gate 1347c478bd9Sstevel@tonic-gate int largepagesupport = 0; 1357c478bd9Sstevel@tonic-gate extern uint_t page_create_new; 1367c478bd9Sstevel@tonic-gate extern uint_t page_create_exists; 1377c478bd9Sstevel@tonic-gate extern uint_t page_create_putbacks; 138ae115bc7Smrj /* 139ae115bc7Smrj * Allow users to disable the kernel's use of SSE. 
140ae115bc7Smrj */ 141ae115bc7Smrj extern int use_sse_pagecopy, use_sse_pagezero; 1427c478bd9Sstevel@tonic-gate 143d94ffb28Sjmcp /* 144d94ffb28Sjmcp * combined memory ranges from mnode and memranges[] to manage single 145d94ffb28Sjmcp * mnode/mtype dimension in the page lists. 146d94ffb28Sjmcp */ 147d94ffb28Sjmcp typedef struct { 148d94ffb28Sjmcp pfn_t mnr_pfnlo; 149d94ffb28Sjmcp pfn_t mnr_pfnhi; 150d94ffb28Sjmcp int mnr_mnode; 151d94ffb28Sjmcp int mnr_memrange; /* index into memranges[] */ 152d94ffb28Sjmcp int mnr_next; /* next lower PA mnoderange */ 153d94ffb28Sjmcp int mnr_exists; 154d94ffb28Sjmcp /* maintain page list stats */ 155d94ffb28Sjmcp pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ 156d94ffb28Sjmcp pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */ 157d94ffb28Sjmcp pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */ 158d94ffb28Sjmcp #ifdef DEBUG 159d94ffb28Sjmcp struct mnr_mts { /* mnode/mtype szc stats */ 160d94ffb28Sjmcp pgcnt_t mnr_mts_pgcnt; 161d94ffb28Sjmcp int mnr_mts_colors; 162d94ffb28Sjmcp pgcnt_t *mnr_mtsc_pgcnt; 163*dddac438SJohn Levon } *mnr_mts; 164d94ffb28Sjmcp #endif 165d94ffb28Sjmcp } mnoderange_t; 166d94ffb28Sjmcp 167d94ffb28Sjmcp #define MEMRANGEHI(mtype) \ 168d94ffb28Sjmcp ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 169d94ffb28Sjmcp #define MEMRANGELO(mtype) (memranges[mtype]) 170d94ffb28Sjmcp 171d94ffb28Sjmcp #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt) 172d94ffb28Sjmcp 173843e1988Sjohnlev /* 174843e1988Sjohnlev * As the PC architecture evolved memory up was clumped into several 175843e1988Sjohnlev * ranges for various historical I/O devices to do DMA. 176843e1988Sjohnlev * < 16Meg - ISA bus 177843e1988Sjohnlev * < 2Gig - ??? 178843e1988Sjohnlev * < 4Gig - PCI bus or drivers that don't understand PAE mode 179843e1988Sjohnlev * 180843e1988Sjohnlev * These are listed in reverse order, so that we can skip over unused 181843e1988Sjohnlev * ranges on machines with small memories. 
182843e1988Sjohnlev * 183843e1988Sjohnlev * For now under the Hypervisor, we'll only ever have one memrange. 184843e1988Sjohnlev */ 185843e1988Sjohnlev #define PFN_4GIG 0x100000 186843e1988Sjohnlev #define PFN_16MEG 0x1000 187a3114836SGerry Liu /* Indices into the memory range (arch_memranges) array. */ 188a3114836SGerry Liu #define MRI_4G 0 189a3114836SGerry Liu #define MRI_2G 1 190a3114836SGerry Liu #define MRI_16M 2 191a3114836SGerry Liu #define MRI_0 3 192843e1988Sjohnlev static pfn_t arch_memranges[NUM_MEM_RANGES] = { 193843e1988Sjohnlev PFN_4GIG, /* pfn range for 4G and above */ 194843e1988Sjohnlev 0x80000, /* pfn range for 2G-4G */ 195843e1988Sjohnlev PFN_16MEG, /* pfn range for 16M-2G */ 196843e1988Sjohnlev 0x00000, /* pfn range for 0-16M */ 197843e1988Sjohnlev }; 198843e1988Sjohnlev pfn_t *memranges = &arch_memranges[0]; 199843e1988Sjohnlev int nranges = NUM_MEM_RANGES; 200843e1988Sjohnlev 201843e1988Sjohnlev /* 202843e1988Sjohnlev * This combines mem_node_config and memranges into one data 203843e1988Sjohnlev * structure to be used for page list management. 204843e1988Sjohnlev */ 205*dddac438SJohn Levon static mnoderange_t *mnoderanges; 206*dddac438SJohn Levon static int mnoderangecnt; 207*dddac438SJohn Levon static int mtype4g; 208*dddac438SJohn Levon static int mtype16m; 209*dddac438SJohn Levon static int mtypetop; 210843e1988Sjohnlev 211843e1988Sjohnlev /* 212843e1988Sjohnlev * 4g memory management variables for systems with more than 4g of memory: 213843e1988Sjohnlev * 214843e1988Sjohnlev * physical memory below 4g is required for 32bit dma devices and, currently, 215843e1988Sjohnlev * for kmem memory. On systems with more than 4g of memory, the pool of memory 216843e1988Sjohnlev * below 4g can be depleted without any paging activity given that there is 217843e1988Sjohnlev * likely to be sufficient memory above 4g. 218843e1988Sjohnlev * 219843e1988Sjohnlev * physmax4g is set true if the largest pfn is over 4g. 
The rest of the 220843e1988Sjohnlev * 4g memory management code is enabled only when physmax4g is true. 221843e1988Sjohnlev * 222843e1988Sjohnlev * maxmem4g is the count of the maximum number of pages on the page lists 223843e1988Sjohnlev * with physical addresses below 4g. It can be a lot less then 4g given that 224843e1988Sjohnlev * BIOS may reserve large chunks of space below 4g for hot plug pci devices, 225843e1988Sjohnlev * agp aperture etc. 226843e1988Sjohnlev * 227843e1988Sjohnlev * freemem4g maintains the count of the number of available pages on the 228843e1988Sjohnlev * page lists with physical addresses below 4g. 229843e1988Sjohnlev * 230843e1988Sjohnlev * DESFREE4G specifies the desired amount of below 4g memory. It defaults to 231843e1988Sjohnlev * 6% (desfree4gshift = 4) of maxmem4g. 232843e1988Sjohnlev * 233843e1988Sjohnlev * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G 234843e1988Sjohnlev * and the amount of physical memory above 4g is greater than freemem4g. 235843e1988Sjohnlev * In this case, page_get_* routines will restrict below 4g allocations 236843e1988Sjohnlev * for requests that don't specifically require it. 237843e1988Sjohnlev */ 238843e1988Sjohnlev 239843e1988Sjohnlev #define DESFREE4G (maxmem4g >> desfree4gshift) 240843e1988Sjohnlev 241843e1988Sjohnlev #define RESTRICT4G_ALLOC \ 242843e1988Sjohnlev (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem)) 2437c478bd9Sstevel@tonic-gate 244843e1988Sjohnlev static pgcnt_t maxmem4g; 245843e1988Sjohnlev static pgcnt_t freemem4g; 246843e1988Sjohnlev static int physmax4g; 247843e1988Sjohnlev static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */ 248843e1988Sjohnlev 249843e1988Sjohnlev /* 250843e1988Sjohnlev * 16m memory management: 251843e1988Sjohnlev * 252843e1988Sjohnlev * reserve some amount of physical memory below 16m for legacy devices. 
253843e1988Sjohnlev * 254843e1988Sjohnlev * RESTRICT16M_ALLOC returns true if an there are sufficient free pages above 255843e1988Sjohnlev * 16m or if the 16m pool drops below DESFREE16M. 256843e1988Sjohnlev * 257843e1988Sjohnlev * In this case, general page allocations via page_get_{free,cache}list 258843e1988Sjohnlev * routines will be restricted from allocating from the 16m pool. Allocations 259843e1988Sjohnlev * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations 260843e1988Sjohnlev * are not restricted. 261843e1988Sjohnlev */ 262843e1988Sjohnlev 263a3114836SGerry Liu #define FREEMEM16M MTYPE_FREEMEM(mtype16m) 264843e1988Sjohnlev #define DESFREE16M desfree16m 265*dddac438SJohn Levon #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \ 266*dddac438SJohn Levon (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \ 267*dddac438SJohn Levon ((freemem >= (FREEMEM16M)) || \ 268843e1988Sjohnlev (FREEMEM16M < (DESFREE16M + pgcnt)))) 269843e1988Sjohnlev 270843e1988Sjohnlev static pgcnt_t desfree16m = 0x380; 271843e1988Sjohnlev 272843e1988Sjohnlev /* 273843e1988Sjohnlev * This can be patched via /etc/system to allow old non-PAE aware device 274843e1988Sjohnlev * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 
275843e1988Sjohnlev */ 276843e1988Sjohnlev int restricted_kmemalloc = 0; 27707ad560dSkchow 2787c478bd9Sstevel@tonic-gate #ifdef VM_STATS 2797c478bd9Sstevel@tonic-gate struct { 2807c478bd9Sstevel@tonic-gate ulong_t pga_alloc; 2817c478bd9Sstevel@tonic-gate ulong_t pga_notfullrange; 2827c478bd9Sstevel@tonic-gate ulong_t pga_nulldmaattr; 2837c478bd9Sstevel@tonic-gate ulong_t pga_allocok; 2847c478bd9Sstevel@tonic-gate ulong_t pga_allocfailed; 2857c478bd9Sstevel@tonic-gate ulong_t pgma_alloc; 2867c478bd9Sstevel@tonic-gate ulong_t pgma_allocok; 2877c478bd9Sstevel@tonic-gate ulong_t pgma_allocfailed; 2887c478bd9Sstevel@tonic-gate ulong_t pgma_allocempty; 2897c478bd9Sstevel@tonic-gate } pga_vmstats; 2907c478bd9Sstevel@tonic-gate #endif 2917c478bd9Sstevel@tonic-gate 2927c478bd9Sstevel@tonic-gate uint_t mmu_page_sizes; 2937c478bd9Sstevel@tonic-gate 2947c478bd9Sstevel@tonic-gate /* How many page sizes the users can see */ 2957c478bd9Sstevel@tonic-gate uint_t mmu_exported_page_sizes; 2967c478bd9Sstevel@tonic-gate 29702bc52beSkchow /* page sizes that legacy applications can see */ 29802bc52beSkchow uint_t mmu_legacy_page_sizes; 29902bc52beSkchow 300beb1bda0Sdavemq /* 301beb1bda0Sdavemq * Number of pages in 1 GB. Don't enable automatic large pages if we have 302beb1bda0Sdavemq * fewer than this many pages. 303beb1bda0Sdavemq */ 304ec25b48fSsusans pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 305ec25b48fSsusans pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 306ec25b48fSsusans 307ec25b48fSsusans /* 308ec25b48fSsusans * Maximum and default segment size tunables for user private 309ec25b48fSsusans * and shared anon memory, and user text and initialized data. 310ec25b48fSsusans * These can be patched via /etc/system to allow large pages 311ec25b48fSsusans * to be used for mapping application private and shared anon memory. 
312ec25b48fSsusans */ 313ec25b48fSsusans size_t mcntl0_lpsize = MMU_PAGESIZE; 314ec25b48fSsusans size_t max_uheap_lpsize = MMU_PAGESIZE; 315ec25b48fSsusans size_t default_uheap_lpsize = MMU_PAGESIZE; 316ec25b48fSsusans size_t max_ustack_lpsize = MMU_PAGESIZE; 317ec25b48fSsusans size_t default_ustack_lpsize = MMU_PAGESIZE; 318ec25b48fSsusans size_t max_privmap_lpsize = MMU_PAGESIZE; 319ec25b48fSsusans size_t max_uidata_lpsize = MMU_PAGESIZE; 320ec25b48fSsusans size_t max_utext_lpsize = MMU_PAGESIZE; 321ec25b48fSsusans size_t max_shm_lpsize = MMU_PAGESIZE; 3227c478bd9Sstevel@tonic-gate 323843e1988Sjohnlev 324843e1988Sjohnlev /* 325843e1988Sjohnlev * initialized by page_coloring_init(). 326843e1988Sjohnlev */ 327843e1988Sjohnlev uint_t page_colors; 328843e1988Sjohnlev uint_t page_colors_mask; 329843e1988Sjohnlev uint_t page_coloring_shift; 330843e1988Sjohnlev int cpu_page_colors; 331843e1988Sjohnlev static uint_t l2_colors; 332843e1988Sjohnlev 333843e1988Sjohnlev /* 334843e1988Sjohnlev * Page freelists and cachelists are dynamically allocated once mnoderangecnt 335843e1988Sjohnlev * and page_colors are calculated from the l2 cache n-way set size. Within a 336843e1988Sjohnlev * mnode range, the page freelist and cachelist are hashed into bins based on 337843e1988Sjohnlev * color. This makes it easier to search for a page within a specific memory 338843e1988Sjohnlev * range. 
339843e1988Sjohnlev */ 340843e1988Sjohnlev #define PAGE_COLORS_MIN 16 341843e1988Sjohnlev 342843e1988Sjohnlev page_t ****page_freelists; 343843e1988Sjohnlev page_t ***page_cachelists; 344843e1988Sjohnlev 345843e1988Sjohnlev 346843e1988Sjohnlev /* 347843e1988Sjohnlev * Used by page layer to know about page sizes 348843e1988Sjohnlev */ 349843e1988Sjohnlev hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 350843e1988Sjohnlev 351d94ffb28Sjmcp kmutex_t *fpc_mutex[NPC_MUTEX]; 352d94ffb28Sjmcp kmutex_t *cpc_mutex[NPC_MUTEX]; 353843e1988Sjohnlev 354a3114836SGerry Liu /* Lock to protect mnoderanges array for memory DR operations. */ 355a3114836SGerry Liu static kmutex_t mnoderange_lock; 356a3114836SGerry Liu 357843e1988Sjohnlev /* 358843e1988Sjohnlev * Only let one thread at a time try to coalesce large pages, to 359843e1988Sjohnlev * prevent them from working against each other. 360843e1988Sjohnlev */ 361843e1988Sjohnlev static kmutex_t contig_lock; 362843e1988Sjohnlev #define CONTIG_LOCK() mutex_enter(&contig_lock); 363843e1988Sjohnlev #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 364843e1988Sjohnlev 365843e1988Sjohnlev #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 366843e1988Sjohnlev 36774ecdb51SJohn Levon caddr_t 36874ecdb51SJohn Levon i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot) 36974ecdb51SJohn Levon { 37074ecdb51SJohn Levon caddr_t addr; 37174ecdb51SJohn Levon caddr_t addr1; 37274ecdb51SJohn Levon page_t *pp; 37374ecdb51SJohn Levon 37474ecdb51SJohn Levon addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP); 37574ecdb51SJohn Levon 37674ecdb51SJohn Levon for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) { 37774ecdb51SJohn Levon pp = page_numtopp_nolock(pf); 37874ecdb51SJohn Levon if (pp == NULL) { 37974ecdb51SJohn Levon hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf, 38074ecdb51SJohn Levon prot | HAT_NOSYNC, HAT_LOAD_LOCK); 38174ecdb51SJohn Levon } else { 38274ecdb51SJohn Levon hat_memload(kas.a_hat, addr, pp, 38374ecdb51SJohn Levon prot | 
HAT_NOSYNC, HAT_LOAD_LOCK); 38474ecdb51SJohn Levon } 38574ecdb51SJohn Levon } 38674ecdb51SJohn Levon 38774ecdb51SJohn Levon return (addr1); 38874ecdb51SJohn Levon } 38974ecdb51SJohn Levon 39074ecdb51SJohn Levon /* 39174ecdb51SJohn Levon * This routine is like page_numtopp, but accepts only free pages, which 39274ecdb51SJohn Levon * it allocates (unfrees) and returns with the exclusive lock held. 39374ecdb51SJohn Levon * It is used by machdep.c/dma_init() to find contiguous free pages. 39474ecdb51SJohn Levon */ 39574ecdb51SJohn Levon page_t * 39674ecdb51SJohn Levon page_numtopp_alloc(pfn_t pfnum) 39774ecdb51SJohn Levon { 39874ecdb51SJohn Levon page_t *pp; 39974ecdb51SJohn Levon 40074ecdb51SJohn Levon retry: 40174ecdb51SJohn Levon pp = page_numtopp_nolock(pfnum); 40274ecdb51SJohn Levon if (pp == NULL) { 40374ecdb51SJohn Levon return (NULL); 40474ecdb51SJohn Levon } 40574ecdb51SJohn Levon 40674ecdb51SJohn Levon if (!page_trylock(pp, SE_EXCL)) { 40774ecdb51SJohn Levon return (NULL); 40874ecdb51SJohn Levon } 40974ecdb51SJohn Levon 41074ecdb51SJohn Levon if (page_pptonum(pp) != pfnum) { 41174ecdb51SJohn Levon page_unlock(pp); 41274ecdb51SJohn Levon goto retry; 41374ecdb51SJohn Levon } 41474ecdb51SJohn Levon 41574ecdb51SJohn Levon if (!PP_ISFREE(pp)) { 41674ecdb51SJohn Levon page_unlock(pp); 41774ecdb51SJohn Levon return (NULL); 41874ecdb51SJohn Levon } 41974ecdb51SJohn Levon if (pp->p_szc) { 42074ecdb51SJohn Levon page_demote_free_pages(pp); 42174ecdb51SJohn Levon page_unlock(pp); 42274ecdb51SJohn Levon goto retry; 42374ecdb51SJohn Levon } 42474ecdb51SJohn Levon 42574ecdb51SJohn Levon /* If associated with a vnode, destroy mappings */ 42674ecdb51SJohn Levon 42774ecdb51SJohn Levon if (pp->p_vnode) { 42874ecdb51SJohn Levon 42974ecdb51SJohn Levon page_destroy_free(pp); 43074ecdb51SJohn Levon 43174ecdb51SJohn Levon if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) { 43274ecdb51SJohn Levon return (NULL); 43374ecdb51SJohn Levon } 43474ecdb51SJohn Levon 
43574ecdb51SJohn Levon if (page_pptonum(pp) != pfnum) { 43674ecdb51SJohn Levon page_unlock(pp); 43774ecdb51SJohn Levon goto retry; 43874ecdb51SJohn Levon } 43974ecdb51SJohn Levon } 44074ecdb51SJohn Levon 44174ecdb51SJohn Levon if (!PP_ISFREE(pp)) { 44274ecdb51SJohn Levon page_unlock(pp); 44374ecdb51SJohn Levon return (NULL); 44474ecdb51SJohn Levon } 44574ecdb51SJohn Levon 44674ecdb51SJohn Levon if (!page_reclaim(pp, (kmutex_t *)NULL)) 44774ecdb51SJohn Levon return (NULL); 44874ecdb51SJohn Levon 44974ecdb51SJohn Levon return (pp); 45074ecdb51SJohn Levon } 45174ecdb51SJohn Levon 4527c478bd9Sstevel@tonic-gate /* 4537c478bd9Sstevel@tonic-gate * Return the optimum page size for a given mapping 4547c478bd9Sstevel@tonic-gate */ 4557c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 4567c478bd9Sstevel@tonic-gate size_t 457ec25b48fSsusans map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 4587c478bd9Sstevel@tonic-gate { 459ec25b48fSsusans level_t l = 0; 460ec25b48fSsusans size_t pgsz = MMU_PAGESIZE; 461ec25b48fSsusans size_t max_lpsize; 462ec25b48fSsusans uint_t mszc; 4637c478bd9Sstevel@tonic-gate 464ec25b48fSsusans ASSERT(maptype != MAPPGSZ_VA); 4657c478bd9Sstevel@tonic-gate 466ec25b48fSsusans if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 467ec25b48fSsusans return (MMU_PAGESIZE); 468ec25b48fSsusans } 4697c478bd9Sstevel@tonic-gate 470ec25b48fSsusans switch (maptype) { 4717c478bd9Sstevel@tonic-gate case MAPPGSZ_HEAP: 472ec25b48fSsusans case MAPPGSZ_STK: 473ec25b48fSsusans max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 474ec25b48fSsusans MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 475ec25b48fSsusans if (max_lpsize == MMU_PAGESIZE) { 476ec25b48fSsusans return (MMU_PAGESIZE); 477ec25b48fSsusans } 478ec25b48fSsusans if (len == 0) { 479ec25b48fSsusans len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 480ec25b48fSsusans p->p_brksize - p->p_bssbase : p->p_stksize; 481ec25b48fSsusans } 482ec25b48fSsusans len = (maptype == MAPPGSZ_HEAP) ? 
MAX(len, 483ec25b48fSsusans default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 484ec25b48fSsusans 4857c478bd9Sstevel@tonic-gate /* 4867c478bd9Sstevel@tonic-gate * use the pages size that best fits len 4877c478bd9Sstevel@tonic-gate */ 48802bc52beSkchow for (l = mmu.umax_page_level; l > 0; --l) { 489ec25b48fSsusans if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 4907c478bd9Sstevel@tonic-gate continue; 491ec25b48fSsusans } else { 492ec25b48fSsusans pgsz = LEVEL_SIZE(l); 493ec25b48fSsusans } 4947c478bd9Sstevel@tonic-gate break; 4957c478bd9Sstevel@tonic-gate } 496ec25b48fSsusans 497ec25b48fSsusans mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc : 498ec25b48fSsusans p->p_stkpageszc); 499ec25b48fSsusans if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 500ec25b48fSsusans pgsz = hw_page_array[mszc].hp_size; 501ec25b48fSsusans } 502ec25b48fSsusans return (pgsz); 5037c478bd9Sstevel@tonic-gate 5047c478bd9Sstevel@tonic-gate case MAPPGSZ_ISM: 50502bc52beSkchow for (l = mmu.umax_page_level; l > 0; --l) { 50602bc52beSkchow if (len >= LEVEL_SIZE(l)) 50702bc52beSkchow return (LEVEL_SIZE(l)); 50802bc52beSkchow } 50902bc52beSkchow return (LEVEL_SIZE(0)); 5107c478bd9Sstevel@tonic-gate } 511ec25b48fSsusans return (pgsz); 5127c478bd9Sstevel@tonic-gate } 5137c478bd9Sstevel@tonic-gate 514ec25b48fSsusans static uint_t 515ec25b48fSsusans map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 516ec25b48fSsusans size_t min_physmem) 5177c478bd9Sstevel@tonic-gate { 518ec25b48fSsusans caddr_t eaddr = addr + size; 519ec25b48fSsusans uint_t szcvec = 0; 520ec25b48fSsusans caddr_t raddr; 521ec25b48fSsusans caddr_t readdr; 5227c478bd9Sstevel@tonic-gate size_t pgsz; 523ec25b48fSsusans int i; 5247c478bd9Sstevel@tonic-gate 525ec25b48fSsusans if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 5267c478bd9Sstevel@tonic-gate return (0); 5277c478bd9Sstevel@tonic-gate } 528ec25b48fSsusans 52902bc52beSkchow for (i = mmu_exported_page_sizes - 1; i > 0; i--) { 
530ec25b48fSsusans pgsz = page_get_pagesize(i); 531ec25b48fSsusans if (pgsz > max_lpsize) { 532ec25b48fSsusans continue; 533ec25b48fSsusans } 534ec25b48fSsusans raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 535ec25b48fSsusans readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 536ec25b48fSsusans if (raddr < addr || raddr >= readdr) { 537ec25b48fSsusans continue; 538ec25b48fSsusans } 539ec25b48fSsusans if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 540ec25b48fSsusans continue; 541ec25b48fSsusans } 542ec25b48fSsusans /* 543ec25b48fSsusans * Set szcvec to the remaining page sizes. 544ec25b48fSsusans */ 545ec25b48fSsusans szcvec = ((1 << (i + 1)) - 1) & ~1; 546ec25b48fSsusans break; 5477c478bd9Sstevel@tonic-gate } 548ec25b48fSsusans return (szcvec); 5497c478bd9Sstevel@tonic-gate } 5507c478bd9Sstevel@tonic-gate 551ec25b48fSsusans /* 552ec25b48fSsusans * Return a bit vector of large page size codes that 553ec25b48fSsusans * can be used to map [addr, addr + len) region. 554ec25b48fSsusans */ 555ec25b48fSsusans /*ARGSUSED*/ 55607b65a64Saguzovsk uint_t 557ec25b48fSsusans map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 558ec25b48fSsusans int memcntl) 55907b65a64Saguzovsk { 560ec25b48fSsusans size_t max_lpsize = mcntl0_lpsize; 56107b65a64Saguzovsk 562ec25b48fSsusans if (mmu.max_page_level == 0) 56307b65a64Saguzovsk return (0); 56407b65a64Saguzovsk 565ec25b48fSsusans if (flags & MAP_TEXT) { 566843e1988Sjohnlev if (!memcntl) 567843e1988Sjohnlev max_lpsize = max_utext_lpsize; 568843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 569ec25b48fSsusans shm_lpg_min_physmem)); 570ec25b48fSsusans 571ec25b48fSsusans } else if (flags & MAP_INITDATA) { 572843e1988Sjohnlev if (!memcntl) 573843e1988Sjohnlev max_lpsize = max_uidata_lpsize; 574843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 575ec25b48fSsusans privm_lpg_min_physmem)); 576ec25b48fSsusans 577ec25b48fSsusans } else if (type == MAPPGSZC_SHM) { 578843e1988Sjohnlev if 
(!memcntl) 579843e1988Sjohnlev max_lpsize = max_shm_lpsize; 580843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 581ec25b48fSsusans shm_lpg_min_physmem)); 582ec25b48fSsusans 583ec25b48fSsusans } else if (type == MAPPGSZC_HEAP) { 584843e1988Sjohnlev if (!memcntl) 585843e1988Sjohnlev max_lpsize = max_uheap_lpsize; 586843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 587ec25b48fSsusans privm_lpg_min_physmem)); 588ec25b48fSsusans 589ec25b48fSsusans } else if (type == MAPPGSZC_STACK) { 590843e1988Sjohnlev if (!memcntl) 591843e1988Sjohnlev max_lpsize = max_ustack_lpsize; 592843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 593ec25b48fSsusans privm_lpg_min_physmem)); 594ec25b48fSsusans 595ec25b48fSsusans } else { 596843e1988Sjohnlev if (!memcntl) 597843e1988Sjohnlev max_lpsize = max_privmap_lpsize; 598843e1988Sjohnlev return (map_szcvec(addr, size, off, max_lpsize, 599ec25b48fSsusans privm_lpg_min_physmem)); 60007b65a64Saguzovsk } 60107b65a64Saguzovsk } 60207b65a64Saguzovsk 6037c478bd9Sstevel@tonic-gate /* 6047c478bd9Sstevel@tonic-gate * Handle a pagefault. 
6057c478bd9Sstevel@tonic-gate */ 6067c478bd9Sstevel@tonic-gate faultcode_t 6077c478bd9Sstevel@tonic-gate pagefault( 6087c478bd9Sstevel@tonic-gate caddr_t addr, 6097c478bd9Sstevel@tonic-gate enum fault_type type, 6107c478bd9Sstevel@tonic-gate enum seg_rw rw, 6117c478bd9Sstevel@tonic-gate int iskernel) 6127c478bd9Sstevel@tonic-gate { 6137c478bd9Sstevel@tonic-gate struct as *as; 6147c478bd9Sstevel@tonic-gate struct hat *hat; 6157c478bd9Sstevel@tonic-gate struct proc *p; 6167c478bd9Sstevel@tonic-gate kthread_t *t; 6177c478bd9Sstevel@tonic-gate faultcode_t res; 6187c478bd9Sstevel@tonic-gate caddr_t base; 6197c478bd9Sstevel@tonic-gate size_t len; 6207c478bd9Sstevel@tonic-gate int err; 6217c478bd9Sstevel@tonic-gate int mapped_red; 6227c478bd9Sstevel@tonic-gate uintptr_t ea; 6237c478bd9Sstevel@tonic-gate 6247c478bd9Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 6257c478bd9Sstevel@tonic-gate 6267c478bd9Sstevel@tonic-gate if (INVALID_VADDR(addr)) 6277c478bd9Sstevel@tonic-gate return (FC_NOMAP); 6287c478bd9Sstevel@tonic-gate 6297c478bd9Sstevel@tonic-gate mapped_red = segkp_map_red(); 6307c478bd9Sstevel@tonic-gate 6317c478bd9Sstevel@tonic-gate if (iskernel) { 6327c478bd9Sstevel@tonic-gate as = &kas; 6337c478bd9Sstevel@tonic-gate hat = as->a_hat; 6347c478bd9Sstevel@tonic-gate } else { 6357c478bd9Sstevel@tonic-gate t = curthread; 6367c478bd9Sstevel@tonic-gate p = ttoproc(t); 6377c478bd9Sstevel@tonic-gate as = p->p_as; 6387c478bd9Sstevel@tonic-gate hat = as->a_hat; 6397c478bd9Sstevel@tonic-gate } 6407c478bd9Sstevel@tonic-gate 6417c478bd9Sstevel@tonic-gate /* 6427c478bd9Sstevel@tonic-gate * Dispatch pagefault. 6437c478bd9Sstevel@tonic-gate */ 6447c478bd9Sstevel@tonic-gate res = as_fault(hat, as, addr, 1, type, rw); 6457c478bd9Sstevel@tonic-gate 6467c478bd9Sstevel@tonic-gate /* 6477c478bd9Sstevel@tonic-gate * If this isn't a potential unmapped hole in the user's 6487c478bd9Sstevel@tonic-gate * UNIX data or stack segments, just return status info. 
6497c478bd9Sstevel@tonic-gate */ 6507c478bd9Sstevel@tonic-gate if (res != FC_NOMAP || iskernel) 6517c478bd9Sstevel@tonic-gate goto out; 6527c478bd9Sstevel@tonic-gate 6537c478bd9Sstevel@tonic-gate /* 6547c478bd9Sstevel@tonic-gate * Check to see if we happened to faulted on a currently unmapped 6557c478bd9Sstevel@tonic-gate * part of the UNIX data or stack segments. If so, create a zfod 6567c478bd9Sstevel@tonic-gate * mapping there and then try calling the fault routine again. 6577c478bd9Sstevel@tonic-gate */ 6587c478bd9Sstevel@tonic-gate base = p->p_brkbase; 6597c478bd9Sstevel@tonic-gate len = p->p_brksize; 6607c478bd9Sstevel@tonic-gate 6617c478bd9Sstevel@tonic-gate if (addr < base || addr >= base + len) { /* data seg? */ 6627c478bd9Sstevel@tonic-gate base = (caddr_t)p->p_usrstack - p->p_stksize; 6637c478bd9Sstevel@tonic-gate len = p->p_stksize; 6647c478bd9Sstevel@tonic-gate if (addr < base || addr >= p->p_usrstack) { /* stack seg? */ 6657c478bd9Sstevel@tonic-gate /* not in either UNIX data or stack segments */ 6667c478bd9Sstevel@tonic-gate res = FC_NOMAP; 6677c478bd9Sstevel@tonic-gate goto out; 6687c478bd9Sstevel@tonic-gate } 6697c478bd9Sstevel@tonic-gate } 6707c478bd9Sstevel@tonic-gate 6717c478bd9Sstevel@tonic-gate /* 6727c478bd9Sstevel@tonic-gate * the rest of this function implements a 3.X 4.X 5.X compatibility 6737c478bd9Sstevel@tonic-gate * This code is probably not needed anymore 6747c478bd9Sstevel@tonic-gate */ 6757c478bd9Sstevel@tonic-gate if (p->p_model == DATAMODEL_ILP32) { 6767c478bd9Sstevel@tonic-gate 6777c478bd9Sstevel@tonic-gate /* expand the gap to the page boundaries on each side */ 6787c478bd9Sstevel@tonic-gate ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE); 6797c478bd9Sstevel@tonic-gate base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE); 6807c478bd9Sstevel@tonic-gate len = ea - (uintptr_t)base; 6817c478bd9Sstevel@tonic-gate 6827c478bd9Sstevel@tonic-gate as_rangelock(as); 6837c478bd9Sstevel@tonic-gate if (as_gap(as, MMU_PAGESIZE, &base, 
&len, AH_CONTAIN, addr) == 6847c478bd9Sstevel@tonic-gate 0) { 6857c478bd9Sstevel@tonic-gate err = as_map(as, base, len, segvn_create, zfod_argsp); 6867c478bd9Sstevel@tonic-gate as_rangeunlock(as); 6877c478bd9Sstevel@tonic-gate if (err) { 6887c478bd9Sstevel@tonic-gate res = FC_MAKE_ERR(err); 6897c478bd9Sstevel@tonic-gate goto out; 6907c478bd9Sstevel@tonic-gate } 6917c478bd9Sstevel@tonic-gate } else { 6927c478bd9Sstevel@tonic-gate /* 6937c478bd9Sstevel@tonic-gate * This page is already mapped by another thread after 6947c478bd9Sstevel@tonic-gate * we returned from as_fault() above. We just fall 6957c478bd9Sstevel@tonic-gate * through as_fault() below. 6967c478bd9Sstevel@tonic-gate */ 6977c478bd9Sstevel@tonic-gate as_rangeunlock(as); 6987c478bd9Sstevel@tonic-gate } 6997c478bd9Sstevel@tonic-gate 7007c478bd9Sstevel@tonic-gate res = as_fault(hat, as, addr, 1, F_INVAL, rw); 7017c478bd9Sstevel@tonic-gate } 7027c478bd9Sstevel@tonic-gate 7037c478bd9Sstevel@tonic-gate out: 7047c478bd9Sstevel@tonic-gate if (mapped_red) 7057c478bd9Sstevel@tonic-gate segkp_unmap_red(); 7067c478bd9Sstevel@tonic-gate 7077c478bd9Sstevel@tonic-gate return (res); 7087c478bd9Sstevel@tonic-gate } 7097c478bd9Sstevel@tonic-gate 7107c478bd9Sstevel@tonic-gate void 7117c478bd9Sstevel@tonic-gate map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 7127c478bd9Sstevel@tonic-gate { 7137c478bd9Sstevel@tonic-gate struct proc *p = curproc; 7147c478bd9Sstevel@tonic-gate caddr_t userlimit = (flags & _MAP_LOW32) ? 
7157c478bd9Sstevel@tonic-gate (caddr_t)_userlimit32 : p->p_as->a_userlimit; 7167c478bd9Sstevel@tonic-gate 7177c478bd9Sstevel@tonic-gate map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); 7187c478bd9Sstevel@tonic-gate } 7197c478bd9Sstevel@tonic-gate 7207c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 7217c478bd9Sstevel@tonic-gate int 7227c478bd9Sstevel@tonic-gate map_addr_vacalign_check(caddr_t addr, u_offset_t off) 7237c478bd9Sstevel@tonic-gate { 7247c478bd9Sstevel@tonic-gate return (0); 7257c478bd9Sstevel@tonic-gate } 7267c478bd9Sstevel@tonic-gate 727d2a70789SRichard Lowe /* 728d2a70789SRichard Lowe * The maximum amount a randomized mapping will be slewed. We should perhaps 729d2a70789SRichard Lowe * arrange things so these tunables can be separate for mmap, mmapobj, and 730d2a70789SRichard Lowe * ld.so 731d2a70789SRichard Lowe */ 732d2a70789SRichard Lowe size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */ 733d2a70789SRichard Lowe 7347c478bd9Sstevel@tonic-gate /* 7357c478bd9Sstevel@tonic-gate * map_addr_proc() is the routine called when the system is to 7367c478bd9Sstevel@tonic-gate * choose an address for the user. We will pick an address 737ae115bc7Smrj * range which is the highest available below userlimit. 7387c478bd9Sstevel@tonic-gate * 73946ab9534Smec * Every mapping will have a redzone of a single page on either side of 74046ab9534Smec * the request. This is done to leave one page unmapped between segments. 74146ab9534Smec * This is not required, but it's useful for the user because if their 74246ab9534Smec * program strays across a segment boundary, it will catch a fault 74346ab9534Smec * immediately making debugging a little easier. Currently the redzone 74446ab9534Smec * is mandatory. 74546ab9534Smec * 7467c478bd9Sstevel@tonic-gate * addrp is a value/result parameter. 7477c478bd9Sstevel@tonic-gate * On input it is a hint from the user to be used in a completely 7487c478bd9Sstevel@tonic-gate * machine dependent fashion. 
We decide to completely ignore this hint. 74946ab9534Smec * If MAP_ALIGN was specified, addrp contains the minimal alignment, which 75046ab9534Smec * must be some "power of two" multiple of pagesize. 7517c478bd9Sstevel@tonic-gate * 7527c478bd9Sstevel@tonic-gate * On output it is NULL if no address can be found in the current 7537c478bd9Sstevel@tonic-gate * processes address space or else an address that is currently 7547c478bd9Sstevel@tonic-gate * not mapped for len bytes with a page of red zone on either side. 7557c478bd9Sstevel@tonic-gate * 75646ab9534Smec * vacalign is not needed on x86 (it's for viturally addressed caches) 7577c478bd9Sstevel@tonic-gate */ 7587c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 7597c478bd9Sstevel@tonic-gate void 7607c478bd9Sstevel@tonic-gate map_addr_proc( 7617c478bd9Sstevel@tonic-gate caddr_t *addrp, 7627c478bd9Sstevel@tonic-gate size_t len, 7637c478bd9Sstevel@tonic-gate offset_t off, 7647c478bd9Sstevel@tonic-gate int vacalign, 7657c478bd9Sstevel@tonic-gate caddr_t userlimit, 7667c478bd9Sstevel@tonic-gate struct proc *p, 7677c478bd9Sstevel@tonic-gate uint_t flags) 7687c478bd9Sstevel@tonic-gate { 7697c478bd9Sstevel@tonic-gate struct as *as = p->p_as; 7707c478bd9Sstevel@tonic-gate caddr_t addr; 7717c478bd9Sstevel@tonic-gate caddr_t base; 7727c478bd9Sstevel@tonic-gate size_t slen; 7737c478bd9Sstevel@tonic-gate size_t align_amount; 7747c478bd9Sstevel@tonic-gate 7757c478bd9Sstevel@tonic-gate ASSERT32(userlimit == as->a_userlimit); 7767c478bd9Sstevel@tonic-gate 7777c478bd9Sstevel@tonic-gate base = p->p_brkbase; 7787c478bd9Sstevel@tonic-gate #if defined(__amd64) 7797c478bd9Sstevel@tonic-gate if (p->p_model == DATAMODEL_NATIVE) { 7807c478bd9Sstevel@tonic-gate if (userlimit < as->a_userlimit) { 7817c478bd9Sstevel@tonic-gate /* 7827c478bd9Sstevel@tonic-gate * This happens when a program wants to map 7837c478bd9Sstevel@tonic-gate * something in a range that's accessible to a 7847c478bd9Sstevel@tonic-gate * program in a smaller address space. 
For example, 7857c478bd9Sstevel@tonic-gate * a 64-bit program calling mmap32(2) to guarantee 7867c478bd9Sstevel@tonic-gate * that the returned address is below 4Gbytes. 7877c478bd9Sstevel@tonic-gate */ 7887c478bd9Sstevel@tonic-gate ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 7897c478bd9Sstevel@tonic-gate 7907c478bd9Sstevel@tonic-gate if (userlimit > base) 7917c478bd9Sstevel@tonic-gate slen = userlimit - base; 7927c478bd9Sstevel@tonic-gate else { 7937c478bd9Sstevel@tonic-gate *addrp = NULL; 7947c478bd9Sstevel@tonic-gate return; 7957c478bd9Sstevel@tonic-gate } 7967c478bd9Sstevel@tonic-gate } else { 7977c478bd9Sstevel@tonic-gate /* 798284ce987SPatrick Mooney * With the stack positioned at a higher address than 799284ce987SPatrick Mooney * the heap for 64-bit processes, it is necessary to be 800284ce987SPatrick Mooney * mindful of its location and potential size. 801284ce987SPatrick Mooney * 802284ce987SPatrick Mooney * Unallocated space above the top of the stack (that 803284ce987SPatrick Mooney * is, at a lower address) but still within the bounds 804284ce987SPatrick Mooney * of the stack limit should be considered unavailable. 805284ce987SPatrick Mooney * 806284ce987SPatrick Mooney * As the 64-bit stack guard is mapped in immediately 807284ce987SPatrick Mooney * adjacent to the stack limit boundary, this prevents 808284ce987SPatrick Mooney * new mappings from having accidentally dangerous 809284ce987SPatrick Mooney * proximity to the stack. 
8107c478bd9Sstevel@tonic-gate */ 8117c478bd9Sstevel@tonic-gate slen = p->p_usrstack - base - 8121e1e1eecSMichael Corcoran ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK); 8137c478bd9Sstevel@tonic-gate } 8147c478bd9Sstevel@tonic-gate } else 815284ce987SPatrick Mooney #endif /* defined(__amd64) */ 8167c478bd9Sstevel@tonic-gate slen = userlimit - base; 8177c478bd9Sstevel@tonic-gate 81846ab9534Smec /* Make len be a multiple of PAGESIZE */ 8197c478bd9Sstevel@tonic-gate len = (len + PAGEOFFSET) & PAGEMASK; 8207c478bd9Sstevel@tonic-gate 8217c478bd9Sstevel@tonic-gate /* 8227c478bd9Sstevel@tonic-gate * figure out what the alignment should be 8237c478bd9Sstevel@tonic-gate * 8247c478bd9Sstevel@tonic-gate * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 8257c478bd9Sstevel@tonic-gate */ 8267c478bd9Sstevel@tonic-gate if (len <= ELF_386_MAXPGSZ) { 8277c478bd9Sstevel@tonic-gate /* 8287c478bd9Sstevel@tonic-gate * Align virtual addresses to ensure that ELF shared libraries 8297c478bd9Sstevel@tonic-gate * are mapped with the appropriate alignment constraints by 8307c478bd9Sstevel@tonic-gate * the run-time linker. 8317c478bd9Sstevel@tonic-gate */ 8327c478bd9Sstevel@tonic-gate align_amount = ELF_386_MAXPGSZ; 8337c478bd9Sstevel@tonic-gate } else { 834534f2768SSudheer A /* 835534f2768SSudheer A * For 32-bit processes, only those which have specified 836534f2768SSudheer A * MAP_ALIGN and an addr will be aligned on a larger page size. 837534f2768SSudheer A * Not doing so can potentially waste up to 1G of process 838534f2768SSudheer A * address space. 839534f2768SSudheer A */ 840534f2768SSudheer A int lvl = (p->p_model == DATAMODEL_ILP32) ? 
1 : 841534f2768SSudheer A mmu.umax_page_level; 8427c478bd9Sstevel@tonic-gate 843534f2768SSudheer A while (lvl && len < LEVEL_SIZE(lvl)) 844534f2768SSudheer A --lvl; 8457c478bd9Sstevel@tonic-gate 846534f2768SSudheer A align_amount = LEVEL_SIZE(lvl); 8477c478bd9Sstevel@tonic-gate } 8487c478bd9Sstevel@tonic-gate if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 8497c478bd9Sstevel@tonic-gate align_amount = (uintptr_t)*addrp; 8507c478bd9Sstevel@tonic-gate 85146ab9534Smec ASSERT(ISP2(align_amount)); 85246ab9534Smec ASSERT(align_amount == 0 || align_amount >= PAGESIZE); 8537c478bd9Sstevel@tonic-gate 85446ab9534Smec off = off & (align_amount - 1); 855d2a70789SRichard Lowe 8567c478bd9Sstevel@tonic-gate /* 8577c478bd9Sstevel@tonic-gate * Look for a large enough hole starting below userlimit. 85846ab9534Smec * After finding it, use the upper part. 8597c478bd9Sstevel@tonic-gate */ 86046ab9534Smec if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount, 86146ab9534Smec PAGESIZE, off) == 0) { 8627c478bd9Sstevel@tonic-gate caddr_t as_addr; 8637c478bd9Sstevel@tonic-gate 86446ab9534Smec /* 86546ab9534Smec * addr is the highest possible address to use since we have 86646ab9534Smec * a PAGESIZE redzone at the beginning and end. 86746ab9534Smec */ 86846ab9534Smec addr = base + slen - (PAGESIZE + len); 8697c478bd9Sstevel@tonic-gate as_addr = addr; 8707c478bd9Sstevel@tonic-gate /* 87146ab9534Smec * Round address DOWN to the alignment amount and 87246ab9534Smec * add the offset in. 87346ab9534Smec * If addr is greater than as_addr, len would not be large 87446ab9534Smec * enough to include the redzone, so we must adjust down 87546ab9534Smec * by the alignment amount. 
8767c478bd9Sstevel@tonic-gate */ 8777c478bd9Sstevel@tonic-gate addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 87846ab9534Smec addr += (uintptr_t)off; 87946ab9534Smec if (addr > as_addr) { 88046ab9534Smec addr -= align_amount; 88146ab9534Smec } 8827c478bd9Sstevel@tonic-gate 883d2a70789SRichard Lowe /* 884d2a70789SRichard Lowe * If randomization is requested, slew the allocation 885d2a70789SRichard Lowe * backwards, within the same gap, by a random amount. 886d2a70789SRichard Lowe */ 887d2a70789SRichard Lowe if (flags & _MAP_RANDOMIZE) { 888d2a70789SRichard Lowe uint32_t slew; 889d2a70789SRichard Lowe 890d2a70789SRichard Lowe (void) random_get_pseudo_bytes((uint8_t *)&slew, 891d2a70789SRichard Lowe sizeof (slew)); 892d2a70789SRichard Lowe 893d2a70789SRichard Lowe slew = slew % MIN(aslr_max_map_skew, (addr - base)); 894d2a70789SRichard Lowe addr -= P2ALIGN(slew, align_amount); 895d2a70789SRichard Lowe } 896d2a70789SRichard Lowe 89746ab9534Smec ASSERT(addr > base); 89846ab9534Smec ASSERT(addr + len < base + slen); 8997c478bd9Sstevel@tonic-gate ASSERT(((uintptr_t)addr & (align_amount - 1)) == 90046ab9534Smec ((uintptr_t)(off))); 9017c478bd9Sstevel@tonic-gate *addrp = addr; 9027c478bd9Sstevel@tonic-gate } else { 9037c478bd9Sstevel@tonic-gate *addrp = NULL; /* no more virtual space */ 9047c478bd9Sstevel@tonic-gate } 9057c478bd9Sstevel@tonic-gate } 9067c478bd9Sstevel@tonic-gate 90746ab9534Smec int valid_va_range_aligned_wraparound; 90846ab9534Smec 9097c478bd9Sstevel@tonic-gate /* 91046ab9534Smec * Determine whether [*basep, *basep + *lenp) contains a mappable range of 91146ab9534Smec * addresses at least "minlen" long, where the base of the range is at "off" 91246ab9534Smec * phase from an "align" boundary and there is space for a "redzone"-sized 91346ab9534Smec * redzone on either side of the range. On success, 1 is returned and *basep 91446ab9534Smec * and *lenp are adjusted to describe the acceptable range (including 91546ab9534Smec * the redzone). 
On failure, 0 is returned. 9167c478bd9Sstevel@tonic-gate */ 9177c478bd9Sstevel@tonic-gate /*ARGSUSED3*/ 9187c478bd9Sstevel@tonic-gate int 91946ab9534Smec valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir, 92046ab9534Smec size_t align, size_t redzone, size_t off) 9217c478bd9Sstevel@tonic-gate { 9227c478bd9Sstevel@tonic-gate uintptr_t hi, lo; 92346ab9534Smec size_t tot_len; 92446ab9534Smec 92546ab9534Smec ASSERT(align == 0 ? off == 0 : off < align); 92646ab9534Smec ASSERT(ISP2(align)); 92746ab9534Smec ASSERT(align == 0 || align >= PAGESIZE); 9287c478bd9Sstevel@tonic-gate 9297c478bd9Sstevel@tonic-gate lo = (uintptr_t)*basep; 9307c478bd9Sstevel@tonic-gate hi = lo + *lenp; 93146ab9534Smec tot_len = minlen + 2 * redzone; /* need at least this much space */ 9327c478bd9Sstevel@tonic-gate 9337c478bd9Sstevel@tonic-gate /* 9347c478bd9Sstevel@tonic-gate * If hi rolled over the top, try cutting back. 9357c478bd9Sstevel@tonic-gate */ 9367c478bd9Sstevel@tonic-gate if (hi < lo) { 93746ab9534Smec *lenp = 0UL - lo - 1UL; 93846ab9534Smec /* See if this really happens. If so, then we figure out why */ 93946ab9534Smec valid_va_range_aligned_wraparound++; 94046ab9534Smec hi = lo + *lenp; 94146ab9534Smec } 94246ab9534Smec if (*lenp < tot_len) { 9437c478bd9Sstevel@tonic-gate return (0); 9447c478bd9Sstevel@tonic-gate } 94546ab9534Smec 9467c478bd9Sstevel@tonic-gate #if defined(__amd64) 9477c478bd9Sstevel@tonic-gate /* 9487c478bd9Sstevel@tonic-gate * Deal with a possible hole in the address range between 9497c478bd9Sstevel@tonic-gate * hole_start and hole_end that should never be mapped. 
9507c478bd9Sstevel@tonic-gate */ 9517c478bd9Sstevel@tonic-gate if (lo < hole_start) { 9527c478bd9Sstevel@tonic-gate if (hi > hole_start) { 9537c478bd9Sstevel@tonic-gate if (hi < hole_end) { 9547c478bd9Sstevel@tonic-gate hi = hole_start; 9557c478bd9Sstevel@tonic-gate } else { 9567c478bd9Sstevel@tonic-gate /* lo < hole_start && hi >= hole_end */ 9577c478bd9Sstevel@tonic-gate if (dir == AH_LO) { 9587c478bd9Sstevel@tonic-gate /* 9597c478bd9Sstevel@tonic-gate * prefer lowest range 9607c478bd9Sstevel@tonic-gate */ 96146ab9534Smec if (hole_start - lo >= tot_len) 9627c478bd9Sstevel@tonic-gate hi = hole_start; 96346ab9534Smec else if (hi - hole_end >= tot_len) 9647c478bd9Sstevel@tonic-gate lo = hole_end; 9657c478bd9Sstevel@tonic-gate else 9667c478bd9Sstevel@tonic-gate return (0); 9677c478bd9Sstevel@tonic-gate } else { 9687c478bd9Sstevel@tonic-gate /* 9697c478bd9Sstevel@tonic-gate * prefer highest range 9707c478bd9Sstevel@tonic-gate */ 97146ab9534Smec if (hi - hole_end >= tot_len) 9727c478bd9Sstevel@tonic-gate lo = hole_end; 97346ab9534Smec else if (hole_start - lo >= tot_len) 9747c478bd9Sstevel@tonic-gate hi = hole_start; 9757c478bd9Sstevel@tonic-gate else 9767c478bd9Sstevel@tonic-gate return (0); 9777c478bd9Sstevel@tonic-gate } 9787c478bd9Sstevel@tonic-gate } 9797c478bd9Sstevel@tonic-gate } 9807c478bd9Sstevel@tonic-gate } else { 9817c478bd9Sstevel@tonic-gate /* lo >= hole_start */ 9827c478bd9Sstevel@tonic-gate if (hi < hole_end) 9837c478bd9Sstevel@tonic-gate return (0); 9847c478bd9Sstevel@tonic-gate if (lo < hole_end) 9857c478bd9Sstevel@tonic-gate lo = hole_end; 9867c478bd9Sstevel@tonic-gate } 98746ab9534Smec #endif 9887c478bd9Sstevel@tonic-gate 98946ab9534Smec if (hi - lo < tot_len) 9907c478bd9Sstevel@tonic-gate return (0); 9917c478bd9Sstevel@tonic-gate 99246ab9534Smec if (align > 1) { 99346ab9534Smec uintptr_t tlo = lo + redzone; 99446ab9534Smec uintptr_t thi = hi - redzone; 99546ab9534Smec tlo = (uintptr_t)P2PHASEUP(tlo, align, off); 99646ab9534Smec if (tlo < lo + 
redzone) { 99746ab9534Smec return (0); 99846ab9534Smec } 99946ab9534Smec if (thi < tlo || thi - tlo < minlen) { 100046ab9534Smec return (0); 100146ab9534Smec } 100246ab9534Smec } 100346ab9534Smec 10047c478bd9Sstevel@tonic-gate *basep = (caddr_t)lo; 10057c478bd9Sstevel@tonic-gate *lenp = hi - lo; 10067c478bd9Sstevel@tonic-gate return (1); 10077c478bd9Sstevel@tonic-gate } 10087c478bd9Sstevel@tonic-gate 100946ab9534Smec /* 101046ab9534Smec * Determine whether [*basep, *basep + *lenp) contains a mappable range of 101146ab9534Smec * addresses at least "minlen" long. On success, 1 is returned and *basep 101246ab9534Smec * and *lenp are adjusted to describe the acceptable range. On failure, 0 101346ab9534Smec * is returned. 101446ab9534Smec */ 101546ab9534Smec int 101646ab9534Smec valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 101746ab9534Smec { 101846ab9534Smec return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 101946ab9534Smec } 102046ab9534Smec 1021d2a70789SRichard Lowe /* 1022d2a70789SRichard Lowe * Default to forbidding the first 64k of address space. This protects most 1023d2a70789SRichard Lowe * reasonably sized structures from dereferences through NULL: 1024d2a70789SRichard Lowe * ((foo_t *)0)->bar 1025d2a70789SRichard Lowe */ 1026d2a70789SRichard Lowe uintptr_t forbidden_null_mapping_sz = 0x10000; 1027d2a70789SRichard Lowe 10287c478bd9Sstevel@tonic-gate /* 10297c478bd9Sstevel@tonic-gate * Determine whether [addr, addr+len] are valid user addresses. 
10307c478bd9Sstevel@tonic-gate */ 10317c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 10327c478bd9Sstevel@tonic-gate int 10337c478bd9Sstevel@tonic-gate valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 10347c478bd9Sstevel@tonic-gate caddr_t userlimit) 10357c478bd9Sstevel@tonic-gate { 10367c478bd9Sstevel@tonic-gate caddr_t eaddr = addr + len; 10377c478bd9Sstevel@tonic-gate 10387c478bd9Sstevel@tonic-gate if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 10397c478bd9Sstevel@tonic-gate return (RANGE_BADADDR); 10407c478bd9Sstevel@tonic-gate 1041d2a70789SRichard Lowe if ((addr <= (caddr_t)forbidden_null_mapping_sz) && 1042a02406b9SPatrick Mooney as->a_proc != NULL && 1043d2a70789SRichard Lowe secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP)) 1044d2a70789SRichard Lowe return (RANGE_BADADDR); 1045d2a70789SRichard Lowe 10467c478bd9Sstevel@tonic-gate #if defined(__amd64) 10477c478bd9Sstevel@tonic-gate /* 10487c478bd9Sstevel@tonic-gate * Check for the VA hole 10497c478bd9Sstevel@tonic-gate */ 10507c478bd9Sstevel@tonic-gate if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 10517c478bd9Sstevel@tonic-gate return (RANGE_BADADDR); 10527c478bd9Sstevel@tonic-gate #endif 10537c478bd9Sstevel@tonic-gate 10547c478bd9Sstevel@tonic-gate return (RANGE_OKAY); 10557c478bd9Sstevel@tonic-gate } 10567c478bd9Sstevel@tonic-gate 10577c478bd9Sstevel@tonic-gate /* 10587c478bd9Sstevel@tonic-gate * Return 1 if the page frame is onboard memory, else 0. 
10597c478bd9Sstevel@tonic-gate */ 10607c478bd9Sstevel@tonic-gate int 10617c478bd9Sstevel@tonic-gate pf_is_memory(pfn_t pf) 10627c478bd9Sstevel@tonic-gate { 1063ae115bc7Smrj if (pfn_is_foreign(pf)) 1064ae115bc7Smrj return (0); 1065ae115bc7Smrj return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 10667c478bd9Sstevel@tonic-gate } 10677c478bd9Sstevel@tonic-gate 10687c478bd9Sstevel@tonic-gate /* 10697c478bd9Sstevel@tonic-gate * return the memrange containing pfn 10707c478bd9Sstevel@tonic-gate */ 10717c478bd9Sstevel@tonic-gate int 10727c478bd9Sstevel@tonic-gate memrange_num(pfn_t pfn) 10737c478bd9Sstevel@tonic-gate { 10747c478bd9Sstevel@tonic-gate int n; 10757c478bd9Sstevel@tonic-gate 10767c478bd9Sstevel@tonic-gate for (n = 0; n < nranges - 1; ++n) { 10777c478bd9Sstevel@tonic-gate if (pfn >= memranges[n]) 10787c478bd9Sstevel@tonic-gate break; 10797c478bd9Sstevel@tonic-gate } 10807c478bd9Sstevel@tonic-gate return (n); 10817c478bd9Sstevel@tonic-gate } 10827c478bd9Sstevel@tonic-gate 10837c478bd9Sstevel@tonic-gate /* 10847c478bd9Sstevel@tonic-gate * return the mnoderange containing pfn 10857c478bd9Sstevel@tonic-gate */ 1086843e1988Sjohnlev /*ARGSUSED*/ 10877c478bd9Sstevel@tonic-gate int 10887c478bd9Sstevel@tonic-gate pfn_2_mtype(pfn_t pfn) 10897c478bd9Sstevel@tonic-gate { 1090843e1988Sjohnlev #if defined(__xpv) 1091843e1988Sjohnlev return (0); 1092843e1988Sjohnlev #else 10937c478bd9Sstevel@tonic-gate int n; 10947c478bd9Sstevel@tonic-gate 1095a3114836SGerry Liu /* Always start from highest pfn and work our way down */ 1096a3114836SGerry Liu for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { 10977c478bd9Sstevel@tonic-gate if (pfn >= mnoderanges[n].mnr_pfnlo) { 10987c478bd9Sstevel@tonic-gate break; 10997c478bd9Sstevel@tonic-gate } 11007c478bd9Sstevel@tonic-gate } 11017c478bd9Sstevel@tonic-gate return (n); 1102843e1988Sjohnlev #endif 11037c478bd9Sstevel@tonic-gate } 11047c478bd9Sstevel@tonic-gate 1105843e1988Sjohnlev #if !defined(__xpv) 
11067c478bd9Sstevel@tonic-gate /* 11077c478bd9Sstevel@tonic-gate * is_contigpage_free: 11087c478bd9Sstevel@tonic-gate * returns a page list of contiguous pages. It minimally has to return 11097c478bd9Sstevel@tonic-gate * minctg pages. Caller determines minctg based on the scatter-gather 11107c478bd9Sstevel@tonic-gate * list length. 11117c478bd9Sstevel@tonic-gate * 11127c478bd9Sstevel@tonic-gate * pfnp is set to the next page frame to search on return. 11137c478bd9Sstevel@tonic-gate */ 11147c478bd9Sstevel@tonic-gate static page_t * 11157c478bd9Sstevel@tonic-gate is_contigpage_free( 11167c478bd9Sstevel@tonic-gate pfn_t *pfnp, 11177c478bd9Sstevel@tonic-gate pgcnt_t *pgcnt, 11187c478bd9Sstevel@tonic-gate pgcnt_t minctg, 11197c478bd9Sstevel@tonic-gate uint64_t pfnseg, 11207c478bd9Sstevel@tonic-gate int iolock) 11217c478bd9Sstevel@tonic-gate { 11227c478bd9Sstevel@tonic-gate int i = 0; 11237c478bd9Sstevel@tonic-gate pfn_t pfn = *pfnp; 11247c478bd9Sstevel@tonic-gate page_t *pp; 11257c478bd9Sstevel@tonic-gate page_t *plist = NULL; 11267c478bd9Sstevel@tonic-gate 11277c478bd9Sstevel@tonic-gate /* 11287c478bd9Sstevel@tonic-gate * fail if pfn + minctg crosses a segment boundary. 11297c478bd9Sstevel@tonic-gate * Adjust for next starting pfn to begin at segment boundary. 
11307c478bd9Sstevel@tonic-gate */ 11317c478bd9Sstevel@tonic-gate 11327c478bd9Sstevel@tonic-gate if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 11337c478bd9Sstevel@tonic-gate *pfnp = roundup(*pfnp, pfnseg + 1); 11347c478bd9Sstevel@tonic-gate return (NULL); 11357c478bd9Sstevel@tonic-gate } 11367c478bd9Sstevel@tonic-gate 11377c478bd9Sstevel@tonic-gate do { 11387c478bd9Sstevel@tonic-gate retry: 11397c478bd9Sstevel@tonic-gate pp = page_numtopp_nolock(pfn + i); 11401f84c0d7SDave Plauger if ((pp == NULL) || IS_DUMP_PAGE(pp) || 11417c478bd9Sstevel@tonic-gate (page_trylock(pp, SE_EXCL) == 0)) { 11427c478bd9Sstevel@tonic-gate (*pfnp)++; 11437c478bd9Sstevel@tonic-gate break; 11447c478bd9Sstevel@tonic-gate } 11457c478bd9Sstevel@tonic-gate if (page_pptonum(pp) != pfn + i) { 11467c478bd9Sstevel@tonic-gate page_unlock(pp); 11477c478bd9Sstevel@tonic-gate goto retry; 11487c478bd9Sstevel@tonic-gate } 11497c478bd9Sstevel@tonic-gate 11507c478bd9Sstevel@tonic-gate if (!(PP_ISFREE(pp))) { 11517c478bd9Sstevel@tonic-gate page_unlock(pp); 11527c478bd9Sstevel@tonic-gate (*pfnp)++; 11537c478bd9Sstevel@tonic-gate break; 11547c478bd9Sstevel@tonic-gate } 11557c478bd9Sstevel@tonic-gate 11567c478bd9Sstevel@tonic-gate if (!PP_ISAGED(pp)) { 11577c478bd9Sstevel@tonic-gate page_list_sub(pp, PG_CACHE_LIST); 11587c478bd9Sstevel@tonic-gate page_hashout(pp, (kmutex_t *)NULL); 11597c478bd9Sstevel@tonic-gate } else { 11607c478bd9Sstevel@tonic-gate page_list_sub(pp, PG_FREE_LIST); 11617c478bd9Sstevel@tonic-gate } 11627c478bd9Sstevel@tonic-gate 11637c478bd9Sstevel@tonic-gate if (iolock) 11647c478bd9Sstevel@tonic-gate page_io_lock(pp); 11657c478bd9Sstevel@tonic-gate page_list_concat(&plist, &pp); 11667c478bd9Sstevel@tonic-gate 11677c478bd9Sstevel@tonic-gate /* 11687c478bd9Sstevel@tonic-gate * exit loop when pgcnt satisfied or segment boundary reached. 
11697c478bd9Sstevel@tonic-gate */ 11707c478bd9Sstevel@tonic-gate 11717c478bd9Sstevel@tonic-gate } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 11727c478bd9Sstevel@tonic-gate 11737c478bd9Sstevel@tonic-gate *pfnp += i; /* set to next pfn to search */ 11747c478bd9Sstevel@tonic-gate 11757c478bd9Sstevel@tonic-gate if (i >= minctg) { 11767c478bd9Sstevel@tonic-gate *pgcnt -= i; 11777c478bd9Sstevel@tonic-gate return (plist); 11787c478bd9Sstevel@tonic-gate } 11797c478bd9Sstevel@tonic-gate 11807c478bd9Sstevel@tonic-gate /* 11817c478bd9Sstevel@tonic-gate * failure: minctg not satisfied. 11827c478bd9Sstevel@tonic-gate * 11837c478bd9Sstevel@tonic-gate * if next request crosses segment boundary, set next pfn 11847c478bd9Sstevel@tonic-gate * to search from the segment boundary. 11857c478bd9Sstevel@tonic-gate */ 11867c478bd9Sstevel@tonic-gate if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 11877c478bd9Sstevel@tonic-gate *pfnp = roundup(*pfnp, pfnseg + 1); 11887c478bd9Sstevel@tonic-gate 11897c478bd9Sstevel@tonic-gate /* clean up any pages already allocated */ 11907c478bd9Sstevel@tonic-gate 11917c478bd9Sstevel@tonic-gate while (plist) { 11927c478bd9Sstevel@tonic-gate pp = plist; 11937c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 11947c478bd9Sstevel@tonic-gate page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 11957c478bd9Sstevel@tonic-gate if (iolock) 11967c478bd9Sstevel@tonic-gate page_io_unlock(pp); 11977c478bd9Sstevel@tonic-gate page_unlock(pp); 11987c478bd9Sstevel@tonic-gate } 11997c478bd9Sstevel@tonic-gate 12007c478bd9Sstevel@tonic-gate return (NULL); 12017c478bd9Sstevel@tonic-gate } 1202843e1988Sjohnlev #endif /* !__xpv */ 12037c478bd9Sstevel@tonic-gate 12047c478bd9Sstevel@tonic-gate /* 12057c478bd9Sstevel@tonic-gate * verify that pages being returned from allocator have correct DMA attribute 12067c478bd9Sstevel@tonic-gate */ 12077c478bd9Sstevel@tonic-gate #ifndef DEBUG 1208a3114836SGerry Liu #define check_dma(a, b, c) (void)(0) 12097c478bd9Sstevel@tonic-gate #else 
12107c478bd9Sstevel@tonic-gate static void 12117c478bd9Sstevel@tonic-gate check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 12127c478bd9Sstevel@tonic-gate { 12137c478bd9Sstevel@tonic-gate if (dma_attr == NULL) 12147c478bd9Sstevel@tonic-gate return; 12157c478bd9Sstevel@tonic-gate 12167c478bd9Sstevel@tonic-gate while (cnt-- > 0) { 1217ae115bc7Smrj if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 12187c478bd9Sstevel@tonic-gate dma_attr->dma_attr_addr_lo) 1219903a11ebSrh panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp); 1220ae115bc7Smrj if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 12217c478bd9Sstevel@tonic-gate dma_attr->dma_attr_addr_hi) 1222903a11ebSrh panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp); 12237c478bd9Sstevel@tonic-gate pp = pp->p_next; 12247c478bd9Sstevel@tonic-gate } 12257c478bd9Sstevel@tonic-gate } 12267c478bd9Sstevel@tonic-gate #endif 12277c478bd9Sstevel@tonic-gate 1228843e1988Sjohnlev #if !defined(__xpv) 12297c478bd9Sstevel@tonic-gate static page_t * 12307c478bd9Sstevel@tonic-gate page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 12317c478bd9Sstevel@tonic-gate { 12327c478bd9Sstevel@tonic-gate pfn_t pfn; 12337c478bd9Sstevel@tonic-gate int sgllen; 12347c478bd9Sstevel@tonic-gate uint64_t pfnseg; 12357c478bd9Sstevel@tonic-gate pgcnt_t minctg; 12367c478bd9Sstevel@tonic-gate page_t *pplist = NULL, *plist; 12377c478bd9Sstevel@tonic-gate uint64_t lo, hi; 12387c478bd9Sstevel@tonic-gate pgcnt_t pfnalign = 0; 12397c478bd9Sstevel@tonic-gate static pfn_t startpfn; 12407c478bd9Sstevel@tonic-gate static pgcnt_t lastctgcnt; 12417c478bd9Sstevel@tonic-gate uintptr_t align; 12427c478bd9Sstevel@tonic-gate 12437c478bd9Sstevel@tonic-gate CONTIG_LOCK(); 12447c478bd9Sstevel@tonic-gate 12457c478bd9Sstevel@tonic-gate if (mattr) { 12467c478bd9Sstevel@tonic-gate lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 12477c478bd9Sstevel@tonic-gate hi = mmu_btop(mattr->dma_attr_addr_hi); 12487c478bd9Sstevel@tonic-gate if (hi >= physmax) 
12497c478bd9Sstevel@tonic-gate hi = physmax - 1; 12507c478bd9Sstevel@tonic-gate sgllen = mattr->dma_attr_sgllen; 12517c478bd9Sstevel@tonic-gate pfnseg = mmu_btop(mattr->dma_attr_seg); 12527c478bd9Sstevel@tonic-gate 12537c478bd9Sstevel@tonic-gate align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 12547c478bd9Sstevel@tonic-gate if (align > MMU_PAGESIZE) 12557c478bd9Sstevel@tonic-gate pfnalign = mmu_btop(align); 12567c478bd9Sstevel@tonic-gate 12577c478bd9Sstevel@tonic-gate /* 12587c478bd9Sstevel@tonic-gate * in order to satisfy the request, must minimally 12597c478bd9Sstevel@tonic-gate * acquire minctg contiguous pages 12607c478bd9Sstevel@tonic-gate */ 12617c478bd9Sstevel@tonic-gate minctg = howmany(*pgcnt, sgllen); 12627c478bd9Sstevel@tonic-gate 12637c478bd9Sstevel@tonic-gate ASSERT(hi >= lo); 12647c478bd9Sstevel@tonic-gate 12657c478bd9Sstevel@tonic-gate /* 12667c478bd9Sstevel@tonic-gate * start from where last searched if the minctg >= lastctgcnt 12677c478bd9Sstevel@tonic-gate */ 12687c478bd9Sstevel@tonic-gate if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 12697c478bd9Sstevel@tonic-gate startpfn = lo; 12707c478bd9Sstevel@tonic-gate } else { 12717c478bd9Sstevel@tonic-gate hi = physmax - 1; 12727c478bd9Sstevel@tonic-gate lo = 0; 12737c478bd9Sstevel@tonic-gate sgllen = 1; 12747c478bd9Sstevel@tonic-gate pfnseg = mmu.highest_pfn; 12757c478bd9Sstevel@tonic-gate minctg = *pgcnt; 12767c478bd9Sstevel@tonic-gate 12777c478bd9Sstevel@tonic-gate if (minctg < lastctgcnt) 12787c478bd9Sstevel@tonic-gate startpfn = lo; 12797c478bd9Sstevel@tonic-gate } 12807c478bd9Sstevel@tonic-gate lastctgcnt = minctg; 12817c478bd9Sstevel@tonic-gate 12827c478bd9Sstevel@tonic-gate ASSERT(pfnseg + 1 >= (uint64_t)minctg); 12837c478bd9Sstevel@tonic-gate 12847c478bd9Sstevel@tonic-gate /* conserve 16m memory - start search above 16m when possible */ 12857c478bd9Sstevel@tonic-gate if (hi > PFN_16M && startpfn < PFN_16M) 12867c478bd9Sstevel@tonic-gate startpfn = PFN_16M; 
12877c478bd9Sstevel@tonic-gate 12887c478bd9Sstevel@tonic-gate pfn = startpfn; 12897c478bd9Sstevel@tonic-gate if (pfnalign) 12907c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 12917c478bd9Sstevel@tonic-gate 12927c478bd9Sstevel@tonic-gate while (pfn + minctg - 1 <= hi) { 12937c478bd9Sstevel@tonic-gate 12947c478bd9Sstevel@tonic-gate plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 12957c478bd9Sstevel@tonic-gate if (plist) { 12967c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &plist); 12977c478bd9Sstevel@tonic-gate sgllen--; 12987c478bd9Sstevel@tonic-gate /* 12997c478bd9Sstevel@tonic-gate * return when contig pages no longer needed 13007c478bd9Sstevel@tonic-gate */ 13017c478bd9Sstevel@tonic-gate if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 13027c478bd9Sstevel@tonic-gate startpfn = pfn; 13037c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 13047c478bd9Sstevel@tonic-gate check_dma(mattr, pplist, *pgcnt); 13057c478bd9Sstevel@tonic-gate return (pplist); 13067c478bd9Sstevel@tonic-gate } 13077c478bd9Sstevel@tonic-gate minctg = howmany(*pgcnt, sgllen); 13087c478bd9Sstevel@tonic-gate } 13097c478bd9Sstevel@tonic-gate if (pfnalign) 13107c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 13117c478bd9Sstevel@tonic-gate } 13127c478bd9Sstevel@tonic-gate 13137c478bd9Sstevel@tonic-gate /* cannot find contig pages in specified range */ 13147c478bd9Sstevel@tonic-gate if (startpfn == lo) { 13157c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 13167c478bd9Sstevel@tonic-gate return (NULL); 13177c478bd9Sstevel@tonic-gate } 13187c478bd9Sstevel@tonic-gate 13197c478bd9Sstevel@tonic-gate /* did not start with lo previously */ 13207c478bd9Sstevel@tonic-gate pfn = lo; 13217c478bd9Sstevel@tonic-gate if (pfnalign) 13227c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 13237c478bd9Sstevel@tonic-gate 13247c478bd9Sstevel@tonic-gate /* allow search to go above startpfn */ 13257c478bd9Sstevel@tonic-gate while (pfn < startpfn) { 13267c478bd9Sstevel@tonic-gate 
13277c478bd9Sstevel@tonic-gate plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 13287c478bd9Sstevel@tonic-gate if (plist != NULL) { 13297c478bd9Sstevel@tonic-gate 13307c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &plist); 13317c478bd9Sstevel@tonic-gate sgllen--; 13327c478bd9Sstevel@tonic-gate 13337c478bd9Sstevel@tonic-gate /* 13347c478bd9Sstevel@tonic-gate * return when contig pages no longer needed 13357c478bd9Sstevel@tonic-gate */ 13367c478bd9Sstevel@tonic-gate if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 13377c478bd9Sstevel@tonic-gate startpfn = pfn; 13387c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 13397c478bd9Sstevel@tonic-gate check_dma(mattr, pplist, *pgcnt); 13407c478bd9Sstevel@tonic-gate return (pplist); 13417c478bd9Sstevel@tonic-gate } 13427c478bd9Sstevel@tonic-gate minctg = howmany(*pgcnt, sgllen); 13437c478bd9Sstevel@tonic-gate } 13447c478bd9Sstevel@tonic-gate if (pfnalign) 13457c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 13467c478bd9Sstevel@tonic-gate } 13477c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 13487c478bd9Sstevel@tonic-gate return (NULL); 13497c478bd9Sstevel@tonic-gate } 1350843e1988Sjohnlev #endif /* !__xpv */ 13517c478bd9Sstevel@tonic-gate 13527c478bd9Sstevel@tonic-gate /* 13537c478bd9Sstevel@tonic-gate * mnode_range_cnt() calculates the number of memory ranges for mnode and 13547c478bd9Sstevel@tonic-gate * memranges[]. Used to determine the size of page lists and mnoderanges. 
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
	/* Xen domains have no NUMA concept: exactly one range for mnode 0. */
	ASSERT(mnode == 0);
	return (1);
#else	/* __xpv */
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		/* memranges[] is indexed high-to-low; start at the lowest. */
		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
#endif	/* __xpv */
}

/*
 * qsort(3C) comparator ordering mnoderange_t entries by ascending
 * ->mnr_pfnlo.  Returns -1/0/1 without risking pfn_t subtraction overflow.
 */
static int
mnoderange_cmp(const void *v1, const void *v2)
{
	const mnoderange_t *m1 = v1;
	const mnoderange_t *m2 = v2;

	if (m1->mnr_pfnlo < m2->mnr_pfnlo)
		return (-1);
	/* Equal lows compare equal (0); otherwise 1. */
	return (m1->mnr_pfnlo > m2->mnr_pfnlo);
}

/*
 * Populate the caller-provided mnoderanges[] array with one entry per
 * (memory node, memrange) intersection, then sort the array by ascending
 * ->mnr_pfnlo and rebuild the ->mnr_next descending-pfn linked list that
 * the page allocation code walks from mtypetop.  Also derives the global
 * mtypetop/mtype16m/mtype4g indices.  Called once from
 * page_coloring_setup(); the array must hold at least mnoderangecnt
 * entries (sized by page_coloring_init()).
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	mnoderange_t *mp;
	size_t nr_ranges;
	size_t mnode;

	for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
	    mnode < max_mem_nodes; mnode++) {
		/*
		 * NOTE(review): mri is size_t, so the "mri >= 0" test in the
		 * loop below is always true for an unsigned type; the loop in
		 * practice terminates via the "else break" arm once the
		 * node's physmax falls within MEMRANGEHI(mri), but this
		 * differs from the signed int mri used by mnode_range_cnt()
		 * above -- confirm intent.
		 */
		size_t mri = nranges - 1;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/* Skip memranges entirely below this node's base pfn. */
		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/* Emit one entry per memrange the node overlaps. */
		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mp->mnr_mnode = mnode;
			mp->mnr_memrange = mri;
			mp->mnr_next = -1;
			mp->mnr_exists = 1;
			mp++;
			nr_ranges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}

	/*
	 * mnoderangecnt can be larger than nr_ranges when memory DR is
	 * supposedly supported.
	 */
	VERIFY3U(nr_ranges, <=, mnoderangecnt);

	qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);

	/*
	 * If some intrepid soul takes the axe to the memory DR code, we can
	 * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
	 *
	 * The VERIFY3U() above can be "==" then too.
	 */
	for (size_t i = 1; i < nr_ranges; i++)
		mnoderanges[i].mnr_next = i - 1;

	mtypetop = nr_ranges - 1;
	mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);
}

#ifndef	__xpv
/*
 * Update mnoderanges for memory hot-add DR operations.
 */
static void
mnode_range_add(int mnode)
{
	int	*prev;
	int	n, mri;
	pfn_t	start, end;
	extern	void membar_sync(void);

	ASSERT(0 <= mnode && mnode < max_mem_nodes);
	ASSERT(mem_node_config[mnode].exists);
	start = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;
	ASSERT(start <= end);
	/* Serialize against concurrent DR updates of mnoderanges[]. */
	mutex_enter(&mnoderange_lock);

#ifdef	DEBUG
	/* Check whether it interleaves with other memory nodes. */
	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
		ASSERT(mnoderanges[n].mnr_exists);
		if (mnoderanges[n].mnr_mnode == mnode)
			continue;
		ASSERT(start > mnoderanges[n].mnr_pfnhi ||
		    end < mnoderanges[n].mnr_pfnlo);
	}
#endif	/* DEBUG */

	mri = nranges - 1;
	while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
		mri--;
	while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
		/* Check whether mtype already exists. */
		for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
			if (mnoderanges[n].mnr_mnode == mnode &&
			    mnoderanges[n].mnr_memrange == mri) {
				/* Widen the existing entry in place. */
				mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
				    start);
				mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
				    end);
				break;
			}
		}

		/* Add a new entry if it doesn't exist yet. */
		if (n == -1) {
			/* Try to find an unused entry in mnoderanges array. */
			for (n = 0; n < mnoderangecnt; n++) {
				if (mnoderanges[n].mnr_exists == 0)
					break;
			}
			ASSERT(n < mnoderangecnt);
			mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
			mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
			mnoderanges[n].mnr_mnode = mnode;
			mnoderanges[n].mnr_memrange = mri;
			mnoderanges[n].mnr_exists = 1;
			/* Page 0 should always be present. */
			for (prev = &mtypetop;
			    mnoderanges[*prev].mnr_pfnlo > start;
			    prev = &mnoderanges[*prev].mnr_next) {
				ASSERT(mnoderanges[*prev].mnr_next >= 0);
				ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
			}
			mnoderanges[n].mnr_next = *prev;
			/*
			 * Publish the fully-initialized entry before linking
			 * it so lock-free readers of the mnr_next chain never
			 * observe a partially-built node.
			 */
			membar_sync();
			*prev = n;
		}

		if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
			mri--;
		else
			break;
	}

	mutex_exit(&mnoderange_lock);
}

/*
 * Update mnoderanges for memory hot-removal DR operations.
 * Not implemented: deliberately panics a DEBUG kernel if reached.
 */
static void
mnode_range_del(int mnode)
{
	_NOTE(ARGUNUSED(mnode));
	ASSERT(0 <= mnode && mnode < max_mem_nodes);
	/* TODO: support deletion operation. */
	ASSERT(0);
}

/*
 * DR entry point: add the pfn slice [start, end] to its memory node and,
 * when DR is enabled, fold it into the mnoderanges list.
 */
void
plat_slice_add(pfn_t start, pfn_t end)
{
	mem_node_add_slice(start, end);
	if (plat_dr_enabled()) {
		mnode_range_add(PFN_2_MEM_NODE(start));
	}
}

/*
 * DR entry point: remove the pfn slice [start, end].  The slice must lie
 * within a single memory node.
 */
void
plat_slice_del(pfn_t start, pfn_t end)
{
	ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
	ASSERT(plat_dr_enabled());
	mnode_range_del(PFN_2_MEM_NODE(start));
	mem_node_del_slice(start, end);
}
#endif	/* __xpv */

/*
 * Choose the starting mtype for a page_create-style allocation and set
 * PGI_MT_RANGE* bits in *flags describing how far down the mtype list the
 * caller may search.  Returns mtypetop (highest range) unless the request
 * must be steered below 4g/16m.
 */
/*ARGSUSED*/
int
mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
{
	int mtype = mtypetop;

#if !defined(__xpv)
#if defined(__i386)
	/*
	 * set the mtype range
	 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
	 * - for non kmem requests, set range to above 4g if memory below 4g
	 * runs low.
	 */
	if (restricted_kmemalloc && VN_ISKAS(vp) &&
	    (caddr_t)(vaddr) >= kernelheap &&
	    (caddr_t)(vaddr) < ekernelheap) {
		ASSERT(physmax4g);
		mtype = mtype4g;
		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
		    btop(pgsz), *flags)) {
			*flags |= PGI_MT_RANGE16M;
		} else {
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
			VM_STAT_COND_ADD((*flags & PG_PANIC),
			    vmm_vmstats.pgpanicalloc);
			*flags |= PGI_MT_RANGE0;
		}
		return (mtype);
	}
#endif	/* __i386 */

	if (RESTRICT4G_ALLOC) {
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
		/* here only for > 4g systems */
		*flags |= PGI_MT_RANGE4G;
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
		*flags |= PGI_MT_RANGE0;
	}
#endif	/* !__xpv */
	return (mtype);
}


/* mtype init for page_get_replacement_page */
/*ARGSUSED*/
int
mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
{
	int mtype = mtypetop;
#if !defined(__xpv)
	/* Preserve the low-16m pool when free memory is scarce. */
	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		*flags |= PGI_MT_RANGE0;
	}
#endif
	return (mtype);
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mnr_lim = MRI_0;

		/* PGI_MT_NEXT: resume the walk after the given mtype. */
		if (flags & PGI_MT_NEXT) {
			mtype = mnoderanges[mtype].mnr_next;
		}
		if (flags & PGI_MT_RANGE4G)
			mnr_lim = MRI_4G;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mnr_lim = MRI_16M;	/* exclude 0-16m range */
		while (mtype != -1 &&
		    mnoderanges[mtype].mnr_memrange <= mnr_lim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype = mnoderanges[mtype].mnr_next;
		}
	} else if (mnoderanges[mtype].mnr_mnode == mnode) {
		return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.  cnt > 0 means pages were added, cnt < 0 removed;
 * only the below-4g accounting (maxmem4g) is adjusted, so this is a
 * no-op unless physmax4g is set.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype;
	pgcnt_t	inc;
	spgcnt_t scnt = (spgcnt_t)(cnt);
	pgcnt_t	acnt = ABS(scnt);
	pfn_t	endpfn = startpfn + acnt;
	pfn_t	pfn, lo;

	if (!physmax4g)
		return;

	/*
	 * Walk the mnoderange list from the top (highest pfns) downward,
	 * attributing each overlapping piece of [startpfn, endpfn) to its
	 * range as we go.
	 */
	mtype = mtypetop;
	for (pfn = endpfn; pfn > startpfn; ) {
		ASSERT(mtype != -1);
		lo = mnoderanges[mtype].mnr_pfnlo;
		if (pfn > lo) {
			/* Portion of the span that falls in this range. */
			if (startpfn >= lo) {
				inc = pfn - startpfn;
			} else {
				inc = pfn - lo;
			}
			/* MRI_4G is the above-4g memrange; skip it. */
			if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
				if (scnt > 0)
					maxmem4g += inc;
				else
					maxmem4g -= inc;
			}
			pfn -= inc;
		}
		mtype = mnoderanges[mtype].mnr_next;
	}
}

/* Return the memrange index backing the given mtype. */
int
mtype_2_mrange(int mtype)
{
	return (mnoderanges[mtype].mnr_memrange);
}

/* Report the inclusive pfn bounds [*pfnlo, *pfnhi] of the given mtype. */
void
mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
{
	_NOTE(ARGUNUSED(mnode));
	ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
	*pfnlo = mnoderanges[mtype].mnr_pfnlo;
	*pfnhi = mnoderanges[mtype].mnr_pfnhi;
}

/*
 * Add the space needed for the DEBUG-only per-mtype/per-size-class/per-color
 * page counters to ctrs_sz and return the total.  Non-DEBUG kernels return
 * ctrs_sz unchanged.
 */
size_t
plcnt_sz(size_t ctrs_sz)
{
#ifdef DEBUG
	int	szc, colors;

	ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
	for (szc = 0; szc < mmu_page_sizes; szc++) {
		colors = page_get_pagecolors(szc);
		ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
	}
#endif
	return (ctrs_sz);
}

/*
 * Carve the DEBUG-only counter arrays (sized by plcnt_sz()) out of the
 * pre-allocated buffer at addr, wiring them into each mnoderange entry.
 * Returns the first unused address past the carved region.
 */
caddr_t
plcnt_init(caddr_t addr)
{
#ifdef DEBUG
	int	mt, szc, colors;

	for (mt = 0; mt < mnoderangecnt; mt++) {
		mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
		addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			colors = page_get_pagecolors(szc);
			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
			    (pgcnt_t *)addr;
			addr += (sizeof (pgcnt_t) * colors);
		}
	}
#endif
	return (addr);
}

/*
 * Atomically adjust the free-page counters for mtype by cnt (positive when
 * pages are added to a free/cache list, negative when removed).  Also
 * maintains the global freemem4g count for below-4g ranges.
 */
void
plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
{
	_NOTE(ARGUNUSED(pp));
#ifdef DEBUG
	int	bin = PP_2_BIN(pp);

	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
	    cnt);
#endif
	ASSERT(mtype == PP_2_MTYPE(pp));
	/* MRI_4G is the above-4g memrange; everything else is below 4g. */
	if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
		atomic_add_long(&freemem4g, cnt);
	if (flags & PG_CACHE_LIST)
		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
	else
		atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
	atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
}

/*
 * Returns the free page count for mnode
 *
 * NOTE(review): pgcnt is accumulated as pgcnt_t but returned through an
 * int return type, which could truncate on very large nodes -- confirm
 * whether callers rely on the narrower type.
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mtypetop;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	/* Find the first mtype on this node, then walk the rest. */
	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += MTYPE_FREEMEM(mtype);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	_NOTE(ARGUNUSED(l2_linesz));
	size_t	colorsz = 0;
	int	i;
	int	colors;

#if defined(__xpv)
	/*
	 * Hypervisor domains currently don't have any concept of NUMA.
	 * Hence we'll act like there is only 1 memrange.
	 */
	i = memrange_num(1);
#else /* !__xpv */
	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 * To support memory DR operations, we need to keep memory ranges
	 * for possible memory hot-add operations.
	 */
	if (plat_dr_physmax > physmax)
		i = memrange_num(plat_dr_physmax);
	else
		i = memrange_num(physmax);
#if defined(__i386)
	if (i > MRI_4G)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == MRI_4G)
		physmax4g = 1;
#endif /* !__xpv */
	/* Trim the leading (too-high) entries off the memranges table. */
	memranges += i;
	nranges -= i;

	ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);

	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	ASSERT(ISP2(l2_colors));

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* initialize number of colors per page size */
	for (i = 0; i <= mmu.max_page_level; i++) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
		colorequivszc[i] = 0;
	}

	/*
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_page_colors != 0) {

		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			/* Clamp shift so at least one color bin remains. */
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1) {

		int a = lowbit(colorequiv) - 1;
		if (a > 15)
			a = 15;

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			/* Keep the larger of the two equivalence shifts. */
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* size for mnoderanges */
	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
		mnoderangecnt += mnode_range_cnt(i);
	if (plat_dr_support_memory()) {
		/*
		 * Reserve enough space for memory DR operations.
		 * Two extra mnoderanges for possible fragmentations,
		 * one for the 2G boundary and the other for the 4G boundary.
		 * We don't expect a memory board crossing the 16M boundary
		 * for memory hot-add operations on x86 platforms.
		 */
		mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
	}
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
19617c478bd9Sstevel@tonic-gate */ 19627c478bd9Sstevel@tonic-gate void 19637c478bd9Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr) 19647c478bd9Sstevel@tonic-gate { 19657c478bd9Sstevel@tonic-gate int i; 19667c478bd9Sstevel@tonic-gate int j; 19677c478bd9Sstevel@tonic-gate int k; 19687c478bd9Sstevel@tonic-gate caddr_t addr; 19697c478bd9Sstevel@tonic-gate int colors; 19707c478bd9Sstevel@tonic-gate 19717c478bd9Sstevel@tonic-gate /* 19727c478bd9Sstevel@tonic-gate * do page coloring setup 19737c478bd9Sstevel@tonic-gate */ 19747c478bd9Sstevel@tonic-gate addr = pcmemaddr; 19757c478bd9Sstevel@tonic-gate 19767c478bd9Sstevel@tonic-gate mnoderanges = (mnoderange_t *)addr; 19777c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (mnoderange_t)); 19787c478bd9Sstevel@tonic-gate 19797c478bd9Sstevel@tonic-gate mnode_range_setup(mnoderanges); 19807c478bd9Sstevel@tonic-gate 19817c478bd9Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 19827c478bd9Sstevel@tonic-gate fpc_mutex[k] = (kmutex_t *)addr; 19837c478bd9Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 19847c478bd9Sstevel@tonic-gate } 19857c478bd9Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 19867c478bd9Sstevel@tonic-gate cpc_mutex[k] = (kmutex_t *)addr; 19877c478bd9Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 19887c478bd9Sstevel@tonic-gate } 1989d94ffb28Sjmcp page_freelists = (page_t ****)addr; 19907c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t ***)); 19917c478bd9Sstevel@tonic-gate 19927c478bd9Sstevel@tonic-gate page_cachelists = (page_t ***)addr; 19937c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t **)); 19947c478bd9Sstevel@tonic-gate 19957c478bd9Sstevel@tonic-gate for (i = 0; i < mnoderangecnt; i++) { 1996d94ffb28Sjmcp page_freelists[i] = (page_t ***)addr; 19977c478bd9Sstevel@tonic-gate addr += (mmu_page_sizes * sizeof (page_t **)); 19987c478bd9Sstevel@tonic-gate 19997c478bd9Sstevel@tonic-gate for (j = 0; j < mmu_page_sizes; j++) { 
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		/* carve the per-color cachelist head array out of the same run */
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

#if defined(__xpv)
/*
 * Give back 10% of the io_pool pages to the free list.
 * Don't shrink the pool below some absolute minimum.
 *
 * Called with io_pool_lock not held; acquires and drops it here.
 */
static void
page_io_pool_shrink()
{
	int retcnt;
	page_t *pp, *pp_first, *pp_last, **curpool;
	mfn_t mfn;
	int bothpools = 0;

	mutex_enter(&io_pool_lock);
	io_pool_shrink_attempts++;	/* should be a kstat? */
	retcnt = io_pool_cnt / 10;
	if (io_pool_cnt - retcnt < io_pool_cnt_min)
		retcnt = io_pool_cnt - io_pool_cnt_min;
	if (retcnt <= 0)
		goto done;
	io_pool_shrinks++;	/* should be a kstat? */
	curpool = &io_pool_4g;
domore:
	/*
	 * Loop through taking pages from the end of the list
	 * (highest mfns) till amount to return reached.
	 */
	for (pp = *curpool; pp && retcnt > 0; ) {
		pp_first = pp_last = pp->p_prev;
		/*
		 * The pool is a circular list; when the tail equals the
		 * head only one page remains, so stop rather than empty it.
		 */
		if (pp_first == *curpool)
			break;
		retcnt--;
		io_pool_cnt--;
		page_io_pool_sub(curpool, pp_first, pp_last);
		/* track the lowest mfn released so later repopulation rescans it */
		if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
			start_mfn = mfn;
		page_free(pp_first, 1);
		pp = *curpool;
	}
	if (retcnt != 0 && !bothpools) {
		/*
		 * If not enough found in less constrained pool try the
		 * more constrained one.
		 */
		curpool = &io_pool_16m;
		bothpools = 1;
		goto domore;
	}
done:
	mutex_exit(&io_pool_lock);
}

#endif	/* __xpv */

/*
 * Adjust page_create() flags for this platform before an allocation
 * is attempted.  Returns the (possibly augmented) flags.
 */
uint_t
page_create_update_flags_x86(uint_t flags)
{
#if defined(__xpv)
	/*
	 * Check this is an urgent allocation and free pages are depleted.
	 */
	if (!(flags & PG_WAIT) && freemem < desfree)
		page_io_pool_shrink();
#else /* !__xpv */
	/*
	 * page_create_get_something may call this because 4g memory may be
	 * depleted. Set flags to allow for relocation of base page below
	 * 4g if necessary.
	 */
	if (physmax4g)
		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
#endif /* __xpv */
	return (flags);
}

/*
 * Buffer color for bp: always bin 0 here (no buffer coloring done).
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

#if defined(__xpv)

/*
 * Take pages out of an io_pool.  The pool is a circular doubly linked
 * list threaded through p_next/p_prev; [pp_first, pp_last] must be a
 * contiguous sub-span of the list.
 */
static void
page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
{
	if (*poolp == pp_first) {
		*poolp = pp_last->p_next;
		/* span was the entire list */
		if (*poolp == pp_first)
			*poolp = NULL;
	}
	/* unlink the span, then re-close it into its own circular list */
	pp_first->p_prev->p_next = pp_last->p_next;
	pp_last->p_next->p_prev = pp_first->p_prev;
	pp_first->p_prev = pp_last;
	pp_last->p_next = pp_first;
}

/*
 * Put a page on the io_pool list. The list is ordered by increasing MFN.
 */
static void
page_io_pool_add(page_t **poolp, page_t *pp)
{
	page_t	*look;
	mfn_t	mfn = mfn_list[pp->p_pagenum];

	if (*poolp == NULL) {
		/* first element: a one-page circular list */
		*poolp = pp;
		pp->p_next = pp;
		pp->p_prev = pp;
		return;
	}

	/*
	 * Since we try to take pages from the high end of the pool
	 * chances are good that the pages to be put on the list will
	 * go at or near the end of the list. so start at the end and
	 * work backwards.
	 */
	look = (*poolp)->p_prev;
	while (mfn < mfn_list[look->p_pagenum]) {
		look = look->p_prev;
		if (look == (*poolp)->p_prev)
			break; /* backed all the way to front of list */
	}

	/* insert after look */
	pp->p_prev = look;
	pp->p_next = look->p_next;
	pp->p_next->p_prev = pp;
	look->p_next = pp;
	if (mfn < mfn_list[(*poolp)->p_pagenum]) {
		/*
		 * we inserted a new first list element
		 * adjust pool pointer to newly inserted element
		 */
		*poolp = pp;
	}
}

/*
 * Add a page to the io_pool.  Setting the force flag will force the page
 * into the io_pool no matter what.
 *
 * May free a page (either pp or a displaced higher-MFN pool page)
 * back to the system when the pool is already full.
 */
static void
add_page_to_pool(page_t *pp, int force)
{
	page_t *highest;
	page_t *freep = NULL;

	mutex_enter(&io_pool_lock);
	/*
	 * Always keep the scarce low memory pages
	 */
	if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
		++io_pool_cnt;
		page_io_pool_add(&io_pool_16m, pp);
		goto done;
	}
	if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
		++io_pool_cnt;
		page_io_pool_add(&io_pool_4g, pp);
	} else {
		/*
		 * Pool is full: prefer keeping lower-MFN pages.  If pp is
		 * lower than the current highest pool page, swap them and
		 * free the displaced page; otherwise just free pp.
		 */
		highest = io_pool_4g->p_prev;
		if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
			page_io_pool_sub(&io_pool_4g, highest, highest);
			page_io_pool_add(&io_pool_4g, pp);
			freep = highest;
		} else {
			freep = pp;
		}
	}
done:
	mutex_exit(&io_pool_lock);
	/* free outside the pool lock */
	if (freep)
		page_free(freep, 1);
}


int contig_pfn_cnt;	/* no of pfns in the contig pfn list */
int contig_pfn_max;	/* capacity of the contig pfn list */
int next_alloc_pfn;	/* next position in list to start a contig search */
int contig_pfnlist_updates;	/* pfn list update count */
int contig_pfnlist_builds;	/* how many times have we (re)built list */
int contig_pfnlist_buildfailed;	/* how many times has list build failed */
int
create_contig_pending;	/* nonzero means taskq creating contig list */
pfn_t *contig_pfn_list = NULL;	/* list of contig pfns in ascending mfn order */

/*
 * Function to use in sorting a list of pfns by their underlying mfns.
 * qsort(9F)-style comparator over pfn_t elements.
 */
static int
mfn_compare(const void *pfnp1, const void *pfnp2)
{
	mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
	mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];

	if (mfn1 > mfn2)
		return (1);
	if (mfn1 < mfn2)
		return (-1);
	return (0);
}

/*
 * Compact the contig_pfn_list by tossing all the non-contiguous
 * elements from the list.  On return, only pfns whose mfn is adjacent
 * to at least one neighbor's mfn remain; the tail is zeroed out.
 */
static void
compact_contig_pfn_list(void)
{
	pfn_t pfn, lapfn, prev_lapfn;
	mfn_t mfn;
	int i, newcnt = 0;

	prev_lapfn = 0;
	for (i = 0; i < contig_pfn_cnt - 1; i++) {
		pfn = contig_pfn_list[i];
		lapfn = contig_pfn_list[i + 1];
		mfn = mfn_list[pfn];
		/*
		 * See if next pfn is for a contig mfn
		 */
		if (mfn_list[lapfn] != mfn + 1)
			continue;
		/*
		 * pfn and lookahead are both put in list
		 * unless pfn is the previous lookahead.
		 */
		if (pfn != prev_lapfn)
			contig_pfn_list[newcnt++] = pfn;
		contig_pfn_list[newcnt++] = lapfn;
		prev_lapfn = lapfn;
	}
	for (i = newcnt; i < contig_pfn_cnt; i++)
		contig_pfn_list[i] = 0;
	contig_pfn_cnt = newcnt;
}

/*
 * Taskq callback: retry the list build, this time allowed to sleep.
 */
/*ARGSUSED*/
static void
call_create_contiglist(void *arg)
{
	(void) create_contig_pfnlist(PG_WAIT);
}

/*
 * Create list of freelist pfns that have underlying
 * contiguous mfns.  The list is kept in ascending mfn order.
 * returns 1 if list created else 0.
 */
static int
create_contig_pfnlist(uint_t flags)
{
	pfn_t pfn;
	page_t *pp;
	int ret = 1;

	mutex_enter(&contig_list_lock);
	/* someone else already built it */
	if (contig_pfn_list != NULL)
		goto out;
	/* size for all of freemem plus 10% slack */
	contig_pfn_max = freemem + (freemem / 10);
	contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
	    (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
	if (contig_pfn_list == NULL) {
		/*
		 * If we could not create the contig list (because
		 * we could not sleep for memory).  Dispatch a taskq that can
		 * sleep to get the memory.
		 */
		if (!create_contig_pending) {
			if (taskq_dispatch(system_taskq, call_create_contiglist,
			    NULL, TQ_NOSLEEP) != TASKQID_INVALID)
				create_contig_pending = 1;
		}
		contig_pfnlist_buildfailed++;	/* count list build failures */
		ret = 0;
		goto out;
	}
	create_contig_pending = 0;
	ASSERT(contig_pfn_cnt == 0);
	/* gather every pfn whose page is currently on a free list */
	for (pfn = 0; pfn < mfn_count; pfn++) {
		pp = page_numtopp_nolock(pfn);
		if (pp == NULL || !PP_ISFREE(pp))
			continue;
		contig_pfn_list[contig_pfn_cnt] = pfn;
		if (++contig_pfn_cnt == contig_pfn_max)
			break;
	}
	/*
	 * Sanity check the new list.
	 */
	if (contig_pfn_cnt < 2) { /* no contig pfns */
		contig_pfn_cnt = 0;
		contig_pfnlist_buildfailed++;
		kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
		contig_pfn_list = NULL;
		contig_pfn_max = 0;
		ret = 0;
		goto out;
	}
	/* sort by underlying mfn, then drop non-contiguous entries */
	qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
	compact_contig_pfn_list();
	/*
	 * Make sure next search of the newly created contiguous pfn
	 * list starts at the beginning of the list.
	 */
	next_alloc_pfn = 0;
	contig_pfnlist_builds++;	/* count list builds */
out:
	mutex_exit(&contig_list_lock);
	return (ret);
}


/*
 * Toss the current contig pfnlist.  Someone is about to do a massive
 * update to pfn<->mfn mappings.  So we have them destroy the list and lock
 * it till they are done with their update.
 *
 * Note: returns with contig_list_lock HELD; the caller must later call
 * unlock_contig_pfnlist() to release it.
 */
void
clear_and_lock_contig_pfnlist()
{
	pfn_t *listp = NULL;
	size_t listsize;

	mutex_enter(&contig_list_lock);
	if (contig_pfn_list != NULL) {
		listp = contig_pfn_list;
		listsize = contig_pfn_max * sizeof (pfn_t);
		contig_pfn_list = NULL;
		contig_pfn_max = contig_pfn_cnt = 0;
	}
	if (listp != NULL)
		kmem_free(listp, listsize);
}

/*
 * Unlock the contig_pfn_list.  The next attempted use of it will cause
 * it to be re-created.
 */
void
unlock_contig_pfnlist()
{
	mutex_exit(&contig_list_lock);
}

/*
 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment.
 * Removes pfn's old entry (sorted by oldmfn) and, if newmfn is valid and
 * adjacent to an existing entry, re-inserts pfn at its new sorted position.
 */
void
update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
{
	int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
	pfn_t probe_pfn;
	mfn_t probe_mfn;
	int drop_lock = 0;

	/*
	 * The caller may already hold contig_list_lock (mutex_owner check);
	 * only acquire/release it here if it does not.
	 */
	if (mutex_owner(&contig_list_lock) != curthread) {
		drop_lock = 1;
		mutex_enter(&contig_list_lock);
	}
	if (contig_pfn_list == NULL)
		goto done;
	contig_pfnlist_updates++;
	/*
	 * Find the pfn in the current list.  Use a binary chop to locate it.
	 */
	probe_hi = contig_pfn_cnt - 1;
	probe_lo = 0;
	probe_pos = (probe_hi + probe_lo) / 2;
	while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
		if (probe_pos == probe_lo) { /* pfn not in list */
			probe_pos = -1;
			break;
		}
		/* chop on the pfn's *old* mfn; the list is still sorted by it */
		if (pfn_to_mfn(probe_pfn) <= oldmfn)
			probe_lo = probe_pos;
		else
			probe_hi = probe_pos;
		probe_pos = (probe_hi + probe_lo) / 2;
	}
	if (probe_pos >= 0) {
		/*
		 * Remove pfn from list and ensure next alloc
		 * position stays in bounds.
		 */
		if (--contig_pfn_cnt <= next_alloc_pfn)
			next_alloc_pfn = 0;
		if (contig_pfn_cnt < 2) { /* no contig pfns */
			contig_pfn_cnt = 0;
			kmem_free(contig_pfn_list,
			    contig_pfn_max * sizeof (pfn_t));
			contig_pfn_list = NULL;
			contig_pfn_max = 0;
			goto done;
		}
		ovbcopy(&contig_pfn_list[probe_pos + 1],
		    &contig_pfn_list[probe_pos],
		    (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
	}
	if (newmfn == MFN_INVALID)
		goto done;
	/*
	 * Check if new mfn has adjacent mfns in the list
	 */
	probe_hi = contig_pfn_cnt - 1;
	probe_lo = 0;
	insert_after = -2;	/* -2 = no adjacent entry found yet */
	do {
		probe_pos = (probe_hi + probe_lo) / 2;
		probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
		if (newmfn == probe_mfn + 1)
			insert_after = probe_pos;
		else if (newmfn == probe_mfn - 1)
			insert_after = probe_pos - 1;
		if (probe_pos == probe_lo)
			break;
		if (probe_mfn <= newmfn)
			probe_lo = probe_pos;
		else
			probe_hi = probe_pos;
	} while (insert_after == -2);
	/*
	 * If there is space in the list and there are adjacent mfns
	 * insert the pfn in to its proper place in the list.
	 */
	if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
		insert_point = insert_after + 1;
		ovbcopy(&contig_pfn_list[insert_point],
		    &contig_pfn_list[insert_point + 1],
		    (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
		contig_pfn_list[insert_point] = pfn;
		contig_pfn_cnt++;
	}
done:
	if (drop_lock)
		mutex_exit(&contig_list_lock);
}

/*
 * Called to (re-)populate the io_pool from the free page lists.
 * Returns the resulting io_pool_cnt.
 */
long
populate_io_pool(void)
{
	pfn_t pfn;
	mfn_t mfn, max_mfn;
	page_t *pp;

	/*
	 * Figure out the bounds of the pool on first invocation.
	 * We use a percentage of memory for the io pool size.
	 * we allow that to shrink, but not to less than a fixed minimum
	 */
	if (io_pool_cnt_max == 0) {
		io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
		io_pool_cnt_lowater = io_pool_cnt_max;
		/*
		 * This is the first time in populate_io_pool, grab a va to use
		 * when we need to allocate pages.
		 */
		io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
	}
	/*
	 * If we are out of pages in the pool, then grow the size of the pool
	 */
	if (io_pool_cnt == 0) {
		/*
		 * Grow the max size of the io pool by 5%, but never more than
		 * 25% of physical memory.
		 */
		if (io_pool_cnt_max < physmem / 4)
			io_pool_cnt_max += io_pool_cnt_max / 20;
	}
	io_pool_grows++;	/* should be a kstat? (counts every call here) */

	/*
	 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
	 */
	(void) mfn_to_pfn(start_mfn);
	max_mfn = MIN(cached_max_mfn, PFN_4GIG);
	/* sweep mfns in ascending order, remembering where we stopped */
	for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
		pfn = mfn_to_pfn(mfn);
		if (pfn & PFN_IS_FOREIGN_MFN)
			continue;
		/*
		 * try to allocate it from free pages
		 */
		pp = page_numtopp_alloc(pfn);
		if (pp == NULL)
			continue;
		PP_CLRFREE(pp);
		add_page_to_pool(pp, 1);
		if (io_pool_cnt >= io_pool_cnt_max)
			break;
	}

	return (io_pool_cnt);
}

/*
 * Destroy a page that was being used for DMA I/O. It may or
 * may not actually go back to the io_pool.
 */
void
page_destroy_io(page_t *pp)
{
	mfn_t mfn = mfn_list[pp->p_pagenum];

	/*
	 * When the page was alloc'd a reservation was made, release it now
	 */
	page_unresv(1);
	/*
	 * Unload translations, if any, then hash out the
	 * page to erase its identity.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	page_hashout(pp, NULL);

	/*
	 * If the page came from the free lists, just put it back to them.
	 * DomU pages always go on the free lists as well.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
		page_free(pp, 1);
		return;
	}

	add_page_to_pool(pp, 0);
}


long contig_searches;		/* count of times contig pages requested */
long contig_search_restarts;	/* count of contig ranges tried */
long contig_search_failed;	/* count of contig alloc failures */

/*
 * Free partial page list: release every page accumulated on *pplist
 * back to the system and leave *pplist empty.
 */
static void
free_partial_list(page_t **pplist)
{
	page_t *pp;

	while (*pplist != NULL) {
		pp = *pplist;
		page_io_pool_sub(pplist, pp, pp);
		page_free(pp, 1);
	}
}

/*
 * Look thru the contiguous pfns that are not part of the io_pool for
 * contiguous free pages.  Return a list of the found pages or NULL.
 *
 * contig_list_lock is held for the duration of the scan.
 */
page_t *
find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
    pgcnt_t pfnalign)
{
	page_t *pp, *plist = NULL;
	mfn_t mfn, prev_mfn, start_mfn;
	pfn_t pfn;
	int pages_needed, pages_requested;
	int search_start;

	/*
	 * create the contig pfn list if not already done
	 */
retry:
	mutex_enter(&contig_list_lock);
	if (contig_pfn_list == NULL) {
		/* drop the lock while building; recheck after */
		mutex_exit(&contig_list_lock);
		if (!create_contig_pfnlist(flags)) {
			return (NULL);
		}
		goto retry;
	}
	contig_searches++;
	/*
	 * Search contiguous pfn list for physically contiguous pages not in
	 * the io_pool.  Start the search where the last search left off.
	 */
	pages_requested = pages_needed = npages;
	search_start = next_alloc_pfn;
	start_mfn = prev_mfn = 0;	/* 0 == no run in progress */
	while (pages_needed) {
		pfn = contig_pfn_list[next_alloc_pfn];
		mfn = pfn_to_mfn(pfn);
		/*
		 * Check if mfn is first one or contig to previous one and
		 * if page corresponding to mfn is free and that mfn
		 * range is not crossing a segment boundary.
		 */
		if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
		    (pp = page_numtopp_alloc(pfn)) != NULL &&
		    !((mfn & pfnseg) < (start_mfn & pfnseg))) {
			PP_CLRFREE(pp);
			page_io_pool_add(&plist, pp);
			pages_needed--;
			if (prev_mfn == 0) {
				if (pfnalign &&
				    mfn != P2ROUNDUP(mfn, pfnalign)) {
					/*
					 * not properly aligned
					 */
					contig_search_restarts++;
					free_partial_list(&plist);
					pages_needed = pages_requested;
					start_mfn = prev_mfn = 0;
					goto skip;
				}
				start_mfn = mfn;
			}
			prev_mfn = mfn;
		} else {
			/* run broken: give back what we had and start over */
			contig_search_restarts++;
			free_partial_list(&plist);
			pages_needed = pages_requested;
			start_mfn = prev_mfn = 0;
		}
skip:
		/* advance circularly through the list */
		if (++next_alloc_pfn == contig_pfn_cnt)
			next_alloc_pfn = 0;
		if (next_alloc_pfn == search_start)
			break; /* all pfns searched */
	}
	mutex_exit(&contig_list_lock);
	if (pages_needed) {
		contig_search_failed++;
		/*
		 * Failed to find enough contig pages.
		 * free partial page list
		 */
		free_partial_list(&plist);
	}
	return (plist);
}

/*
 * Search the reserved io pool pages for a page range with the
 * desired characteristics (DMA address range, alignment, segment,
 * and optional physical contiguity).
 */
page_t *
page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
{
	page_t *pp_first, *pp_last;
	page_t *pp, **poolp;
	pgcnt_t nwanted, pfnalign;
	uint64_t pfnseg;
	mfn_t mfn, tmfn, hi_mfn, lo_mfn;
	int align, attempt = 0;

	/* a single page is trivially contiguous */
	if (minctg == 1)
		contig = 0;
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);
	else
		pfnalign = 0;

try_again:
	/*
	 * See if we want pages for a legacy device
	 */
	if (hi_mfn < PFN_16MEG)
		poolp = &io_pool_16m;
	else
		poolp = &io_pool_4g;
try_smaller:
	/*
	 * Take pages from I/O pool.  We'll use pages from the highest
	 * MFN range possible.
	 */
	pp_first = pp_last = NULL;
	mutex_enter(&io_pool_lock);
	nwanted = minctg;
	/* walk backwards from the tail (highest mfn) of the circular pool */
	for (pp = *poolp; pp && nwanted > 0; ) {
		pp = pp->p_prev;

		/*
		 * skip pages above allowable range
		 */
		mfn = mfn_list[pp->p_pagenum];
		if (hi_mfn < mfn)
			goto skip;

		/*
		 * stop at pages below allowable range
		 */
		if (lo_mfn > mfn)
			break;
restart:
		if (pp_last == NULL) {
			/*
			 * Check alignment: tmfn is where the run would start
			 * if this page ends a run of minctg pages.
			 */
			tmfn = mfn - (minctg - 1);
			if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
				goto skip; /* not properly aligned */
			/*
			 * Check segment
			 */
			if ((mfn & pfnseg) < (tmfn & pfnseg))
				goto skip; /* crosses seg boundary */
			/*
			 * Start building page list
			 */
			pp_first = pp_last = pp;
			nwanted--;
		} else {
			/*
			 * check physical contiguity if required
			 * (list is built downward: pp_first holds the
			 * lowest mfn collected so far)
			 */
			if (contig &&
			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
				/*
				 * not a contiguous page, restart list.
				 */
				pp_last = NULL;
				nwanted = minctg;
				goto restart;
			} else { /* add page to list */
				pp_first = pp;
				nwanted--;
			}
		}
skip:
		if (pp == *poolp)
			break;
	}

	/*
	 * If we didn't find memory. Try the more constrained pool, then
	 * sweep free pages into the DMA pool and try again.
	 */
	if (nwanted != 0) {
		mutex_exit(&io_pool_lock);
		/*
		 * If we were looking in the less constrained pool and
		 * didn't find pages, try the more constrained pool.
		 */
		if (poolp == &io_pool_4g) {
			poolp = &io_pool_16m;
			goto try_smaller;
		}
		kmem_reap();
		if (++attempt < 4) {
			/*
			 * Grab some more io_pool pages
			 */
			(void) populate_io_pool();
			goto try_again; /* go around and retry */
		}
		return (NULL);
	}
	/*
	 * Found the pages, now snip them from the list
	 */
	page_io_pool_sub(poolp, pp_first, pp_last);
	io_pool_cnt -= minctg;
	/*
	 * reset low water mark
	 */
	if (io_pool_cnt < io_pool_cnt_lowater)
		io_pool_cnt_lowater = io_pool_cnt;
	mutex_exit(&io_pool_lock);
	return (pp_first);
}
/*
 * Obtain minctg pages that satisfy the DMA attributes in mattr by trading
 * machine pages (MFNs) with the hypervisor via balloon_replace_pages().
 * If PG_PHYSCONTIG is set (and minctg > 1), a single physically contiguous
 * extent is requested, with its size rounded up to the next power of two;
 * otherwise minctg single-page extents are requested.  Any pages allocated
 * beyond minctg because of the power-of-two rounding are returned to the
 * free list before we return.  Returns a circular list of minctg pages on
 * success, NULL on failure.
 */
page_t *
page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
    ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
{
	uint_t kflags;
	int order, extra, extpages, i, contig, nbits, extents;
	page_t *pp, *expp, *pp_first, **pplist = NULL;
	mfn_t *mfnlist = NULL;

	/* A single page is trivially "contiguous". */
	contig = flags & PG_PHYSCONTIG;
	if (minctg == 1)
		contig = 0;
	flags &= ~PG_PHYSCONTIG;
	kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
	/*
	 * Hypervisor will allocate extents, if we want contig
	 * pages extent must be >= minctg
	 */
	if (contig) {
		/* Round minctg up to a power-of-two extent size. */
		order = highbit(minctg) - 1;
		if (minctg & ((1 << order) - 1))
			order++;
		extpages = 1 << order;
	} else {
		order = 0;
		extpages = minctg;
	}
	if (extpages > minctg) {
		/* Reserve availrmem for the rounding-induced extra pages. */
		extra = extpages - minctg;
		if (!page_resv(extra, kflags))
			return (NULL);
	}
	pp_first = NULL;
	pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
	if (pplist == NULL)
		goto balloon_fail;
	mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
	if (mfnlist == NULL)
		goto balloon_fail;
	pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
	if (pp == NULL)
		goto balloon_fail;
	pp_first = pp;
	if (extpages > minctg) {
		/*
		 * fill out the rest of extent pages to swap
		 * with the hypervisor
		 */
		for (i = 0; i < extra; i++) {
			expp = page_create_va(vp,
			    (u_offset_t)(uintptr_t)io_pool_kva,
			    PAGESIZE, flags, &kvseg, io_pool_kva);
			if (expp == NULL)
				goto balloon_fail;
			/* Detach the page from its identity before the swap */
			(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
			page_io_unlock(expp);
			page_hashout(expp, NULL);
			page_io_lock(expp);
			/*
			 * add page to end of list
			 */
			expp->p_prev = pp_first->p_prev;
			expp->p_next = pp_first;
			expp->p_prev->p_next = expp;
			pp_first->p_prev = expp;
		}

	}
	/* Collect the page_t pointers in array form for the balloon call. */
	for (i = 0; i < extpages; i++) {
		pplist[i] = pp;
		pp = pp->p_next;
	}
	/* Address-bits limit that the hypervisor must honor. */
	nbits = highbit(mattr->dma_attr_addr_hi);
	extents = contig ? 1 : minctg;
	if (balloon_replace_pages(extents, pplist, nbits, order,
	    mfnlist) != extents) {
		/*
		 * NOTE(review): the message reports extpages, though the
		 * request was for `extents` extents of order `order` —
		 * confirm whether extpages is the intended figure.
		 */
		if (ioalloc_dbg)
			cmn_err(CE_NOTE, "request to hypervisor"
			    " for %d pages, maxaddr %" PRIx64 " failed",
			    extpages, mattr->dma_attr_addr_hi);
		goto balloon_fail;
	}

	kmem_free(pplist, extpages * sizeof (page_t *));
	kmem_free(mfnlist, extpages * sizeof (mfn_t));
	/*
	 * Return any excess pages to free list
	 */
	if (extpages > minctg) {
		for (i = 0; i < extra; i++) {
			/* Extras were linked at the tail; peel from there. */
			pp = pp_first->p_prev;
			page_sub(&pp_first, pp);
			page_io_unlock(pp);
			page_unresv(1);
			page_free(pp, 1);
		}
	}
	return (pp_first);
balloon_fail:
	/*
	 * Return pages to free list and return failure
	 */
	while (pp_first != NULL) {
		pp = pp_first;
		page_sub(&pp_first, pp);
		page_io_unlock(pp);
		if (pp->p_vnode != NULL)
			page_hashout(pp, NULL);
		page_free(pp, 1);
	}
	if (pplist)
		kmem_free(pplist, extpages * sizeof (page_t *));
	if (mfnlist)
		kmem_free(mfnlist, extpages * sizeof (mfn_t));
	/*
	 * NOTE(review): assumes page_unresv(0) is a harmless no-op when no
	 * extra pages were reserved (extpages == minctg) — confirm.
	 */
	page_unresv(extpages - minctg);
	return (NULL);
}

/*
 * Tear down a partially-built I/O page list: unlock and destroy each
 * page on the list.  Used on failure paths to give back what we got.
 */
static void
return_partial_alloc(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_destroy_io(pp);
	}
}

/*
 * Gather *npagesp pages honoring the DMA attributes in mattr, in runs of
 * contiguous pages no smaller than what the scatter/gather length allows.
 * On return *npagesp holds the count of pages still outstanding (0 means
 * the request was fully satisfied).  npages == -1 is treated as a request
 * for a single page with no alignment constraint.
 */
static page_t *
page_get_contigpages(
	struct vnode *vp,
	u_offset_t off,
	int *npagesp,
	uint_t flags,
	caddr_t vaddr,
	ddi_dma_attr_t *mattr)
{
	mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
	page_t *plist;	/* list to return */
	page_t *pp, *mcpl;
	int contig, anyaddr, npages, getone = 0;
	mfn_t lo_mfn;
	mfn_t hi_mfn;
	pgcnt_t pfnalign = 0;
	int align, sgllen;
	uint64_t pfnseg;
	pgcnt_t minctg;

	npages = *npagesp;
	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	sgllen = mattr->dma_attr_sgllen;
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	contig = flags & PG_PHYSCONTIG;
	if (npages == -1) {
		/* Caller wants exactly one page, no alignment needed. */
		npages = 1;
		pfnalign = 0;
	}
	/*
	 * Clear the contig flag if only one page is needed.
	 */
	if (npages == 1) {
		getone = 1;
		contig = 0;
	}

	/*
	 * Check if any page in the system is fine.
	 */
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
	if (!contig && anyaddr && !pfnalign) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
		    flags, &kvseg, vaddr);
		if (plist != NULL) {
			*npagesp = 0;
			return (plist);
		}
	}
	plist = NULL;
	/* Smallest contiguous run that still fits in the s/g list. */
	minctg = howmany(npages, sgllen);
	while (npages > sgllen || getone) {
		if (minctg > npages)
			minctg = npages;
		mcpl = NULL;
		/*
		 * We could want contig pages with no address range limits.
		 */
		if (anyaddr && contig) {
			/*
			 * Look for free contig pages to satisfy the request.
			 */
			mcpl = find_contig_free(minctg, flags, pfnseg,
			    pfnalign);
		}
		/*
		 * Try the reserved io pools next
		 */
		if (mcpl == NULL)
			mcpl = page_io_pool_alloc(mattr, contig, minctg);
		if (mcpl != NULL) {
			/* Give each page its vnode/offset identity. */
			pp = mcpl;
			do {
				if (!page_hashin(pp, vp, off, NULL)) {
					panic("page_get_contigpages:"
					    " hashin failed"
					    " pp %p, vp %p, off %llx",
					    (void *)pp, (void *)vp, off);
				}
				off += MMU_PAGESIZE;
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_set_props(pp, P_REF);
				page_io_lock(pp);
				pp = pp->p_next;
			} while (pp != mcpl);
		} else {
			/*
			 * Hypervisor exchange doesn't handle segment or
			 * alignment constraints
			 */
			if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
			    pfnalign)
				goto fail;
			/*
			 * Try exchanging pages with the hypervisor
			 */
			mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
			    flags, minctg);
			if (mcpl == NULL)
				goto fail;
			off += minctg * MMU_PAGESIZE;
		}
		check_dma(mattr, mcpl, minctg);
		/*
		 * Here with a minctg run of contiguous pages, add them to the
		 * list we will return for this request.
		 */
		page_list_concat(&plist, &mcpl);
		npages -= minctg;
		*npagesp = npages;
		sgllen--;
		if (getone)
			break;
	}
	return (plist);
fail:
	return_partial_alloc(plist);
	return (NULL);
}

/*
 * Allocator for domain 0 I/O pages. We match the required
 * DMA attributes and contiguity constraints.
 */
/*ARGSUSED*/
page_t *
page_create_io(
	struct vnode *vp,
	u_offset_t off,
	uint_t bytes,
	uint_t flags,
	struct as *as,
	caddr_t vaddr,
	ddi_dma_attr_t *mattr)
{
	page_t	*plist = NULL, *pp;
	int	npages = 0, contig, anyaddr, pages_req;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	pgcnt_t	pfnalign = 0;
	int	align;
	int	is_domu = 0;
	int	dummy, bytes_got;
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	/*
	 * Clear the contig flag if only one page is needed or the scatter
	 * gather list length is >= npages.
	 */
	pages_req = npages = mmu_btopr(bytes);
	contig = (flags & PG_PHYSCONTIG);
	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
	if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
		contig = 0;

	/*
	 * Check if any old page in the system is fine.
	 * DomU should always go down this path.
	 */
	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
	if ((!contig && anyaddr) || is_domu) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
		if (plist != NULL)
			return (plist);
		else if (is_domu)
			return (NULL); /* no memory available */
	}
	/*
	 * DomU should never reach here
	 */
	if (contig) {
		plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
		    mattr);
		if (plist == NULL)
			goto fail;
		/* Advance past what the contig allocation covered. */
		bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
		vaddr += bytes_got;
		off += bytes_got;
		/*
		 * We now have all the contiguous pages we need, but
		 * we may still need additional non-contiguous pages.
		 */
	}
	/*
	 * now loop collecting the requested number of pages, these do
	 * not have to be contiguous pages but we will use the contig
	 * page alloc code to get the pages since it will honor any
	 * other constraints the pages may have.
	 */
	while (npages--) {
		dummy = -1;	/* ask for one page, no alignment */
		pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
		if (pp == NULL)
			goto fail;
		page_add(&plist, pp);
		vaddr += MMU_PAGESIZE;
		off += MMU_PAGESIZE;
	}
	return (plist);
fail:
	/*
	 * Failed to get enough pages, return ones we did get
	 */
	return_partial_alloc(plist);
	return (NULL);
}

/*
 * Lock and return the page with the highest mfn that we can find. last_mfn
 * holds the last one found, so the next search can start from there. We
 * also keep a counter so that we don't loop forever if the machine has no
 * free pages.
 *
 * This is called from the balloon thread to find pages to give away. new_high
 * is used when new mfn's have been added to the system - we will reset our
 * search if the new mfn's are higher than our current search position.
 */
page_t *
page_get_high_mfn(mfn_t new_high)
{
	static mfn_t last_mfn = 0;	/* resume point across calls */
	pfn_t pfn;
	page_t *pp;
	ulong_t loop_count = 0;

	if (new_high > last_mfn)
		last_mfn = new_high;

	/* Scan downward at most mfn_count steps so we always terminate. */
	for (; loop_count < mfn_count; loop_count++, last_mfn--) {
		if (last_mfn == 0) {
			/* Wrapped; restart from the top of the machine. */
			last_mfn = cached_max_mfn;
		}

		pfn = mfn_to_pfn(last_mfn);
		if (pfn & PFN_IS_FOREIGN_MFN)
			continue;

		/* See if the page is free.  If so, lock it. */
		pp = page_numtopp_alloc(pfn);
		if (pp == NULL)
			continue;
		PP_CLRFREE(pp);

		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(pp));
		last_mfn--;
		return (pp);
	}
	return (NULL);
}

#else /* !__xpv */

/*
 * Get a page from any (free or cache) list within the given mnode that
 * falls inside the DMA address window in dma_attr.  Base pagesize only;
 * walks bins starting at origbin, then advances through the mtype ranges.
 */
static page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;
	int		plw_initialized;
	page_list_walker_t plw;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	/* Remember where we started so the cachelist pass can restart. */
	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		plw_initialized = 0;

		for (plw.plw_count = 0;
		    plw.plw_count < page_colors; plw.plw_count++) {

			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			/* Walk the circular bin list looking for a match. */
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			/* Lazily set up the color walker on first miss. */
			if (plw_initialized == 0) {
				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
				ASSERT(plw.plw_ceq_dif == page_colors);
				plw_initialized = 1;
			}

			/* Try splitting a larger page when the walker says. */
			if (plw.plw_do_split) {
				pp = page_freelist_split(szc, bin, mnode,
				    mtype,
				    mmu_btop(dma_attr->dma_attr_addr_lo),
				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
				    &plw);
				if (pp != NULL) {
					check_dma(dma_attr, pp, 1);
					return (pp);
				}
			}

			bin = page_list_walk_next_bin(szc, bin, &plw);
		}

		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						pp = NULL;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			/* First step jumps BIN_STEP to avoid color clumping. */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
static page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t bin;
	int mtype;
	page_t *pp;
	int n;			/* lowest mtype to consider */
	int m;			/* highest mtype to consider */
	int szc;
	int fullrange;		/* dma_attr covers whole mtype range? */
	int mnode;
	int local_failed_stat = 0;
	lgrp_mnode_cookie_t lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		/* No DMA limits: span the full range of memory types. */
		n = mtype16m;
		m = mtypetop;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		/* Sanity check the dma_attr */
		if (pfnlo > pfnhi)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		/* Does the window cover the n..m mtype ranges entirely? */
		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	szc = 0;

	/* cycling thru mtype handled by RANGE0 if n == mtype16m */
	if (n == mtype16m) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		mtype = m;
		do {
			if (fullrange != 0) {
				/* Whole range usable: plain list lookups. */
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				/* Partial range: per-page DMA filtering. */
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		} while (mtype != n &&
		    (mtype = mnoderanges[mtype].mnr_next) != -1);
		/* Count the local-lgroup failure only once. */
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of
page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only to create new pages (i.e PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified so the commitment level is only for
 * private interface specific to x86. This interface uses PSM
 * specific page_get_anylist() interface.
 *
 * (vp, off) name the new pages in the page hash; 'bytes' is rounded
 * up to whole pages; 'mattr', when non-NULL, restricts the physical
 * pages to the given DMA attributes.  On success a circular list of
 * i/o-locked, exclusively held pages is returned; on failure NULL is
 * returned and all accounting is undone.
 */

/*
 * Walk the page hash chain at 'index' looking for the page identified
 * by (vp, off); leaves 'pp' pointing at the match, or NULL if absent.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t	*plist = NULL;
	uint_t	plist_len = 0;
	pgcnt_t	npages;
	page_t	*npp = NULL;
	uint_t	pages_req;
	page_t	*pp;
	kmutex_t *phm = NULL;
	uint_t	index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	/* only page creation (PG_EXCL) is supported by this interface */
	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx", vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back. The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		/* hash in and initialize every page of the contiguous run */
		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			/* advance past the pages already obtained */
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);	/* back off briefly, then retry */
					goto top;
				}
				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mits and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(!VN_ISKAS(vp));
			if (VN_ISKAS(vp))
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
#endif	/* !__xpv */


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.
It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 *
 * Returns 1 on success, or 0 if a fault was taken during the copy
 * (see the on_fault() handling below).
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	hat_mempte_t	pte1;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex;
	label_t		ljb;
	int		ret = 1;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		/*
		 * Fast path: address both pages through kpm; no temporary
		 * mapping and no cpu_ppaddr_mutex needed.
		 */
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		/* serialize use of this CPU's private mapping addresses */
		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	if (use_sse_pagecopy)
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	no_fault();
faulted:
	/* release the private mappings iff we set them up above */
	if (!kpm_enable) {
#ifdef __xpv
		/*
		 * We can't leave unused mappings laying about under the
		 * hypervisor, so blow them away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}

/*
 * Zero 'len' bytes starting at byte offset 'off' within the (locked)
 * page 'pp', by delegating to pfnzero() on the page's frame number.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	ASSERT(PAGE_LOCKED(pp));
	pfnzero(page_pptonum(pp), off, len);
}

/*
 * Zero the physical page from off to off + len given by pfn
 * without changing the reference and modified bits of page.
 *
 * We use this using CPU private page address #2, see ppcopy() for more info.
 * pfnzero() must not be called at interrupt level.
 * The pfn may be "foreign" (not covered by kpm); in that case the
 * CPU-private caddr2 mapping is used under cpu_ppaddr_mutex.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;	/* set only on the non-kpm path */

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		/* kpm gives us an address directly; no remapping needed */
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * zero a byte at a time until properly aligned for
		 * block_zero_no_xmm().
		 */
		while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
			pp_addr2[off++] = 0;

		/*
		 * Now use faster block_zero_no_xmm() for any range
		 * that is properly aligned and sized.
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/*
		 * zero remainder with byte stores.
		 */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	/*
	 * Mirrors the mapping branch above: ppaddr_mutex is held exactly
	 * when we remapped caddr2, i.e. when this condition is true.
	 */
	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/*
		 * On the hypervisor this page might get used for a page
		 * table before any intervening change to this mapping,
		 * so blow it away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
39937c478bd9Sstevel@tonic-gate */ 39947c478bd9Sstevel@tonic-gate void 39957c478bd9Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len) 39967c478bd9Sstevel@tonic-gate { 39977c478bd9Sstevel@tonic-gate /* 39987c478bd9Sstevel@tonic-gate * For now, we rely on the fact that pagezero() will 39997c478bd9Sstevel@tonic-gate * always clear UEs. 40007c478bd9Sstevel@tonic-gate */ 40017c478bd9Sstevel@tonic-gate pagezero(pp, off, len); 40027c478bd9Sstevel@tonic-gate } 40037c478bd9Sstevel@tonic-gate 40047c478bd9Sstevel@tonic-gate /* 40057c478bd9Sstevel@tonic-gate * set up two private addresses for use on a given CPU for use in ppcopy() 40067c478bd9Sstevel@tonic-gate */ 40077c478bd9Sstevel@tonic-gate void 40087c478bd9Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup) 40097c478bd9Sstevel@tonic-gate { 40107c478bd9Sstevel@tonic-gate void *addr; 4011ae115bc7Smrj hat_mempte_t pte_pa; 40127c478bd9Sstevel@tonic-gate 40137c478bd9Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 4014ae115bc7Smrj pte_pa = hat_mempte_setup(addr); 40157c478bd9Sstevel@tonic-gate cpup->cpu_caddr1 = addr; 4016ae115bc7Smrj cpup->cpu_caddr1pte = pte_pa; 40177c478bd9Sstevel@tonic-gate 40187c478bd9Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 4019ae115bc7Smrj pte_pa = hat_mempte_setup(addr); 40207c478bd9Sstevel@tonic-gate cpup->cpu_caddr2 = addr; 4021ae115bc7Smrj cpup->cpu_caddr2pte = pte_pa; 40227c478bd9Sstevel@tonic-gate 40237c478bd9Sstevel@tonic-gate mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 40247c478bd9Sstevel@tonic-gate } 40257c478bd9Sstevel@tonic-gate 4026ae115bc7Smrj /* 4027ae115bc7Smrj * Undo setup_vaddr_for_ppcopy 4028ae115bc7Smrj */ 4029ae115bc7Smrj void 4030ae115bc7Smrj teardown_vaddr_for_ppcopy(struct cpu *cpup) 4031ae115bc7Smrj { 4032ae115bc7Smrj mutex_destroy(&cpup->cpu_ppaddr_mutex); 4033ae115bc7Smrj 4034ae115bc7Smrj hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 4035ae115bc7Smrj cpup->cpu_caddr2pte 
= 0; 4036ae115bc7Smrj vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 4037ae115bc7Smrj cpup->cpu_caddr2 = 0; 4038ae115bc7Smrj 4039ae115bc7Smrj hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 4040ae115bc7Smrj cpup->cpu_caddr1pte = 0; 4041ae115bc7Smrj vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 4042ae115bc7Smrj cpup->cpu_caddr1 = 0; 4043ae115bc7Smrj } 40447c478bd9Sstevel@tonic-gate 40457c478bd9Sstevel@tonic-gate /* 40467c478bd9Sstevel@tonic-gate * Function for flushing D-cache when performing module relocations 40477c478bd9Sstevel@tonic-gate * to an alternate mapping. Unnecessary on Intel / AMD platforms. 40487c478bd9Sstevel@tonic-gate */ 40497c478bd9Sstevel@tonic-gate void 40507c478bd9Sstevel@tonic-gate dcache_flushall() 40517c478bd9Sstevel@tonic-gate {} 4052102033aaSdp 4053ae115bc7Smrj /* 4054ae115bc7Smrj * Allocate a memory page. The argument 'seed' can be any pseudo-random 4055ae115bc7Smrj * number to vary where the pages come from. This is quite a hacked up 4056ae115bc7Smrj * method -- it works for now, but really needs to be fixed up a bit. 4057ae115bc7Smrj * 4058ae115bc7Smrj * We currently use page_create_va() on the kvp with fake offsets, 4059ae115bc7Smrj * segments and virt address. This is pretty bogus, but was copied from the 4060ae115bc7Smrj * old hat_i86.c code. A better approach would be to specify either mnode 4061ae115bc7Smrj * random or mnode local and takes a page from whatever color has the MOST 4062ae115bc7Smrj * available - this would have a minimal impact on page coloring. 4063ae115bc7Smrj */ 4064ae115bc7Smrj page_t * 4065a77271f8SVikram Hegde page_get_physical(uintptr_t seed) 4066ae115bc7Smrj { 4067ae115bc7Smrj page_t *pp; 4068a77271f8SVikram Hegde u_offset_t offset; 4069ae115bc7Smrj static struct seg tmpseg; 4070ae115bc7Smrj static uintptr_t ctr = 0; 4071ae115bc7Smrj 4072ae115bc7Smrj /* 4073ae115bc7Smrj * This code is gross, we really need a simpler page allocator. 
4074ae115bc7Smrj * 4075a77271f8SVikram Hegde * We need to assign an offset for the page to call page_create_va() 4076ae115bc7Smrj * To avoid conflicts with other pages, we get creative with the offset. 407786c1f4dcSVikram Hegde * For 32 bits, we need an offset > 4Gig 407886c1f4dcSVikram Hegde * For 64 bits, need an offset somewhere in the VA hole. 4079ae115bc7Smrj */ 4080a77271f8SVikram Hegde offset = seed; 4081a77271f8SVikram Hegde if (offset > kernelbase) 4082a77271f8SVikram Hegde offset -= kernelbase; 4083a77271f8SVikram Hegde offset <<= MMU_PAGESHIFT; 4084a77271f8SVikram Hegde #if defined(__amd64) 4085a77271f8SVikram Hegde offset += mmu.hole_start; /* something in VA hole */ 4086a77271f8SVikram Hegde #else 4087a77271f8SVikram Hegde offset += 1ULL << 40; /* something > 4 Gig */ 4088a77271f8SVikram Hegde #endif 4089a77271f8SVikram Hegde 4090a77271f8SVikram Hegde if (page_resv(1, KM_NOSLEEP) == 0) 4091ae115bc7Smrj return (NULL); 4092ae115bc7Smrj 4093ae115bc7Smrj #ifdef DEBUG 4094ae115bc7Smrj pp = page_exists(&kvp, offset); 4095ae115bc7Smrj if (pp != NULL) 4096903a11ebSrh panic("page already exists %p", (void *)pp); 4097ae115bc7Smrj #endif 4098ae115bc7Smrj 4099843e1988Sjohnlev pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 4100ae115bc7Smrj &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 410186c1f4dcSVikram Hegde if (pp != NULL) { 410286c1f4dcSVikram Hegde page_io_unlock(pp); 4103408a1f8eSVikram Hegde page_downgrade(pp); 410486c1f4dcSVikram Hegde } 4105ae115bc7Smrj return (pp); 4106ae115bc7Smrj } 4107