/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. */ /* * UNIX machine dependent virtual memory support. */ #ifndef _VM_DEP_H #define _VM_DEP_H #ifdef __cplusplus extern "C" { #endif #include #include #include #define GETTICK() gettick() /* tick value that should be used for random values */ extern u_longlong_t randtick(void); /* * Per page size free lists. Allocated dynamically. */ #define MAX_MEM_TYPES 2 /* 0 = reloc, 1 = noreloc */ #define MTYPE_RELOC 0 #define MTYPE_NORELOC 1 #define PP_2_MTYPE(pp) (PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC) #define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) \ mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC; /* mtype init for page_get_replacement_page */ #define MTYPE_PGR_INIT(mtype, flags, pp, pgcnt) \ mtype = (flags & PG_NORELOC) ? 
MTYPE_NORELOC : MTYPE_RELOC; #define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) \ pfnlo = mem_node_config[mnode].physbase; \ pfnhi = mem_node_config[mnode].physmax; /* * candidate counters in vm_pagelist.c are indexed by color and range */ #define MAX_MNODE_MRANGES MAX_MEM_TYPES #define MNODE_RANGE_CNT(mnode) MAX_MNODE_MRANGES #define MNODE_MAX_MRANGE(mnode) (MAX_MEM_TYPES - 1) #define MTYPE_2_MRANGE(mnode, mtype) (mtype) /* * Internal PG_ flags. */ #define PGI_RELOCONLY 0x10000 /* acts in the opposite sense to PG_NORELOC */ #define PGI_NOCAGE 0x20000 /* indicates Cage is disabled */ #define PGI_PGCPHIPRI 0x40000 /* page_get_contig_page priority allocation */ #define PGI_PGCPSZC0 0x80000 /* relocate base pagesize page */ /* * PGI mtype flags - should not overlap PGI flags */ #define PGI_MT_RANGE 0x1000000 /* mtype range */ #define PGI_MT_NEXT 0x2000000 /* get next mtype */ extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES]; extern page_t ***page_cachelists[MAX_MEM_TYPES]; #define PAGE_FREELISTS(mnode, szc, color, mtype) \ (*(page_freelists[szc][mtype][mnode] + (color))) #define PAGE_CACHELISTS(mnode, color, mtype) \ (*(page_cachelists[mtype][mnode] + (color))) /* * There are 'page_colors' colors/bins. Spread them out under a * couple of locks. There are mutexes for both the page freelist * and the page cachelist. We want enough locks to make contention * reasonable, but not too many -- otherwise page_freelist_lock() gets * so expensive that it becomes the bottleneck! */ #define NPC_MUTEX 16 extern kmutex_t *fpc_mutex[NPC_MUTEX]; extern kmutex_t *cpc_mutex[NPC_MUTEX]; /* * Iterator provides the info needed to convert RA to PA. * MEM_NODE_ITERATOR_INIT() should be called before * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous * PAGE_NEXT_PFN_FOR_COLOR() call. Iterator caches color 2 hash * translations requiring initializer call if color or ceq_mask changes, * even if pfn doesn't. 
MEM_NODE_ITERATOR_INIT() must also be called before
 * PFN_2_COLOR() that uses a valid iterator argument.
 *
 * plat_mem_node_iterator_init() starts from last mblock in continuation
 * case which may be invalid because memory DR.  To detect this situation
 * mi_genid is checked against mpo_genid which is incremented after a
 * memory DR operation.  See also plat_slice_add()/plat_slice_del().
 */
#ifdef	sun4v

/* Iteration state handed to plat_mem_node_iterator_init() on sun4v. */
typedef struct mem_node_iterator {
	uint_t mi_mnode;		/* mnode in which to iterate */
	int mi_init;			/* set to 1 when first init */
	int mi_genid;			/* set/checked against mpo_genid */
	int mi_last_mblock;		/* last mblock visited */
	uint_t mi_hash_ceq_mask;	/* cached copy of ceq_mask */
	uint_t mi_hash_color;		/* cached copy of color */
	uint_t mi_mnode_mask;		/* number of mask bits */
	uint_t mi_mnode_pfn_shift;	/* mnode position in pfn */
	pfn_t mi_mblock_base;		/* first valid pfn in current mblock */
	pfn_t mi_mblock_end;		/* last valid pfn in current mblock */
	pfn_t mi_ra_to_pa;		/* ra adjustment for current mblock */
	pfn_t mi_mnode_pfn_mask;	/* mask to obtain mnode id bits */
} mem_node_iterator_t;

/* Declare an iterator object named 'it' (no trailing semicolon supplied). */
#define	MEM_NODE_ITERATOR_DECL(it)					\
	mem_node_iterator_t it
/*
 * Overwrite pfn with the value returned by plat_mem_node_iterator_init();
 * the final argument is always passed as 1. Note that pfn is evaluated
 * twice (once as the assignment target, once as an argument).
 */
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)			\
	(pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)

extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
    mem_node_iterator_t *, int);
extern pfn_t plat_rapfn_to_papfn(pfn_t);
extern int interleaved_mnodes;

#else	/* sun4v */

/*
 * Without sun4v the iterator is just a NULL void pointer and the
 * initializer expands to nothing (pfn is left untouched).
 */
#define	MEM_NODE_ITERATOR_DECL(it)					\
	void *it = NULL
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)

#endif	/* sun4v */

/*
 * Return the mnode limits so that hpc_counters length and base
 * index can be determined. When interleaved_mnodes is set, we
 * create an array only for the first mnode that exists. All other
 * mnodes will share the array in this case.
 * If interleaved_mnodes is not set, simply return the limits for
 * the given mnode.
*/ #define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \ if (!interleaved_mnodes) { \ (physbase) = mem_node_config[(mnode)].physbase; \ (physmax) = mem_node_config[(mnode)].physmax; \ (first) = (mnode); \ } else if ((first) < 0) { \ mem_node_max_range(&(physbase), &(physmax)); \ (first) = (mnode); \ } #define PAGE_CTRS_WRITE_LOCK(mnode) \ if (!interleaved_mnodes) { \ rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER); \ page_freelist_lock(mnode); \ } else { \ /* changing shared hpm_counters */ \ int _i; \ for (_i = 0; _i < max_mem_nodes; _i++) { \ rw_enter(&page_ctrs_rwlock[_i], RW_WRITER); \ page_freelist_lock(_i); \ } \ } #define PAGE_CTRS_WRITE_UNLOCK(mnode) \ if (!interleaved_mnodes) { \ page_freelist_unlock(mnode); \ rw_exit(&page_ctrs_rwlock[(mnode)]); \ } else { \ int _i; \ for (_i = 0; _i < max_mem_nodes; _i++) { \ page_freelist_unlock(_i); \ rw_exit(&page_ctrs_rwlock[_i]); \ } \ } /* * cpu specific color conversion functions */ extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t); #pragma weak page_get_nsz_color_mask_cpu extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t); #pragma weak page_get_nsz_color_cpu extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t); #pragma weak page_get_color_shift_cpu extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t); #pragma weak page_convert_color_cpu extern pfn_t page_next_pfn_for_color_cpu(pfn_t, uchar_t, uint_t, uint_t, uint_t, void *); #pragma weak page_next_pfn_for_color_cpu extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t, void *); #pragma weak page_pfn_2_color_cpu #define PAGE_GET_COLOR_SHIFT(szc, nszc) \ ((&page_get_color_shift_cpu != NULL) ? \ page_get_color_shift_cpu(szc, nszc) : \ (hw_page_array[(nszc)].hp_shift - \ hw_page_array[(szc)].hp_shift)) #define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \ ((&page_convert_color_cpu != NULL) ? 
\ page_convert_color_cpu(ncolor, szc, nszc) : \ ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))) #define PFN_2_COLOR(pfn, szc, it) \ ((&page_pfn_2_color_cpu != NULL) ? \ page_pfn_2_color_cpu(pfn, szc, it) : \ ((pfn & (hw_page_array[0].hp_colors - 1)) >> \ (hw_page_array[szc].hp_shift - \ hw_page_array[0].hp_shift))) #define PNUM_SIZE(szc) \ (hw_page_array[(szc)].hp_pgcnt) #define PNUM_SHIFT(szc) \ (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) #define PAGE_GET_SHIFT(szc) \ (hw_page_array[(szc)].hp_shift) #define PAGE_GET_PAGECOLORS(szc) \ (hw_page_array[(szc)].hp_colors) /* * This macro calculates the next sequential pfn with the specified * color using color equivalency mask */ #define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \ { \ ASSERT(((color) & ~(ceq_mask)) == 0); \ if (&page_next_pfn_for_color_cpu == NULL) { \ uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \ pfn_t spfn = pfn >> pfn_shift; \ pfn_t stride = (ceq_mask) + 1; \ ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0); \ if (((spfn ^ (color)) & (ceq_mask)) == 0) { \ pfn += stride << pfn_shift; \ } else { \ pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color); \ pfn = (pfn > spfn ? pfn : pfn + stride) << \ pfn_shift; \ } \ } else { \ pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \ ceq_mask, color_mask, it); \ } \ } /* get the color equivalency mask for the next szc */ #define PAGE_GET_NSZ_MASK(szc, mask) \ ((&page_get_nsz_color_mask_cpu == NULL) ? \ ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \ page_get_nsz_color_mask_cpu(szc, mask)) /* get the color of the next szc */ #define PAGE_GET_NSZ_COLOR(szc, color) \ ((&page_get_nsz_color_cpu == NULL) ? 
\ ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \ page_get_nsz_color_cpu(szc, color)) /* Find the bin for the given page if it was of size szc */ #define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1))) #define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc)) #define PP_2_MEM_NODE(pp) (PFN_2_MEM_NODE(pp->p_pagenum)) #define PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ? \ &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] : \ &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode]) #define FPC_MUTEX(mnode, i) (&fpc_mutex[i][mnode]) #define CPC_MUTEX(mnode, i) (&cpc_mutex[i][mnode]) #define PFN_BASE(pfnum, szc) (pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1)) /* * this structure is used for walking free page lists * controls when to split large pages into smaller pages, * and when to coalesce smaller pages into larger pages */ typedef struct page_list_walker { uint_t plw_colors; /* num of colors for szc */ uint_t plw_color_mask; /* colors-1 */ uint_t plw_bin_step; /* next bin: 1 or 2 */ uint_t plw_count; /* loop count */ uint_t plw_bin0; /* starting bin */ uint_t plw_bin_marker; /* bin after initial jump */ uint_t plw_bin_split_prev; /* last bin we tried to split */ uint_t plw_do_split; /* set if OK to split */ uint_t plw_split_next; /* next bin to split */ uint_t plw_ceq_dif; /* number of different color groups */ /* to check */ uint_t plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */ uint_t plw_bins[MMU_PAGE_SIZES + 1]; /* num of bins */ } page_list_walker_t; void page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, int use_ceq, page_list_walker_t *plw); typedef char hpmctr_t; #ifdef DEBUG #define CHK_LPG(pp, szc) chk_lpg(pp, szc) extern void chk_lpg(page_t *, uchar_t); #else #define CHK_LPG(pp, szc) #endif /* * page list count per mnode and type. 
*/
typedef	struct {
	pgcnt_t	plc_mt_pgmax;		/* max page cnt */
	pgcnt_t plc_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t plc_mt_flpgcnt;		/* free list cnt - small pages */
	pgcnt_t plc_mt_lgpgcnt;		/* free list cnt - large pages */
#ifdef DEBUG
	struct {
		pgcnt_t plc_mts_pgcnt;	/* per page size count */
		int	plc_mts_colors;
		pgcnt_t	*plc_mtsc_pgcnt; /* per color bin count */
	} plc_mts[MMU_PAGE_SIZES];
#endif
} plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];

#ifdef DEBUG

/*
 * Add to ctrs_sz the bytes needed for the per-color-bin counters: for each
 * page size, max_mem_nodes * MAX_MEM_TYPES * colors pgcnt_t entries.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc;							\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		int	colors = page_get_pagecolors(szc);		\
		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
		    colors * sizeof (pgcnt_t));				\
	}								\
}

/*
 * Carve the per-color-bin counter arrays out of the memory at base. For
 * every (szc, mnode, mtype) the color count is recorded, plc_mtsc_pgcnt is
 * pointed at base, and base is advanced by colors * sizeof (pgcnt_t).
 * base is modified, so it must be an lvalue.
 */
#define	PLCNT_INIT(base) {						\
	int	mn, mt, szc, colors;					\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		for (mn = 0; mn < max_mem_nodes; mn++) {		\
			for (mt = 0; mt < MAX_MEM_TYPES; mt++) {	\
				plcnt[mn][mt].plc_mts[szc].		\
				    plc_mts_colors = colors;		\
				plcnt[mn][mt].plc_mts[szc].		\
				    plc_mtsc_pgcnt = (pgcnt_t *)base;	\
				base += (colors * sizeof (pgcnt_t));	\
			}						\
		}							\
	}								\
}

/*
 * Atomically add cnt to exactly one of the cache-list, large-page or
 * small-page counters (chosen by PG_CACHE_LIST in flags, then by szc being
 * non-zero), and additionally to the DEBUG-only per-size and per-bin counters.
 */
#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	int	bin = PP_2_BIN(pp);					\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,	\
	    cnt);							\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].			\
	    plc_mtsc_pgcnt[bin], cnt);					\
}

#else

/* Non-DEBUG builds keep no per-size/per-bin counters: these are no-ops. */
#define	PLCNT_SZ(ctrs_sz)

#define	PLCNT_INIT(base)

/* PG_FREE_LIST may not be explicitly set in flags for large pages */

#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
}

#endif

/* Apply PLCNT_DO() with cnt = 1 << PAGE_BSZS_SHIFT(szc). */
#define	PLCNT_INCR(pp, mn, mtype, szc, flags) {				\
	long	cnt = (1 << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}

/*
 * Apply PLCNT_DO() with cnt = ULONG_MAX << PAGE_BSZS_SHIFT(szc) stored in a
 * long. NOTE(review): presumably this is meant to be the negation of the
 * PLCNT_INCR() amount and relies on the unsigned-to-long conversion
 * wrapping -- confirm.
 */
#define	PLCNT_DECR(pp, mn, mtype, szc, flags) {				\
	long	cnt = ((ULONG_MAX) << PAGE_BSZS_SHIFT(szc));		\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}

/*
 * macros to update page list max counts - done when pages transferred
 * from RELOC to NORELOC mtype (kcage_init or kcage_assimilate_page).
 */
#define	PLCNT_XFER_NORELOC(pp) {					\
	long	cnt = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));		\
	int	mn = PP_2_MEM_NODE(pp);					\
	atomic_add_long(&plcnt[mn][MTYPE_NORELOC].plc_mt_pgmax, cnt);	\
	atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, -cnt);	\
}

/*
 * macro to modify the page list max counts when memory is added to
 * the page lists during startup (add_physmem) or during a DR operation
 * when memory is added (kphysm_add_memory_dynamic) or deleted
 * (kphysm_del_cleanup).
 *
 * The sign of cnt selects add versus subtract; its absolute value is the
 * length of the pfn range. When the weak plat_mem_node_intersect_range()
 * is present it supplies the per-mnode page count; otherwise the range is
 * walked mnode by mnode using mem_node_config[].physmax.
 */
#define	PLCNT_MODIFY_MAX(pfn, cnt) {					       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	pgcnt_t _acnt = ABS(_cnt);					       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range != NULL) {			       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
			if (_np == 0)					       \
				continue;				       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	} else {							       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _acnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	}								       \
}

/*
 * macro to call page_ctrs_adjust() when memory is added
 * during a DR operation.
 *
 * rv is set to 0 first and then to the first non-zero value returned by
 * page_ctrs_adjust(), at which point the walk stops.
 */
#define	PAGE_CTRS_ADJUST(pfn, cnt, rv) {				       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	int _mn;							       \
	pgcnt_t _np;							       \
	rv = 0;								       \
	if (&plat_mem_node_intersect_range != NULL) {			       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
			if (_np == 0)					       \
				continue;				       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	} else {							       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _cnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	}								       \
}

extern plcnt_t	plcnt;

/* Sum of cache-list, small free-list and large free-list counts, both mtypes. */
#define	MNODE_PGCNT(mn)							\
	(plcnt[mn][MTYPE_RELOC].plc_mt_clpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_clpgcnt +			\
	    plcnt[mn][MTYPE_RELOC].plc_mt_flpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_flpgcnt +			\
	    plcnt[mn][MTYPE_RELOC].plc_mt_lgpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_lgpgcnt)

/* Same sum restricted to a single mtype. */
#define	MNODETYPE_PGCNT(mn, mtype)					\
	(plcnt[mn][mtype].plc_mt_clpgcnt +				\
	    plcnt[mn][mtype].plc_mt_flpgcnt +				\
	    plcnt[mn][mtype].plc_mt_lgpgcnt)

/*
 * macros to loop through the mtype range - MTYPE_START returns -1 in
 * mtype if no pages in mnode/mtype and possibly NEXT mtype.
 *
 * NOTE(review): the ASSERT is only reached when plc_mt_pgmax == 0, so its
 * last disjunct (plc_mt_pgmax != 0) can never be true at that point.
 */
#define	MTYPE_START(mnode, mtype, flags) {				\
	if (plcnt[mnode][mtype].plc_mt_pgmax == 0) {			\
		ASSERT(mtype == MTYPE_RELOC ||				\
		    MNODETYPE_PGCNT(mnode, mtype) == 0 ||		\
		    plcnt[mnode][mtype].plc_mt_pgmax != 0);		\
		MTYPE_NEXT(mnode, mtype, flags);			\
	}								\
}

/*
 * if allocation from the RELOC pool failed and there is sufficient cage
 * memory, attempt to allocate from the NORELOC pool.
*/ #define MTYPE_NEXT(mnode, mtype, flags) { \ if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && \ (kcage_freemem >= kcage_lotsfree)) { \ if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) { \ ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 || \ plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0); \ mtype = -1; \ } else { \ mtype = MTYPE_NORELOC; \ flags |= PG_NORELOC; \ } \ } else { \ mtype = -1; \ } \ } /* * get the ecache setsize for the current cpu. */ #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize) extern struct cpu cpu0; #define CPU0 &cpu0 #define PAGE_BSZS_SHIFT(szc) TTE_BSZS_SHIFT(szc) /* * For sfmmu each larger page is 8 times the size of the previous * size page. */ #define FULL_REGION_CNT(rg_szc) (8) /* * The counter base must be per page_counter element to prevent * races when re-indexing, and the base page size element should * be aligned on a boundary of the given region size. * * We also round up the number of pages spanned by the counters * for a given region to PC_BASE_ALIGN in certain situations to simplify * the coding for some non-performance critical routines. */ #define PC_BASE_ALIGN ((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1)) #define PC_BASE_ALIGN_MASK (PC_BASE_ALIGN - 1) extern int ecache_alignsize; #define L2CACHE_ALIGN ecache_alignsize #define L2CACHE_ALIGN_MAX 512 extern int update_proc_pgcolorbase_after_fork; extern int consistent_coloring; extern uint_t vac_colors_mask; extern int vac_size; extern int vac_shift; /* * Kernel mem segment in 64-bit space */ extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end; extern int kmem64_alignsize, kmem64_szc; extern uint64_t kmem64_pabase; extern int max_bootlp_tteszc; /* * Maximum and default values for user heap, stack, private and shared * anonymous memory, and user text and initialized data. * * Initial values are defined in architecture specific mach_vm_dep.c file. * Used by map_pgsz*() routines. 
*/ extern size_t max_uheap_lpsize; extern size_t default_uheap_lpsize; extern size_t max_ustack_lpsize; extern size_t default_ustack_lpsize; extern size_t max_privmap_lpsize; extern size_t max_uidata_lpsize; extern size_t max_utext_lpsize; extern size_t max_shm_lpsize; /* * For adjusting the default lpsize, for DTLB-limited page sizes. */ extern void adjust_data_maxlpsize(size_t ismpagesize); /* * Sanity control. Don't use large pages regardless of user * settings if there's less than priv or shm_lpg_min_physmem memory installed. * The units for this variable are 8K pages. */ extern pgcnt_t privm_lpg_min_physmem; extern pgcnt_t shm_lpg_min_physmem; /* * AS_2_BIN macro controls the page coloring policy. * 0 (default) uses various vaddr bits * 1 virtual=paddr * 2 bin hopping */ #define AS_2_BIN(as, seg, vp, addr, bin, szc) \ switch (consistent_coloring) { \ default: \ cmn_err(CE_WARN, \ "AS_2_BIN: bad consistent coloring value"); \ /* assume default algorithm -> continue */ \ /* FALLTHROUGH */ \ case 0: { \ uint32_t ndx, new; \ int slew = 0; \ pfn_t pfn; \ \ if (vp != NULL && IS_SWAPVP(vp) && \ seg->s_ops == &segvn_ops) \ slew = as_color_bin(as); \ \ pfn = ((uintptr_t)addr >> MMU_PAGESHIFT) + \ (((uintptr_t)addr >> page_coloring_shift) << \ (vac_shift - MMU_PAGESHIFT)); \ if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) { \ pfn += slew; \ bin = PFN_2_COLOR(pfn, szc, NULL); \ } else { \ bin = PFN_2_COLOR(pfn, szc, NULL); \ bin += slew >> (vac_shift - MMU_PAGESHIFT); \ bin &= hw_page_array[(szc)].hp_colors - 1; \ } \ break; \ } \ case 1: \ bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \ szc, NULL); \ break; \ case 2: { \ int cnt = as_color_bin(as); \ uint_t color_mask = page_get_pagecolors(0) - 1; \ \ /* make sure physical color aligns with vac color */ \ while ((cnt & vac_colors_mask) != \ addr_to_vcolor(addr)) { \ cnt++; \ } \ bin = cnt = cnt & color_mask; \ bin >>= PAGE_GET_COLOR_SHIFT(0, szc); \ /* update per as page coloring fields */ \ cnt = (cnt + 1) & 
color_mask; \ if (cnt == (as_color_start(as) & color_mask)) { \ cnt = as_color_start(as) = as_color_start(as) + \ PGCLR_LOOPFACTOR; \ } \ as_color_bin(as) = cnt & color_mask; \ break; \ } \ } \ ASSERT(bin < page_get_pagecolors(szc)); /* * cpu private vm data - accessed thru CPU->cpu_vm_data * vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock() * vc_pnext_memseg: tracks last memseg visited in page_nextn() * vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t * vc_kmsize: orignal kmem size for this vm_cpu_data_t */ typedef struct { struct memseg *vc_pnum_memseg; struct memseg *vc_pnext_memseg; void *vc_kmptr; size_t vc_kmsize; } vm_cpu_data_t; /* allocation size to ensure vm_cpu_data_t resides in its own cache line */ #define VM_CPU_DATA_PADSIZE \ (P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX)) /* * Function to get an ecache color bin: F(as, cnt, vcolor). * the goal of this function is to: * - to spread a processes' physical pages across the entire ecache to * maximize its use. * - to minimize vac flushes caused when we reuse a physical page on a * different vac color than it was previously used. * - to prevent all processes to use the same exact colors and trash each * other. * * cnt is a bin ptr kept on a per as basis. As we page_create we increment * the ptr so we spread out the physical pages to cover the entire ecache. * The virtual color is made a subset of the physical color in order to * in minimize virtual cache flushing. * We add in the as to spread out different as. This happens when we * initialize the start count value. * sizeof(struct as) is 60 so we shift by 3 to get into the bit range * that will tend to change. For example, on spitfire based machines * (vcshft == 1) contigous as are spread bu ~6 bins. * vcshft provides for proper virtual color alignment. * In theory cnt should be updated using cas only but if we are off by one * or 2 it is no big deal. 
* We also keep a start value which is used to randomize on what bin we * start counting when it is time to start another loop. This avoids * contigous allocations of ecache size to point to the same bin. * Why 3? Seems work ok. Better than 7 or anything larger. */ #define PGCLR_LOOPFACTOR 3 /* * When a bin is empty, and we can't satisfy a color request correctly, * we scan. If we assume that the programs have reasonable spatial * behavior, then it will not be a good idea to use the adjacent color. * Using the adjacent color would result in virtually adjacent addresses * mapping into the same spot in the cache. So, if we stumble across * an empty bin, skip a bunch before looking. After the first skip, * then just look one bin at a time so we don't miss our cache on * every look. Be sure to check every bin. Page_create() will panic * if we miss a page. * * This also explains the `<=' in the for loops in both page_get_freelist() * and page_get_cachelist(). Since we checked the target bin, skipped * a bunch, then continued one a time, we wind up checking the target bin * twice to make sure we get all of them bins. 
*/ #define BIN_STEP 20 #ifdef VM_STATS struct vmm_vmstats_str { ulong_t pgf_alloc[MMU_PAGE_SIZES]; /* page_get_freelist */ ulong_t pgf_allocok[MMU_PAGE_SIZES]; ulong_t pgf_allocokrem[MMU_PAGE_SIZES]; ulong_t pgf_allocfailed[MMU_PAGE_SIZES]; ulong_t pgf_allocdeferred; ulong_t pgf_allocretry[MMU_PAGE_SIZES]; ulong_t pgc_alloc; /* page_get_cachelist */ ulong_t pgc_allocok; ulong_t pgc_allocokrem; ulong_t pgc_allocokdeferred; ulong_t pgc_allocfailed; ulong_t pgcp_alloc[MMU_PAGE_SIZES]; /* page_get_contig_pages */ ulong_t pgcp_allocfailed[MMU_PAGE_SIZES]; ulong_t pgcp_allocempty[MMU_PAGE_SIZES]; ulong_t pgcp_allocok[MMU_PAGE_SIZES]; ulong_t ptcp[MMU_PAGE_SIZES]; /* page_trylock_contig_pages */ ulong_t ptcpfreethresh[MMU_PAGE_SIZES]; ulong_t ptcpfailexcl[MMU_PAGE_SIZES]; ulong_t ptcpfailszc[MMU_PAGE_SIZES]; ulong_t ptcpfailcage[MMU_PAGE_SIZES]; ulong_t ptcpok[MMU_PAGE_SIZES]; ulong_t pgmf_alloc[MMU_PAGE_SIZES]; /* page_get_mnode_freelist */ ulong_t pgmf_allocfailed[MMU_PAGE_SIZES]; ulong_t pgmf_allocempty[MMU_PAGE_SIZES]; ulong_t pgmf_allocok[MMU_PAGE_SIZES]; ulong_t pgmc_alloc; /* page_get_mnode_cachelist */ ulong_t pgmc_allocfailed; ulong_t pgmc_allocempty; ulong_t pgmc_allocok; ulong_t pladd_free[MMU_PAGE_SIZES]; /* page_list_add/sub */ ulong_t plsub_free[MMU_PAGE_SIZES]; ulong_t pladd_cache; ulong_t plsub_cache; ulong_t plsubpages_szcbig; ulong_t plsubpages_szc0; ulong_t pfs_req[MMU_PAGE_SIZES]; /* page_freelist_split */ ulong_t pfs_demote[MMU_PAGE_SIZES]; ulong_t pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; ulong_t ppr_reloc[MMU_PAGE_SIZES]; /* page_relocate */ ulong_t ppr_relocok[MMU_PAGE_SIZES]; ulong_t ppr_relocnoroot[MMU_PAGE_SIZES]; ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES]; ulong_t ppr_relocnolock[MMU_PAGE_SIZES]; ulong_t ppr_relocnomem[MMU_PAGE_SIZES]; ulong_t ppr_krelocfail[MMU_PAGE_SIZES]; ulong_t ppr_copyfail; /* page coalesce counter */ ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; /* candidates useful */ ulong_t 
page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; /* ctrs changed after locking */ ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; /* page_freelist_coalesce failed */ ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; ulong_t page_ctrs_coalesce_all; /* page coalesce all counter */ ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */ }; extern struct vmm_vmstats_str vmm_vmstats; #endif /* VM_STATS */ /* * Used to hold off page relocations into the cage until OBP has completed * its boot-time handoff of its resources to the kernel. */ extern int page_relocate_ready; /* * cpu/mmu-dependent vm variables may be reset at bootup. */ extern uint_t mmu_page_sizes; extern uint_t max_mmu_page_sizes; extern uint_t mmu_hashcnt; extern uint_t max_mmu_hashcnt; extern size_t mmu_ism_pagesize; extern int mmu_exported_pagesize_mask; extern uint_t mmu_exported_page_sizes; extern uint_t szc_2_userszc[]; extern uint_t userszc_2_szc[]; #define mmu_legacy_page_sizes mmu_exported_page_sizes #define USERSZC_2_SZC(userszc) (userszc_2_szc[userszc]) #define SZC_2_USERSZC(szc) (szc_2_userszc[szc]) /* * Platform specific page routines */ extern void mach_page_add(page_t **, page_t *); extern void mach_page_sub(page_t **, page_t *); extern uint_t page_get_pagecolors(uint_t); extern void ppcopy_kernel__relocatable(page_t *, page_t *); #define ppcopy_kernel(p1, p2) ppcopy_kernel__relocatable(p1, p2) /* * platform specific large pages for kernel heap support */ extern size_t get_segkmem_lpsize(size_t lpsize); extern size_t mmu_get_kernel_lpsize(size_t lpsize); extern void mmu_init_kernel_pgsz(struct hat *hat); extern void mmu_init_kcontext(); extern uint64_t kcontextreg; /* * Nucleus data page allocator routines */ extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t); extern void *ndata_alloc(struct memlist *, size_t, size_t); extern void *ndata_extra_base(struct memlist *, size_t, caddr_t); extern size_t ndata_maxsize(struct memlist 
*); extern size_t ndata_spare(struct memlist *, size_t, size_t); #ifdef __cplusplus } #endif #endif /* _VM_DEP_H */