/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include /* for page_freelist_coalesce() */ #include #include #include #include #include #include #include #include #include #include #include /* for installed_top_size() */ #include /* for CV_HAS_WAITERS() */ #include /* for dump_resize() */ #include /* for use in stats collection */ #include #include #include #include #include #include #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ #include #include #include #include #include #include extern struct memlist *phys_avail; extern uint_t page_ctrs_adjust(int); void page_ctrs_cleanup(void); static void kphysm_setup_post_add(pgcnt_t); static int kphysm_setup_pre_del(pgcnt_t); static void kphysm_setup_post_del(pgcnt_t, int); static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); static int delspan_reserve(pfn_t, pgcnt_t); static void delspan_unreserve(pfn_t, pgcnt_t); kmutex_t memseg_lists_lock; struct memseg *memseg_va_avail; struct memseg *memseg_alloc(void); static struct memseg *memseg_delete_junk; static struct memseg 
*memseg_edit_junk; void memseg_remap_init(void); static void memseg_remap_to_dummy(struct memseg *); static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); static struct memseg *memseg_reuse(pgcnt_t); static struct kmem_cache *memseg_cache; /* * Interfaces to manage externally allocated * page_t memory (metadata) for a memseg. */ #pragma weak memseg_alloc_meta #pragma weak memseg_free_meta #pragma weak memseg_get_metapfn #pragma weak memseg_remap_meta extern int ppvm_enable; extern page_t *ppvm_base; extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); extern void memseg_free_meta(void *, pgcnt_t); extern pfn_t memseg_get_metapfn(void *, pgcnt_t); extern void memseg_remap_meta(struct memseg *); static int memseg_is_dynamic(struct memseg *); static int memseg_includes_meta(struct memseg *); pfn_t memseg_get_start(struct memseg *); static void memseg_cpu_vm_flush(void); int meta_alloc_enable; #ifdef DEBUG static int memseg_debug; #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) #else #define MEMSEG_DEBUG(...) #endif /* * Add a chunk of memory to the system. * base: starting PAGESIZE page of new memory. * npgs: length in PAGESIZE pages. * * Adding mem this way doesn't increase the size of the hash tables; * growing them would be too hard. This should be OK, but adding memory * dynamically most likely means more hash misses, since the tables will * be smaller than they otherwise would be. 
*/
/*
 * Outline of kphysm_add_memory_dynamic():
 *   1. Reserve the span on the add/delete transit list and add it to
 *      phys_install (this is the interlock against concurrent add/delete).
 *   2. Decide where the page_t metadata lives: externally allocated via
 *      memseg_alloc_meta() when meta_alloc_enable is set and that call
 *      succeeds, otherwise carved from the first pages of the new span.
 *   3. Map the metadata pages, verify they are accessible, register the
 *      span with the memory node and page counter code and with phys_avail.
 *   4. Build (or re-use) a memseg, initialise and lock every page_t,
 *      publish the memseg, then free the pages and update the global
 *      memory counters.
 *
 * Returns:
 *   KPHYSM_OK		the memory was added.
 *   KPHYSM_ESPAN	the span could not be reserved, or memlist_add_span()
 *			reported MEML_SPANOP_ESPAN for phys_install.
 *   KPHYSM_ENOTVIABLE	kpm is enabled and the span violates the kpm
 *			size/alignment constraints.
 *   KPHYSM_EFAULT	the mapped page_t array could not be read.
 *   KPHYSM_ERESOURCE	any other failure (memlist allocation, span too
 *			small, no VA for the page_ts, page counter resize).
 *
 * Every failure after phys_install has been updated goes through
 * kphysm_addmem_error_undospan(), which also drops the reservation.
 */
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t *opp, *oepp, *segpp;
	struct memseg *seg;
	uint64_t avmem;
	pfn_t pfn;
	/* pt_base/tpgs keep the caller's span; base/npgs are adjusted below. */
	pfn_t pt_base = base;
	pgcnt_t tpgs = npgs;
	pgcnt_t metapgs = 0;
	int exhausted;
	pfn_t pnum;
	int mnode;
	caddr_t vaddr;
	int reuse;
	int mlret;
	int rv;
	int flags;
	int meta_alloc = 0;
	void *mapva;
	void *metabase = (void *)base;
	pgcnt_t nkpmpgs = 0;
	offset_t kpm_pages_off = 0;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}
	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	/*
	 * phys_install was not changed on failure, so only the
	 * reservation has to be dropped here.
	 */
	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}

	if (meta_alloc_enable) {
		/*
		 * Allocate the page_t's from existing memory;
		 * if that fails, allocate from the incoming memory.
		 */
		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
		if (rv == KPHYSM_OK) {
			ASSERT(metapgs);
			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
			meta_alloc = 1;
			goto mapalloc;
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/* From here on base/npgs describe only the usable (non-metadata) pages. */
	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	exhausted = (metapgs == 0 || npgs == 0);

	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		/*
		 * Recompute metapgs so that it also covers the kpm
		 * bookkeeping (KPMPAGE_T_SZ bytes per kpm page), then
		 * re-derive base/npgs from the original span.
		 */
		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);
		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

mapalloc:
	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		/* Externally allocated metadata supplies its own pfn per page. */
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	/* Probe the first word of the mapping before trusting it. */
	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 *
	 * Note that there may be multiple memory nodes associated with
	 * a single lgrp node on x86 systems.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);

	/* PAGE_CTRS_ADJUST stores its result in rv; non-zero is failure. */
	PAGE_CTRS_ADJUST(base, npgs, rv);

	if (rv) {

		mem_node_del_range(pt_base, pnum);

		/* cleanup the page counters */
		page_ctrs_cleanup();

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA. The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches. We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
	 */
	if (seg == NULL) {
		seg = memseg_alloc();
		ASSERT(seg != NULL);
		seg->msegflags = flags;
		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		seg->pages = segpp;
	} else {
		ASSERT(seg->msegflags == flags);
		ASSERT(seg->pages_base == seg->pages_end);
		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		/*
		 * A re-used META_INCL memseg keeps its existing seg->pages;
		 * the page_ts are remapped onto that VA further down.
		 */
		if (meta_alloc) {
			memseg_cpu_vm_flush();
			seg->pages = segpp;
		}
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		/* Retry until the exclusive lock is obtained. */
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			if (meta_alloc)
				pfn = memseg_get_metapfn(metabase,
				    (pgcnt_t)pnum);
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		/* The temporary mapping at mapva is no longer needed. */
		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	/* seg->next is set, and the barrier issued, before memsegs is updated. */
	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock();

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *   physmax - highest page frame number.
	 *   physinstalled - number of pages currently installed (done earlier)
	 *   maxmem - max free pages in the system
	 *   physmem - physical memory pages available
	 *   availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/*
	 * Inform DDI of update
	 */
	ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT);

	/* The add is complete; release the reservation taken at entry. */
	delspan_unreserve(pt_base, tpgs);

	return (KPHYSM_OK);		/* Successfully added system memory */
}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
*/ static void kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) { int mlret; /* Unreserve memory span. */ memlist_write_lock(); mlret = memlist_delete_span( (uint64_t)(pt_base) << PAGESHIFT, (uint64_t)(tpgs) << PAGESHIFT, &phys_install); ASSERT(mlret == MEML_SPANOP_OK); phys_install_has_changed(); installed_top_size(phys_install, &physmax, &physinstalled); memlist_write_unlock(); delspan_unreserve(pt_base, tpgs); } /* * Only return an available memseg of exactly the right size * if size is required. * When the meta data area has it's own virtual address space * we will need to manage this more carefully and do best fit * allocations, possibly splitting an available area. */ struct memseg * memseg_reuse(pgcnt_t metapgs) { int type; struct memseg **segpp, *seg; mutex_enter(&memseg_lists_lock); segpp = &memseg_va_avail; for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { caddr_t end; /* * Make sure we are reusing the right segment type. */ type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC; if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC)) != type) continue; if (kpm_enable) end = hat_kpm_mseg_reuse(seg); else end = (caddr_t)seg->epages; /* * Check for the right size if it is provided. 
*/ if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) { *segpp = seg->lnext; seg->lnext = NULL; break; } } mutex_exit(&memseg_lists_lock); return (seg); } static uint_t handle_gen; struct memdelspan { struct memdelspan *mds_next; pfn_t mds_base; pgcnt_t mds_npgs; uint_t *mds_bitmap; uint_t *mds_bitmap_retired; }; #define NBPBMW (sizeof (uint_t) * NBBY) #define MDS_BITMAPBYTES(MDSP) \ ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) struct transit_list { struct transit_list *trl_next; struct memdelspan *trl_spans; int trl_collect; }; struct transit_list_head { kmutex_t trh_lock; struct transit_list *trh_head; }; static struct transit_list_head transit_list_head; struct mem_handle; static void transit_list_collect(struct mem_handle *, int); static void transit_list_insert(struct transit_list *); static void transit_list_remove(struct transit_list *); #ifdef DEBUG #define MEM_DEL_STATS #endif /* DEBUG */ #ifdef MEM_DEL_STATS static int mem_del_stat_print = 0; struct mem_del_stat { uint_t nloop; uint_t need_free; uint_t free_loop; uint_t free_low; uint_t free_failed; uint_t ncheck; uint_t nopaget; uint_t lockfail; uint_t nfree; uint_t nreloc; uint_t nrelocfail; uint_t already_done; uint_t first_notfree; uint_t npplocked; uint_t nlockreloc; uint_t nnorepl; uint_t nmodreloc; uint_t ndestroy; uint_t nputpage; uint_t nnoreclaim; uint_t ndelay; uint_t demotefail; uint64_t nticks_total; uint64_t nticks_pgrp; uint_t retired; uint_t toxic; uint_t failing; uint_t modtoxic; uint_t npplkdtoxic; uint_t gptlmodfail; uint_t gptllckfail; }; /* * The stat values are only incremented in the delete thread * so no locking or atomic required. 
*/ #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) static void mem_del_stat_print_func(struct mem_handle *); #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) #else /* MEM_DEL_STATS */ #define MDSTAT_INCR(MHP, FLD) #define MDSTAT_TOTAL(MHP, ntck) #define MDSTAT_PGRP(MHP, ntck) #define MDSTAT_PRINT(MHP) #endif /* MEM_DEL_STATS */ typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; /* * mh_mutex must be taken to examine or change mh_exthandle and mh_state. * The mutex may not be required for other fields, dependent on mh_state. */ struct mem_handle { kmutex_t mh_mutex; struct mem_handle *mh_next; memhandle_t mh_exthandle; mhnd_state_t mh_state; struct transit_list mh_transit; pgcnt_t mh_phys_pages; pgcnt_t mh_vm_pages; pgcnt_t mh_hold_todo; void (*mh_delete_complete)(void *, int error); void *mh_delete_complete_arg; volatile uint_t mh_cancel; volatile uint_t mh_dr_aio_cleanup_cancel; volatile uint_t mh_aio_cleanup_done; kcondvar_t mh_cv; kthread_id_t mh_thread_id; page_t *mh_deleted; /* link through p_next */ #ifdef MEM_DEL_STATS struct mem_del_stat mh_delstat; #endif /* MEM_DEL_STATS */ }; static struct mem_handle *mem_handle_head; static kmutex_t mem_handle_list_mutex; static struct mem_handle * kphysm_allocate_mem_handle() { struct mem_handle *mhp; mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mem_handle_list_mutex); mutex_enter(&mhp->mh_mutex); /* handle_gen is protected by list mutex. 
*/ mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); mhp->mh_next = mem_handle_head; mem_handle_head = mhp; mutex_exit(&mem_handle_list_mutex); return (mhp); } static void kphysm_free_mem_handle(struct mem_handle *mhp) { struct mem_handle **mhpp; ASSERT(mutex_owned(&mhp->mh_mutex)); ASSERT(mhp->mh_state == MHND_FREE); /* * Exit the mutex to preserve locking order. This is OK * here as once in the FREE state, the handle cannot * be found by a lookup. */ mutex_exit(&mhp->mh_mutex); mutex_enter(&mem_handle_list_mutex); mhpp = &mem_handle_head; while (*mhpp != NULL && *mhpp != mhp) mhpp = &(*mhpp)->mh_next; ASSERT(*mhpp == mhp); /* * No need to lock the handle (mh_mutex) as only * mh_next changing and this is the only thread that * can be referncing mhp. */ *mhpp = mhp->mh_next; mutex_exit(&mem_handle_list_mutex); mutex_destroy(&mhp->mh_mutex); kmem_free(mhp, sizeof (struct mem_handle)); } /* * This function finds the internal mem_handle corresponding to an * external handle and returns it with the mh_mutex held. */ static struct mem_handle * kphysm_lookup_mem_handle(memhandle_t handle) { struct mem_handle *mhp; mutex_enter(&mem_handle_list_mutex); for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { if (mhp->mh_exthandle == handle) { mutex_enter(&mhp->mh_mutex); /* * The state of the handle could have been changed * by kphysm_del_release() while waiting for mh_mutex. */ if (mhp->mh_state == MHND_FREE) { mutex_exit(&mhp->mh_mutex); continue; } break; } } mutex_exit(&mem_handle_list_mutex); return (mhp); } int kphysm_del_gethandle(memhandle_t *xmhp) { struct mem_handle *mhp; mhp = kphysm_allocate_mem_handle(); /* * The handle is allocated using KM_SLEEP, so cannot fail. * If the implementation is changed, the correct error to return * here would be KPHYSM_ENOHANDLES. 
*/ ASSERT(mhp->mh_state == MHND_FREE); mhp->mh_state = MHND_INIT; *xmhp = mhp->mh_exthandle; mutex_exit(&mhp->mh_mutex); return (KPHYSM_OK); } static int overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) { pfn_t e1, e2; e1 = b1 + l1; e2 = b2 + l2; return (!(b2 >= e1 || b1 >= e2)); } static int can_remove_pgs(pgcnt_t); static struct memdelspan * span_to_install(pfn_t base, pgcnt_t npgs) { struct memdelspan *mdsp; struct memdelspan *mdsp_new; uint64_t address, size, thislen; struct memlist *mlp; mdsp_new = NULL; address = (uint64_t)base << PAGESHIFT; size = (uint64_t)npgs << PAGESHIFT; while (size != 0) { memlist_read_lock(); for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) { if (address >= (mlp->ml_address + mlp->ml_size)) continue; if ((address + size) > mlp->ml_address) break; } if (mlp == NULL) { address += size; size = 0; thislen = 0; } else { if (address < mlp->ml_address) { size -= (mlp->ml_address - address); address = mlp->ml_address; } ASSERT(address >= mlp->ml_address); if ((address + size) > (mlp->ml_address + mlp->ml_size)) { thislen = mlp->ml_size - (address - mlp->ml_address); } else { thislen = size; } } memlist_read_unlock(); /* TODO: phys_install could change now */ if (thislen == 0) continue; mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); mdsp->mds_base = btop(address); mdsp->mds_npgs = btop(thislen); mdsp->mds_next = mdsp_new; mdsp_new = mdsp; address += thislen; size -= thislen; } return (mdsp_new); } static void free_delspans(struct memdelspan *mdsp) { struct memdelspan *amdsp; while ((amdsp = mdsp) != NULL) { mdsp = amdsp->mds_next; kmem_free(amdsp, sizeof (struct memdelspan)); } } /* * Concatenate lists. No list ordering is required. 
*/ static void delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) { while (*mdspp != NULL) mdspp = &(*mdspp)->mds_next; *mdspp = mdsp; } /* * Given a new list of delspans, check there is no overlap with * all existing span activity (add or delete) and then concatenate * the new spans to the given list. * Return 1 for OK, 0 if overlapping. */ static int delspan_insert( struct transit_list *my_tlp, struct memdelspan *mdsp_new) { struct transit_list_head *trh; struct transit_list *tlp; int ret; trh = &transit_list_head; ASSERT(my_tlp != NULL); ASSERT(mdsp_new != NULL); ret = 1; mutex_enter(&trh->trh_lock); /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { struct memdelspan *mdsp; for (mdsp = tlp->trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { struct memdelspan *nmdsp; for (nmdsp = mdsp_new; nmdsp != NULL; nmdsp = nmdsp->mds_next) { if (overlapping(mdsp->mds_base, mdsp->mds_npgs, nmdsp->mds_base, nmdsp->mds_npgs)) { ret = 0; goto done; } } } } done: if (ret != 0) { if (my_tlp->trl_spans == NULL) transit_list_insert(my_tlp); delspan_concat(&my_tlp->trl_spans, mdsp_new); } mutex_exit(&trh->trh_lock); return (ret); } static void delspan_remove( struct transit_list *my_tlp, pfn_t base, pgcnt_t npgs) { struct transit_list_head *trh; struct memdelspan *mdsp; trh = &transit_list_head; ASSERT(my_tlp != NULL); mutex_enter(&trh->trh_lock); if ((mdsp = my_tlp->trl_spans) != NULL) { if (npgs == 0) { my_tlp->trl_spans = NULL; free_delspans(mdsp); transit_list_remove(my_tlp); } else { struct memdelspan **prv; prv = &my_tlp->trl_spans; while (mdsp != NULL) { pfn_t p_end; p_end = mdsp->mds_base + mdsp->mds_npgs; if (mdsp->mds_base >= base && p_end <= (base + npgs)) { *prv = mdsp->mds_next; mdsp->mds_next = NULL; free_delspans(mdsp); } else { prv = &mdsp->mds_next; } mdsp = *prv; } if (my_tlp->trl_spans == NULL) transit_list_remove(my_tlp); } } mutex_exit(&trh->trh_lock); } /* * Reserve 
interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions, which take the trh_lock mutex in the global
 * transit_list_head, and so it is fully protected by that mutex.
 */

static struct transit_list reserve_transit;

/*
 * Reserve the span [base, base + npgs) on behalf of an add operation.
 * Returns the result of delspan_insert(): 1 when the span is now
 * reserved, 0 when it overlaps a span that is already active (in which
 * case the span allocated here is freed again).
 */
static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	int ret;

	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
	mdsp->mds_base = base;
	mdsp->mds_npgs = npgs;
	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
		free_delspans(mdsp);
	}
	return (ret);
}

/*
 * Drop a reservation made by delspan_reserve(). Note that
 * delspan_remove() treats npgs == 0 as "remove every span".
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}

/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 * The value is the MEMSEG_DYNAMIC bit itself, not a normalised 0/1.
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_DYNAMIC);
}

/*
 * Add the page range [base, base + npgs) to the set of spans to be
 * deleted under the given handle, and check that it can be deleted.
 *
 * Returns:
 *   KPHYSM_EHANDLE	no live handle matches.
 *   KPHYSM_ESEQUENCE	the handle is not in the MHND_INIT state.
 *   KPHYSM_OK		the span was accepted, or the range contains no
 *			installed memory (nothing is recorded then).
 *   KPHYSM_EDUP	the range overlaps a span already on this handle.
 *   KPHYSM_EBUSY	the range overlaps other add/delete activity, or a
 *			memseg that includes its own metadata is not wholly
 *			inside the span.
 *   KPHYSM_ERESOURCE	a required memseg split failed.
 *   KPHYSM_ENONRELOC	a page is marked PP_ISNORELOC, or the memsegs do
 *			not cover every page of a span.
 *
 * On any error detected after the spans were inserted, the spans within
 * the range are removed from the handle again.
 */
int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	/* On success the lookup returns with mhp->mh_mutex held. */
	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		/* The insert failed, so the new spans are still ours to free. */
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	/* The totals are recomputed from scratch after every memseg split. */
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			mseg_start = memseg_get_start(seg);
			/* Check that segment is suitable for delete. */
			if (memseg_includes_meta(seg)) {
				/*
				 * Check that this segment is completely
				 * within the span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required. */
					/* abase/anpgs: the part of seg inside the span. */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			/* Cage test. */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		/* Pages of the span not covered by any memseg are refused. */
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
		 */
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}

/*
 * Report on the page range [base, base + npgs) without changing any
 * delete state. The result is written to *mqp:
 *   phys_pages		pages of the range that are in phys_install.
 *   managed		pages of the range that have a page_t in a memseg.
 *   nonrelocatable	pages that are PP_ISNORELOC, or that lie outside
 *			any memseg and are refused by
 *			arch_kphysm_del_span_ok().
 *   first_nonrelocatable, last_nonrelocatable
 *			pfns of the first and last such page found
 *			(both stay 0 when none is found).
 * Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		/* sbase/snpgs track the part of the span still to be examined. */
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				mseg_start = memseg_get_start(seg);
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 * (mseg_start is only read here when seg != NULL.)
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/* Page-by-page test; skipped if the gap was OK. */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	/* The span list was only needed for this query. */
	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}

/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 *
 * Returns:
 *   KPHYSM_OK		the handle was released and freed.
 *   KPHYSM_EHANDLE	no live handle matches, or its state is invalid.
 *   KPHYSM_ENOTFINISHED the delete is starting or running.
 *   KPHYSM_ESEQUENCE	a release of this handle is already in progress.
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		/* The lookup never returns a FREE handle, so this always trips. */
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
	 */
	mutex_exit(&mhp->mh_mutex);

	/* npgs == 0 removes every span belonging to this handle. */
	delspan_remove(&mhp->mh_transit, 0, 0);
	mhp->mh_phys_pages = 0;
	mhp->mh_vm_pages = 0;
	mhp->mh_hold_todo = 0;
	mhp->mh_delete_complete = NULL;
	mhp->mh_delete_complete_arg = NULL;
	mhp->mh_cancel = 0;

	/* kphysm_free_mem_handle() expects mh_mutex held and state FREE. */
	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_RELEASE);
	mhp->mh_state = MHND_FREE;

	kphysm_free_mem_handle(mhp);

	return (KPHYSM_OK);
}

/*
 * This cancel function can only be called with the thread running.
*/ int kphysm_del_cancel(memhandle_t handle) { struct mem_handle *mhp; mhp = kphysm_lookup_mem_handle(handle); if (mhp == NULL) { return (KPHYSM_EHANDLE); } if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { mutex_exit(&mhp->mh_mutex); return (KPHYSM_ENOTRUNNING); } /* * Set the cancel flag and wake the delete thread up. * The thread may be waiting on I/O, so the effect of the cancel * may be delayed. */ if (mhp->mh_cancel == 0) { mhp->mh_cancel = KPHYSM_ECANCELLED; cv_signal(&mhp->mh_cv); } mutex_exit(&mhp->mh_mutex); return (KPHYSM_OK); } int kphysm_del_status( memhandle_t handle, memdelstat_t *mdstp) { struct mem_handle *mhp; mhp = kphysm_lookup_mem_handle(handle); if (mhp == NULL) { return (KPHYSM_EHANDLE); } /* * Calling kphysm_del_status() is allowed before the delete * is started to allow for status display. */ if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { mutex_exit(&mhp->mh_mutex); return (KPHYSM_ENOTRUNNING); } mdstp->phys_pages = mhp->mh_phys_pages; mdstp->managed = mhp->mh_vm_pages; mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; mutex_exit(&mhp->mh_mutex); return (KPHYSM_OK); } static int mem_delete_additional_pages = 100; static int can_remove_pgs(pgcnt_t npgs) { /* * If all pageable pages were paged out, freemem would * equal availrmem. There is a minimum requirement for * availrmem. */ if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) < npgs) return (0); /* TODO: check swap space, etc. 
*/ return (1); } static int get_availrmem(pgcnt_t npgs) { int ret; mutex_enter(&freemem_lock); ret = can_remove_pgs(npgs); if (ret != 0) availrmem -= npgs; mutex_exit(&freemem_lock); return (ret); } static void put_availrmem(pgcnt_t npgs) { mutex_enter(&freemem_lock); availrmem += npgs; mutex_exit(&freemem_lock); } #define FREEMEM_INCR 100 static pgcnt_t freemem_incr = FREEMEM_INCR; #define DEL_FREE_WAIT_FRAC 4 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) #define DEL_BUSY_WAIT_FRAC 20 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) static void kphysm_del_cleanup(struct mem_handle *); static void page_delete_collect(page_t *, struct mem_handle *); static pgcnt_t delthr_get_freemem(struct mem_handle *mhp) { pgcnt_t free_get; int ret; ASSERT(MUTEX_HELD(&mhp->mh_mutex)); MDSTAT_INCR(mhp, need_free); /* * Get up to freemem_incr pages. */ free_get = freemem_incr; if (free_get > mhp->mh_hold_todo) free_get = mhp->mh_hold_todo; /* * Take free_get pages away from freemem, * waiting if necessary. */ while (!mhp->mh_cancel) { mutex_exit(&mhp->mh_mutex); MDSTAT_INCR(mhp, free_loop); /* * Duplicate test from page_create_throttle() * but don't override with !PG_WAIT. */ if (freemem < (free_get + throttlefree)) { MDSTAT_INCR(mhp, free_low); ret = 0; } else { ret = page_create_wait(free_get, 0); if (ret == 0) { /* EMPTY */ MDSTAT_INCR(mhp, free_failed); } } if (ret != 0) { mutex_enter(&mhp->mh_mutex); return (free_get); } /* * Put pressure on pageout. */ page_needfree(free_get); WAKE_PAGEOUT_SCANNER(delthr); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK); mutex_exit(&mhp->mh_mutex); page_needfree(-(spgcnt_t)free_get); mutex_enter(&mhp->mh_mutex); } return (0); } #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 /* * This function is run as a helper thread for delete_memory_thread. 
* It is needed in order to force kaio cleanup, so that pages used in kaio * will be unlocked and subsequently relocated by delete_memory_thread. * The address of the delete_memory_threads's mem_handle is passed in to * this thread function, and is used to set the mh_aio_cleanup_done member * prior to calling thread_exit(). */ static void dr_aio_cleanup_thread(caddr_t amhp) { proc_t *procp; int (*aio_cleanup_dr_delete_memory)(proc_t *); int cleaned; int n = 0; struct mem_handle *mhp; volatile uint_t *pcancel; mhp = (struct mem_handle *)amhp; ASSERT(mhp != NULL); pcancel = &mhp->mh_dr_aio_cleanup_cancel; if (modload("sys", "kaio") == -1) { mhp->mh_aio_cleanup_done = 1; cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); thread_exit(); } aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) modgetsymvalue("aio_cleanup_dr_delete_memory", 0); if (aio_cleanup_dr_delete_memory == NULL) { mhp->mh_aio_cleanup_done = 1; cmn_err(CE_WARN, "aio_cleanup_dr_delete_memory not found in kaio"); thread_exit(); } do { cleaned = 0; mutex_enter(&pidlock); for (procp = practive; (*pcancel == 0) && (procp != NULL); procp = procp->p_next) { mutex_enter(&procp->p_lock); if (procp->p_aio != NULL) { /* cleanup proc's outstanding kaio */ cleaned += (*aio_cleanup_dr_delete_memory)(procp); } mutex_exit(&procp->p_lock); } mutex_exit(&pidlock); if ((*pcancel == 0) && (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { /* delay a bit before retrying all procs again */ delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); n = 0; } } while (*pcancel == 0); mhp->mh_aio_cleanup_done = 1; thread_exit(); } static void delete_memory_thread(caddr_t amhp) { struct mem_handle *mhp; struct memdelspan *mdsp; callb_cpr_t cprinfo; page_t *pp_targ; spgcnt_t freemem_left; void (*del_complete_funcp)(void *, int error); void *del_complete_arg; int comp_code; int ret; int first_scan; uint_t szc; #ifdef MEM_DEL_STATS uint64_t start_total, ntick_total; uint64_t start_pgrp, ntick_pgrp; #endif /* MEM_DEL_STATS */ mhp = 
(struct mem_handle *)amhp; #ifdef MEM_DEL_STATS start_total = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, callb_generic_cpr, "memdel"); mutex_enter(&mhp->mh_mutex); ASSERT(mhp->mh_state == MHND_STARTING); mhp->mh_state = MHND_RUNNING; mhp->mh_thread_id = curthread; mhp->mh_hold_todo = mhp->mh_vm_pages; mutex_exit(&mhp->mh_mutex); /* Allocate the remap pages now, if necessary. */ memseg_remap_init(); /* * Subtract from availrmem now if possible as availrmem * may not be available by the end of the delete. */ if (!get_availrmem(mhp->mh_vm_pages)) { comp_code = KPHYSM_ENOTVIABLE; mutex_enter(&mhp->mh_mutex); goto early_exit; } ret = kphysm_setup_pre_del(mhp->mh_vm_pages); mutex_enter(&mhp->mh_mutex); if (ret != 0) { mhp->mh_cancel = KPHYSM_EREFUSED; goto refused; } transit_list_collect(mhp, 1); for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { ASSERT(mdsp->mds_bitmap == NULL); mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); } first_scan = 1; freemem_left = 0; /* * Start dr_aio_cleanup_thread, which periodically iterates * through the process list and invokes aio cleanup. This * is needed in order to avoid a deadly embrace between the * delete_memory_thread (waiting on writer lock for page, with the * exclusive-wanted bit set), kaio read request threads (waiting for a * reader lock on the same page that is wanted by the * delete_memory_thread), and threads waiting for kaio completion * (blocked on spt_amp->lock). 
*/ mhp->mh_dr_aio_cleanup_cancel = 0; mhp->mh_aio_cleanup_done = 0; (void) thread_create(NULL, 0, dr_aio_cleanup_thread, (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { pgcnt_t collected; MDSTAT_INCR(mhp, nloop); collected = 0; for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { pfn_t pfn, p_end; p_end = mdsp->mds_base + mdsp->mds_npgs; for (pfn = mdsp->mds_base; (pfn < p_end) && (mhp->mh_cancel == 0); pfn++) { page_t *pp, *tpp, *tpp_targ; pgcnt_t bit; struct vnode *vp; u_offset_t offset; int mod, result; spgcnt_t pgcnt; bit = pfn - mdsp->mds_base; if ((mdsp->mds_bitmap[bit / NBPBMW] & (1 << (bit % NBPBMW))) != 0) { MDSTAT_INCR(mhp, already_done); continue; } if (freemem_left == 0) { freemem_left += delthr_get_freemem(mhp); if (freemem_left == 0) break; } /* * Release mh_mutex - some of this * stuff takes some time (eg PUTPAGE). */ mutex_exit(&mhp->mh_mutex); MDSTAT_INCR(mhp, ncheck); pp = page_numtopp_nolock(pfn); if (pp == NULL) { /* * Not covered by a page_t - will * be dealt with elsewhere. */ MDSTAT_INCR(mhp, nopaget); mutex_enter(&mhp->mh_mutex); mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW)); continue; } if (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED | SE_RETIRED)) { /* * Page in use elsewhere. Skip it. */ MDSTAT_INCR(mhp, lockfail); mutex_enter(&mhp->mh_mutex); continue; } /* * See if the cage expanded into the delete. * This can happen as we have to allow the * cage to expand. */ if (PP_ISNORELOC(pp)) { page_unlock(pp); mutex_enter(&mhp->mh_mutex); mhp->mh_cancel = KPHYSM_ENONRELOC; break; } if (PP_RETIRED(pp)) { /* * Page has been retired and is * not part of the cage so we * can now do the accounting for * it. 
*/ MDSTAT_INCR(mhp, retired); mutex_enter(&mhp->mh_mutex); mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW)); mdsp->mds_bitmap_retired[bit / NBPBMW] |= (1 << (bit % NBPBMW)); mhp->mh_hold_todo--; continue; } ASSERT(freemem_left != 0); if (PP_ISFREE(pp)) { /* * Like page_reclaim() only 'freemem' * processing is already done. */ MDSTAT_INCR(mhp, nfree); free_page_collect: if (PP_ISAGED(pp)) { page_list_sub(pp, PG_FREE_LIST); } else { page_list_sub(pp, PG_CACHE_LIST); } PP_CLRFREE(pp); PP_CLRAGED(pp); collected++; mutex_enter(&mhp->mh_mutex); page_delete_collect(pp, mhp); mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW)); freemem_left--; continue; } ASSERT(pp->p_vnode != NULL); if (first_scan) { MDSTAT_INCR(mhp, first_notfree); page_unlock(pp); mutex_enter(&mhp->mh_mutex); continue; } /* * Keep stats on pages encountered that * are marked for retirement. */ if (PP_TOXIC(pp)) { MDSTAT_INCR(mhp, toxic); } else if (PP_PR_REQ(pp)) { MDSTAT_INCR(mhp, failing); } /* * In certain cases below, special exceptions * are made for pages that are toxic. This * is because the current meaning of toxic * is that an uncorrectable error has been * previously associated with the page. */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { if (!PP_TOXIC(pp)) { /* * Must relocate locked in * memory pages. */ #ifdef MEM_DEL_STATS start_pgrp = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ /* * Lock all constituent pages * of a large page to ensure * that p_szc won't change. 
*/ if (!group_page_trylock(pp, SE_EXCL)) { MDSTAT_INCR(mhp, gptllckfail); page_unlock(pp); mutex_enter( &mhp->mh_mutex); continue; } MDSTAT_INCR(mhp, npplocked); pp_targ = page_get_replacement_page( pp, NULL, 0); if (pp_targ != NULL) { #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t) ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); MDSTAT_INCR(mhp, nlockreloc); goto reloc; } group_page_unlock(pp); page_unlock(pp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); MDSTAT_INCR(mhp, nnorepl); mutex_enter(&mhp->mh_mutex); continue; } else { /* * Cannot do anything about * this page because it is * toxic. */ MDSTAT_INCR(mhp, npplkdtoxic); page_unlock(pp); mutex_enter(&mhp->mh_mutex); continue; } } /* * Unload the mappings and check if mod bit * is set. */ ASSERT(!PP_ISKAS(pp)); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); mod = hat_ismod(pp); #ifdef MEM_DEL_STATS start_pgrp = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ if (mod && !PP_TOXIC(pp)) { /* * Lock all constituent pages * of a large page to ensure * that p_szc won't change. */ if (!group_page_trylock(pp, SE_EXCL)) { MDSTAT_INCR(mhp, gptlmodfail); page_unlock(pp); mutex_enter(&mhp->mh_mutex); continue; } pp_targ = page_get_replacement_page(pp, NULL, 0); if (pp_targ != NULL) { MDSTAT_INCR(mhp, nmodreloc); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); goto reloc; } group_page_unlock(pp); } if (!page_try_demote_pages(pp)) { MDSTAT_INCR(mhp, demotefail); page_unlock(pp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); mutex_enter(&mhp->mh_mutex); continue; } /* * Regular 'page-out'. */ if (!mod) { MDSTAT_INCR(mhp, ndestroy); page_destroy(pp, 1); /* * page_destroy was called with * dontfree. 
As long as p_lckcnt * and p_cowcnt are both zero, the * only additional action of * page_destroy with !dontfree is to * call page_free, so we can collect * the page here. */ collected++; #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); mutex_enter(&mhp->mh_mutex); page_delete_collect(pp, mhp); mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW)); continue; } /* * The page is toxic and the mod bit is * set, we cannot do anything here to deal * with it. */ if (PP_TOXIC(pp)) { page_unlock(pp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); MDSTAT_INCR(mhp, modtoxic); mutex_enter(&mhp->mh_mutex); continue; } MDSTAT_INCR(mhp, nputpage); vp = pp->p_vnode; offset = pp->p_offset; VN_HOLD(vp); page_unlock(pp); (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL|B_FORCE, kcred, NULL); VN_RELE(vp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); /* * Try to get the page back immediately * so that it can be collected. */ pp = page_numtopp_nolock(pfn); if (pp == NULL) { MDSTAT_INCR(mhp, nnoreclaim); /* * This should not happen as this * thread is deleting the page. * If this code is generalized, this * becomes a reality. */ #ifdef DEBUG cmn_err(CE_WARN, "delete_memory_thread(0x%p) " "pfn 0x%lx has no page_t", (void *)mhp, pfn); #endif /* DEBUG */ mutex_enter(&mhp->mh_mutex); continue; } if (page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED | SE_RETIRED)) { if (PP_ISFREE(pp)) { goto free_page_collect; } page_unlock(pp); } MDSTAT_INCR(mhp, nnoreclaim); mutex_enter(&mhp->mh_mutex); continue; reloc: /* * Got some freemem and a target * page, so move the data to avoid * I/O and lock problems. */ ASSERT(!page_iolock_assert(pp)); MDSTAT_INCR(mhp, nreloc); /* * page_relocate() will return pgcnt: the * number of consecutive pages relocated. 
* If it is successful, pp will be a * linked list of the page structs that * were relocated. If page_relocate() is * unsuccessful, pp will be unmodified. */ #ifdef MEM_DEL_STATS start_pgrp = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ result = page_relocate(&pp, &pp_targ, 0, 0, &pgcnt, NULL); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - start_pgrp; #endif /* MEM_DEL_STATS */ MDSTAT_PGRP(mhp, ntick_pgrp); if (result != 0) { MDSTAT_INCR(mhp, nrelocfail); /* * We did not succeed. We need * to give the pp_targ pages back. * page_free(pp_targ, 1) without * the freemem accounting. */ group_page_unlock(pp); page_free_replacement_page(pp_targ); page_unlock(pp); mutex_enter(&mhp->mh_mutex); continue; } /* * We will then collect pgcnt pages. */ ASSERT(pgcnt > 0); mutex_enter(&mhp->mh_mutex); /* * We need to make sure freemem_left is * large enough. */ while ((freemem_left < pgcnt) && (!mhp->mh_cancel)) { freemem_left += delthr_get_freemem(mhp); } /* * Do not proceed if mh_cancel is set. */ if (mhp->mh_cancel) { while (pp_targ != NULL) { /* * Unlink and unlock each page. */ tpp_targ = pp_targ; page_sub(&pp_targ, tpp_targ); page_unlock(tpp_targ); } /* * We need to give the pp pages back. * page_free(pp, 1) without the * freemem accounting. */ page_free_replacement_page(pp); break; } /* Now remove pgcnt from freemem_left */ freemem_left -= pgcnt; ASSERT(freemem_left >= 0); szc = pp->p_szc; while (pp != NULL) { /* * pp and pp_targ were passed back as * a linked list of pages. * Unlink and unlock each page. */ tpp_targ = pp_targ; page_sub(&pp_targ, tpp_targ); page_unlock(tpp_targ); /* * The original page is now free * so remove it from the linked * list and collect it. 
*/ tpp = pp; page_sub(&pp, tpp); pfn = page_pptonum(tpp); collected++; ASSERT(PAGE_EXCL(tpp)); ASSERT(tpp->p_vnode == NULL); ASSERT(!hat_page_is_mapped(tpp)); ASSERT(tpp->p_szc == szc); tpp->p_szc = 0; page_delete_collect(tpp, mhp); bit = pfn - mdsp->mds_base; mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW)); } ASSERT(pp_targ == NULL); } } first_scan = 0; if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && (collected == 0)) { /* * This code is needed as we cannot wait * for a page to be locked OR the delete to * be cancelled. Also, we must delay so * that other threads get a chance to run * on our cpu, otherwise page locks may be * held indefinitely by those threads. */ MDSTAT_INCR(mhp, ndelay); CALLB_CPR_SAFE_BEGIN(&cprinfo); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK); CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); } } /* stop the dr aio cleanup thread */ mhp->mh_dr_aio_cleanup_cancel = 1; transit_list_collect(mhp, 0); if (freemem_left != 0) { /* Return any surplus. */ page_create_putback(freemem_left); freemem_left = 0; } #ifdef MEM_DEL_STATS ntick_total = (uint64_t)ddi_get_lbolt() - start_total; #endif /* MEM_DEL_STATS */ MDSTAT_TOTAL(mhp, ntick_total); MDSTAT_PRINT(mhp); /* * If the memory delete was cancelled, exclusive-wanted bits must * be cleared. If there are retired pages being deleted, they need * to be unretired. */ for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { pfn_t pfn, p_end; p_end = mdsp->mds_base + mdsp->mds_npgs; for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { page_t *pp; pgcnt_t bit; bit = pfn - mdsp->mds_base; if (mhp->mh_cancel) { pp = page_numtopp_nolock(pfn); if (pp != NULL) { if ((mdsp->mds_bitmap[bit / NBPBMW] & (1 << (bit % NBPBMW))) == 0) { page_lock_clr_exclwanted(pp); } } } else { pp = NULL; } if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & (1 << (bit % NBPBMW))) != 0) { /* do we already have pp? 
*/ if (pp == NULL) { pp = page_numtopp_nolock(pfn); } ASSERT(pp != NULL); ASSERT(PP_RETIRED(pp)); if (mhp->mh_cancel != 0) { page_unlock(pp); /* * To satisfy ASSERT below in * cancel code. */ mhp->mh_hold_todo++; } else { (void) page_unretire_pp(pp, PR_UNR_CLEAN); } } } } /* * Free retired page bitmap and collected page bitmap */ for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { ASSERT(mdsp->mds_bitmap_retired != NULL); kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ ASSERT(mdsp->mds_bitmap != NULL); kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); mdsp->mds_bitmap = NULL; /* Paranoia. */ } /* wait for our dr aio cancel thread to exit */ while (!(mhp->mh_aio_cleanup_done)) { CALLB_CPR_SAFE_BEGIN(&cprinfo); delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); } refused: if (mhp->mh_cancel != 0) { page_t *pp; comp_code = mhp->mh_cancel; /* * Go through list of deleted pages (mh_deleted) freeing * them. */ while ((pp = mhp->mh_deleted) != NULL) { mhp->mh_deleted = pp->p_next; mhp->mh_hold_todo++; mutex_exit(&mhp->mh_mutex); /* Restore p_next. */ pp->p_next = pp->p_prev; if (PP_ISFREE(pp)) { cmn_err(CE_PANIC, "page %p is free", (void *)pp); } page_free(pp, 1); mutex_enter(&mhp->mh_mutex); } ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); mutex_exit(&mhp->mh_mutex); put_availrmem(mhp->mh_vm_pages); mutex_enter(&mhp->mh_mutex); goto t_exit; } /* * All the pages are no longer in use and are exclusively locked. */ mhp->mh_deleted = NULL; kphysm_del_cleanup(mhp); /* * mem_node_del_range needs to be after kphysm_del_cleanup so * that the mem_node_config[] will remain intact for the cleanup. 
*/ for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { mem_node_del_range(mdsp->mds_base, mdsp->mds_base + mdsp->mds_npgs - 1); } /* cleanup the page counters */ page_ctrs_cleanup(); comp_code = KPHYSM_OK; t_exit: mutex_exit(&mhp->mh_mutex); kphysm_setup_post_del(mhp->mh_vm_pages, (comp_code == KPHYSM_OK) ? 0 : 1); mutex_enter(&mhp->mh_mutex); early_exit: /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ mhp->mh_state = MHND_DONE; del_complete_funcp = mhp->mh_delete_complete; del_complete_arg = mhp->mh_delete_complete_arg; CALLB_CPR_EXIT(&cprinfo); (*del_complete_funcp)(del_complete_arg, comp_code); thread_exit(); /*NOTREACHED*/ } /* * Start the delete of the memory from the system. */ int kphysm_del_start( memhandle_t handle, void (*complete)(void *, int), void *complete_arg) { struct mem_handle *mhp; mhp = kphysm_lookup_mem_handle(handle); if (mhp == NULL) { return (KPHYSM_EHANDLE); } switch (mhp->mh_state) { case MHND_FREE: ASSERT(mhp->mh_state != MHND_FREE); mutex_exit(&mhp->mh_mutex); return (KPHYSM_EHANDLE); case MHND_INIT: break; case MHND_STARTING: case MHND_RUNNING: mutex_exit(&mhp->mh_mutex); return (KPHYSM_ESEQUENCE); case MHND_DONE: mutex_exit(&mhp->mh_mutex); return (KPHYSM_ESEQUENCE); case MHND_RELEASE: mutex_exit(&mhp->mh_mutex); return (KPHYSM_ESEQUENCE); default: #ifdef DEBUG cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", (void *)mhp, mhp->mh_state); #endif /* DEBUG */ mutex_exit(&mhp->mh_mutex); return (KPHYSM_EHANDLE); } if (mhp->mh_transit.trl_spans == NULL) { mutex_exit(&mhp->mh_mutex); return (KPHYSM_ENOWORK); } ASSERT(complete != NULL); mhp->mh_delete_complete = complete; mhp->mh_delete_complete_arg = complete_arg; mhp->mh_state = MHND_STARTING; /* * Release the mutex in case thread_create sleeps. */ mutex_exit(&mhp->mh_mutex); /* * The "obvious" process for this thread is pageout (proc_pageout) * but this gives the thread too much power over freemem * which results in freemem starvation. 
*/ (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, TS_RUN, maxclsyspri - 1); return (KPHYSM_OK); } static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ static caddr_t pp_dummy; static pgcnt_t pp_dummy_npages; static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ static void memseg_remap_init_pages(page_t *pages, page_t *epages) { page_t *pp; for (pp = pages; pp < epages; pp++) { pp->p_pagenum = PFN_INVALID; /* XXXX */ pp->p_offset = (u_offset_t)-1; page_iolock_init(pp); while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) continue; page_lock_delete(pp); } } void memseg_remap_init() { mutex_enter(&pp_dummy_lock); if (pp_dummy == NULL) { uint_t dpages; int i; /* * dpages starts off as the size of the structure and * ends up as the minimum number of pages that will * hold a whole number of page_t structures. */ dpages = sizeof (page_t); ASSERT(dpages != 0); ASSERT(dpages <= MMU_PAGESIZE); while ((dpages & 1) == 0) dpages >>= 1; pp_dummy_npages = dpages; /* * Allocate pp_dummy pages directly from static_arena, * since these are whole page allocations and are * referenced by physical address. This also has the * nice fringe benefit of hiding the memory from * ::findleaks since it doesn't deal well with allocated * kernel heap memory that doesn't have any mappings. */ pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); bzero(pp_dummy, ptob(pp_dummy_npages)); ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * pp_dummy_npages, KM_SLEEP); for (i = 0; i < pp_dummy_npages; i++) { pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, &pp_dummy[MMU_PAGESIZE * i]); ASSERT(pp_dummy_pfn[i] != PFN_INVALID); } /* * Initialize the page_t's to a known 'deleted' state * that matches the state of deleted pages. */ memseg_remap_init_pages((page_t *)pp_dummy, (page_t *)(pp_dummy + ptob(pp_dummy_npages))); /* Remove kmem mappings for the pages for safety. 
*/ hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), HAT_UNLOAD_UNLOCK); /* Leave pp_dummy pointer set as flag that init is done. */ } mutex_exit(&pp_dummy_lock); } /* * Remap a page-aglined range of page_t's to dummy pages. */ void remap_to_dummy(caddr_t va, pgcnt_t metapgs) { int phase; ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE)); /* * We may start remapping at a non-zero page offset * within the dummy pages since the low/high ends * of the outgoing pp's could be shared by other * memsegs (see memseg_remap_meta). */ phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages; /*CONSTCOND*/ ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); while (metapgs != 0) { pgcnt_t n; int i, j; n = pp_dummy_npages; if (n > metapgs) n = metapgs; for (i = 0; i < n; i++) { j = (i + phase) % pp_dummy_npages; hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j], PROT_READ, HAT_LOAD | HAT_LOAD_NOCONSIST | HAT_LOAD_REMAP); va += ptob(1); } metapgs -= n; } } static void memseg_remap_to_dummy(struct memseg *seg) { caddr_t pp; pgcnt_t metapgs; ASSERT(memseg_is_dynamic(seg)); ASSERT(pp_dummy != NULL); if (!memseg_includes_meta(seg)) { memseg_remap_meta(seg); return; } pp = (caddr_t)seg->pages; metapgs = seg->pages_base - memseg_get_start(seg); ASSERT(metapgs != 0); seg->pages_end = seg->pages_base; remap_to_dummy(pp, metapgs); } /* * Transition all the deleted pages to the deleted state so that * page_lock will not wait. The page_lock_delete call will * also wake up any waiters. */ static void memseg_lock_delete_all(struct memseg *seg) { page_t *pp; for (pp = seg->pages; pp < seg->epages; pp++) { pp->p_pagenum = PFN_INVALID; /* XXXX */ page_lock_delete(pp); } } static void kphysm_del_cleanup(struct mem_handle *mhp) { struct memdelspan *mdsp; struct memseg *seg; struct memseg **segpp; struct memseg *seglist; pfn_t p_end; uint64_t avmem; pgcnt_t avpgs; pgcnt_t npgs; avpgs = mhp->mh_vm_pages; memsegs_lock(1); /* * remove from main segment list. 
*/ npgs = 0; seglist = NULL; for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { p_end = mdsp->mds_base + mdsp->mds_npgs; for (segpp = &memsegs; (seg = *segpp) != NULL; ) { if (seg->pages_base >= p_end || seg->pages_end <= mdsp->mds_base) { /* Span and memseg don't overlap. */ segpp = &((*segpp)->next); continue; } ASSERT(seg->pages_base >= mdsp->mds_base); ASSERT(seg->pages_end <= p_end); PLCNT_MODIFY_MAX(seg->pages_base, seg->pages_base - seg->pages_end); /* Hide the memseg from future scans. */ hat_kpm_delmem_mseg_update(seg, segpp); *segpp = seg->next; membar_producer(); /* TODO: Needed? */ npgs += MSEG_NPAGES(seg); /* * Leave the deleted segment's next pointer intact * in case a memsegs scanning loop is walking this * segment concurrently. */ seg->lnext = seglist; seglist = seg; } } build_pfn_hash(); ASSERT(npgs < total_pages); total_pages -= npgs; /* * Recalculate the paging parameters now total_pages has changed. * This will also cause the clock hands to be reset before next use. */ setupclock(); memsegs_unlock(1); mutex_exit(&mhp->mh_mutex); while ((seg = seglist) != NULL) { pfn_t mseg_start; pfn_t mseg_base, mseg_end; pgcnt_t mseg_npgs; int mlret; seglist = seg->lnext; /* * Put the page_t's into the deleted state to stop * cv_wait()s on the pages. When we remap, the dummy * page_t's will be in the same state. */ memseg_lock_delete_all(seg); /* * Collect up information based on pages_base and pages_end * early so that we can flag early that the memseg has been * deleted by setting pages_end == pages_base. */ mseg_base = seg->pages_base; mseg_end = seg->pages_end; mseg_npgs = MSEG_NPAGES(seg); mseg_start = memseg_get_start(seg); if (memseg_is_dynamic(seg)) { /* Remap the meta data to our special dummy area. 
*/ memseg_remap_to_dummy(seg); mutex_enter(&memseg_lists_lock); seg->lnext = memseg_va_avail; memseg_va_avail = seg; mutex_exit(&memseg_lists_lock); } else { /* * For memory whose page_ts were allocated * at boot, we need to find a new use for * the page_t memory. * For the moment, just leak it. * (It is held in the memseg_delete_junk list.) */ seg->pages_end = seg->pages_base; mutex_enter(&memseg_lists_lock); seg->lnext = memseg_delete_junk; memseg_delete_junk = seg; mutex_exit(&memseg_lists_lock); } /* Must not use seg now as it could be re-used. */ memlist_write_lock(); mlret = memlist_delete_span( (uint64_t)(mseg_base) << PAGESHIFT, (uint64_t)(mseg_npgs) << PAGESHIFT, &phys_avail); ASSERT(mlret == MEML_SPANOP_OK); mlret = memlist_delete_span( (uint64_t)(mseg_start) << PAGESHIFT, (uint64_t)(mseg_end - mseg_start) << PAGESHIFT, &phys_install); ASSERT(mlret == MEML_SPANOP_OK); phys_install_has_changed(); memlist_write_unlock(); } memlist_read_lock(); installed_top_size(phys_install, &physmax, &physinstalled); memlist_read_unlock(); mutex_enter(&freemem_lock); maxmem -= avpgs; physmem -= avpgs; /* availrmem is adjusted during the delete. 
*/ availrmem_initial -= avpgs; mutex_exit(&freemem_lock); dump_resize(); cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " "(0x%" PRIx64 ")\n", physinstalled << (PAGESHIFT - 10), (uint64_t)physinstalled << PAGESHIFT); avmem = (uint64_t)freemem << PAGESHIFT; cmn_err(CE_CONT, "?kphysm_delete: " "avail mem = %" PRId64 "\n", avmem); /* * Update lgroup generation number on single lgroup systems */ if (nlgrps == 1) lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); /* Successfully deleted system memory */ mutex_enter(&mhp->mh_mutex); } static uint_t mdel_nullvp_waiter; static void page_delete_collect( page_t *pp, struct mem_handle *mhp) { if (pp->p_vnode) { page_hashout(pp, (kmutex_t *)NULL); /* do not do PP_SETAGED(pp); */ } else { kmutex_t *sep; sep = page_se_mutex(pp); mutex_enter(sep); if (CV_HAS_WAITERS(&pp->p_cv)) { mdel_nullvp_waiter++; cv_broadcast(&pp->p_cv); } mutex_exit(sep); } ASSERT(pp->p_next == pp->p_prev); ASSERT(pp->p_next == NULL || pp->p_next == pp); pp->p_next = mhp->mh_deleted; mhp->mh_deleted = pp; ASSERT(mhp->mh_hold_todo != 0); mhp->mh_hold_todo--; } static void transit_list_collect(struct mem_handle *mhp, int v) { struct transit_list_head *trh; trh = &transit_list_head; mutex_enter(&trh->trh_lock); mhp->mh_transit.trl_collect = v; mutex_exit(&trh->trh_lock); } static void transit_list_insert(struct transit_list *tlp) { struct transit_list_head *trh; trh = &transit_list_head; ASSERT(MUTEX_HELD(&trh->trh_lock)); tlp->trl_next = trh->trh_head; trh->trh_head = tlp; } static void transit_list_remove(struct transit_list *tlp) { struct transit_list_head *trh; struct transit_list **tlpp; trh = &transit_list_head; tlpp = &trh->trh_head; ASSERT(MUTEX_HELD(&trh->trh_lock)); while (*tlpp != NULL && *tlpp != tlp) tlpp = &(*tlpp)->trl_next; ASSERT(*tlpp != NULL); if (*tlpp == tlp) *tlpp = tlp->trl_next; tlp->trl_next = NULL; } static struct transit_list * pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) { struct transit_list *tlp; for (tlp = 
trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { struct memdelspan *mdsp; for (mdsp = tlp->trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { if (pfnum >= mdsp->mds_base && pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { return (tlp); } } } return (NULL); } int pfn_is_being_deleted(pfn_t pfnum) { struct transit_list_head *trh; struct transit_list *tlp; int ret; trh = &transit_list_head; if (trh->trh_head == NULL) return (0); mutex_enter(&trh->trh_lock); tlp = pfnum_to_transit_list(trh, pfnum); ret = (tlp != NULL && tlp->trl_collect); mutex_exit(&trh->trh_lock); return (ret); } #ifdef MEM_DEL_STATS extern int hz; static void mem_del_stat_print_func(struct mem_handle *mhp) { uint64_t tmp; if (mem_del_stat_print) { printf("memory delete loop %x/%x, statistics%s\n", (uint_t)mhp->mh_transit.trl_spans->mds_base, (uint_t)mhp->mh_transit.trl_spans->mds_npgs, (mhp->mh_cancel ? " (cancelled)" : "")); printf("\t%8u nloop\n", mhp->mh_delstat.nloop); printf("\t%8u need_free\n", mhp->mh_delstat.need_free); printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); printf("\t%8u free_low\n", mhp->mh_delstat.free_low); printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); printf("\t%8u nfree\n", mhp->mh_delstat.nfree); printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); printf("\t%8u already_done\n", mhp->mh_delstat.already_done); printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); printf("\t%8u nnoreclaim\n", 
mhp->mh_delstat.nnoreclaim); printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); printf("\t%8u retired\n", mhp->mh_delstat.retired); printf("\t%8u toxic\n", mhp->mh_delstat.toxic); printf("\t%8u failing\n", mhp->mh_delstat.failing); printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ printf( "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ printf( "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); } } #endif /* MEM_DEL_STATS */ struct mem_callback { kphysm_setup_vector_t *vec; void *arg; }; #define NMEMCALLBACKS 100 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; static uint_t nmemcallbacks; static krwlock_t mem_callback_rwlock; int kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) { uint_t i, found; /* * This test will become more complicated when the version must * change. */ if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) return (EINVAL); if (vec->post_add == NULL || vec->pre_del == NULL || vec->post_del == NULL) return (EINVAL); rw_enter(&mem_callback_rwlock, RW_WRITER); for (i = 0, found = 0; i < nmemcallbacks; i++) { if (mem_callbacks[i].vec == NULL && found == 0) found = i + 1; if (mem_callbacks[i].vec == vec && mem_callbacks[i].arg == arg) { #ifdef DEBUG /* Catch this in DEBUG kernels. 
*/ cmn_err(CE_WARN, "kphysm_setup_func_register" "(0x%p, 0x%p) duplicate registration from 0x%p", (void *)vec, arg, (void *)caller()); #endif /* DEBUG */ rw_exit(&mem_callback_rwlock); return (EEXIST); } } if (found != 0) { i = found - 1; } else { ASSERT(nmemcallbacks < NMEMCALLBACKS); if (nmemcallbacks == NMEMCALLBACKS) { rw_exit(&mem_callback_rwlock); return (ENOMEM); } i = nmemcallbacks++; } mem_callbacks[i].vec = vec; mem_callbacks[i].arg = arg; rw_exit(&mem_callback_rwlock); return (0); } void kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) { uint_t i; rw_enter(&mem_callback_rwlock, RW_WRITER); for (i = 0; i < nmemcallbacks; i++) { if (mem_callbacks[i].vec == vec && mem_callbacks[i].arg == arg) { mem_callbacks[i].vec = NULL; mem_callbacks[i].arg = NULL; if (i == (nmemcallbacks - 1)) nmemcallbacks--; break; } } rw_exit(&mem_callback_rwlock); } static void kphysm_setup_post_add(pgcnt_t delta_pages) { uint_t i; rw_enter(&mem_callback_rwlock, RW_READER); for (i = 0; i < nmemcallbacks; i++) { if (mem_callbacks[i].vec != NULL) { (*mem_callbacks[i].vec->post_add) (mem_callbacks[i].arg, delta_pages); } } rw_exit(&mem_callback_rwlock); } /* * Note the locking between pre_del and post_del: The reader lock is held * between the two calls to stop the set of functions from changing. 
*/ static int kphysm_setup_pre_del(pgcnt_t delta_pages) { uint_t i; int ret; int aret; ret = 0; rw_enter(&mem_callback_rwlock, RW_READER); for (i = 0; i < nmemcallbacks; i++) { if (mem_callbacks[i].vec != NULL) { aret = (*mem_callbacks[i].vec->pre_del) (mem_callbacks[i].arg, delta_pages); ret |= aret; } } return (ret); } static void kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) { uint_t i; for (i = 0; i < nmemcallbacks; i++) { if (mem_callbacks[i].vec != NULL) { (*mem_callbacks[i].vec->post_del) (mem_callbacks[i].arg, delta_pages, cancelled); } } rw_exit(&mem_callback_rwlock); } static int kphysm_split_memseg( pfn_t base, pgcnt_t npgs) { struct memseg *seg; struct memseg **segpp; pgcnt_t size_low, size_high; struct memseg *seg_low, *seg_mid, *seg_high; /* * Lock the memsegs list against other updates now */ memsegs_lock(1); /* * Find boot time memseg that wholly covers this area. */ /* First find the memseg with page 'base' in it. */ for (segpp = &memsegs; (seg = *segpp) != NULL; segpp = &((*segpp)->next)) { if (base >= seg->pages_base && base < seg->pages_end) break; } if (seg == NULL) { memsegs_unlock(1); return (0); } if (memseg_includes_meta(seg)) { memsegs_unlock(1); return (0); } if ((base + npgs) > seg->pages_end) { memsegs_unlock(1); return (0); } /* * Work out the size of the two segments that will * surround the new segment, one for low address * and one for high. */ ASSERT(base >= seg->pages_base); size_low = base - seg->pages_base; ASSERT(seg->pages_end >= (base + npgs)); size_high = seg->pages_end - (base + npgs); /* * Sanity check. */ if ((size_low + size_high) == 0) { memsegs_unlock(1); return (0); } /* * Allocate the new structures. The old memseg will not be freed * as there may be a reference to it. */ seg_low = NULL; seg_high = NULL; if (size_low != 0) seg_low = memseg_alloc(); seg_mid = memseg_alloc(); if (size_high != 0) seg_high = memseg_alloc(); /* * All allocation done now. 
*/ if (size_low != 0) { seg_low->pages = seg->pages; seg_low->epages = seg_low->pages + size_low; seg_low->pages_base = seg->pages_base; seg_low->pages_end = seg_low->pages_base + size_low; seg_low->next = seg_mid; seg_low->msegflags = seg->msegflags; } if (size_high != 0) { seg_high->pages = seg->epages - size_high; seg_high->epages = seg_high->pages + size_high; seg_high->pages_base = seg->pages_end - size_high; seg_high->pages_end = seg_high->pages_base + size_high; seg_high->next = seg->next; seg_high->msegflags = seg->msegflags; } seg_mid->pages = seg->pages + size_low; seg_mid->pages_base = seg->pages_base + size_low; seg_mid->epages = seg->epages - size_high; seg_mid->pages_end = seg->pages_end - size_high; seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; seg_mid->msegflags = seg->msegflags; /* * Update hat_kpm specific info of all involved memsegs and * allow hat_kpm specific global chain updates. */ hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); /* * At this point we have two equivalent memseg sub-chains, * seg and seg_low/seg_mid/seg_high, which both chain on to * the same place in the global chain. By re-writing the pointer * in the previous element we switch atomically from using the old * (seg) to the new. */ *segpp = (seg_low != NULL) ? seg_low : seg_mid; membar_enter(); build_pfn_hash(); memsegs_unlock(1); /* * We leave the old segment, 'seg', intact as there may be * references to it. Also, as the value of total_pages has not * changed and the memsegs list is effectively the same when * accessed via the old or the new pointer, we do not have to * cause pageout_scanner() to re-evaluate its hand pointers. * * We currently do not re-use or reclaim the page_t memory. * If we do, then this may have to change. */ mutex_enter(&memseg_lists_lock); seg->lnext = memseg_edit_junk; memseg_edit_junk = seg; mutex_exit(&memseg_lists_lock); return (1); } /* * The sfmmu hat layer (e.g.) 
accesses some parts of the memseg * structure using physical addresses. Therefore a kmem_cache is * used with KMC_NOHASH to avoid page crossings within a memseg * structure. KMC_NOHASH requires that no external (outside of * slab) information is allowed. This, in turn, implies that the * cache's slabsize must be exactly a single page, since per-slab * information (e.g. the freelist for the slab) is kept at the * end of the slab, where it is easy to locate. Should be changed * when a more obvious kmem_cache interface/flag will become * available. */ void mem_config_init() { memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); } struct memseg * memseg_alloc() { struct memseg *seg; seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); bzero(seg, sizeof (struct memseg)); return (seg); } /* * Return whether the page_t memory for this memseg * is included in the memseg itself. */ static int memseg_includes_meta(struct memseg *seg) { return (seg->msegflags & MEMSEG_META_INCL); } pfn_t memseg_get_start(struct memseg *seg) { pfn_t pt_start; if (memseg_includes_meta(seg)) { pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); /* Meta data is required to be at the beginning */ ASSERT(pt_start < seg->pages_base); } else pt_start = seg->pages_base; return (pt_start); } /* * Invalidate memseg pointers in cpu private vm data caches. */ static void memseg_cpu_vm_flush() { cpu_t *cp; vm_cpu_data_t *vc; mutex_enter(&cpu_lock); pause_cpus(NULL, NULL); cp = cpu_list; do { vc = cp->cpu_vm_data; vc->vc_pnum_memseg = NULL; vc->vc_pnext_memseg = NULL; } while ((cp = cp->cpu_next) != cpu_list); start_cpus(); mutex_exit(&cpu_lock); }