/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2018 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ /* * VM - address spaces. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ static struct kmem_cache *as_cache; static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); static void as_clearwatchprot(struct as *, caddr_t, size_t); /* * Verifying the segment lists is very time-consuming; it may not be * desirable always to define VERIFY_SEGLIST when DEBUG is set. */ #ifdef DEBUG #define VERIFY_SEGLIST int do_as_verify = 0; #endif /* * Allocate a new callback data structure entry and fill in the events of * interest, the address range of interest, and the callback argument. * Link the entry on the as->a_callbacks list. A callback entry for the * entire address space may be specified with vaddr = 0 and size = -1. * * CALLERS RESPONSIBILITY: If not calling from within the process context for * the specified as, the caller must guarantee persistence of the specified as * for the duration of this function (eg. pages being locked within the as * will guarantee persistence). */ int as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, caddr_t vaddr, size_t size, int sleepflag) { struct as_callback *current_head, *cb; caddr_t saddr; size_t rsize; /* callback function and an event are mandatory */ if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) return (EINVAL); /* Adding a callback after as_free has been called is not allowed */ if (as == &kas) return (ENOMEM); /* * vaddr = 0 and size = -1 is used to indicate that the callback range * is the entire address space so no rounding is done in that case. 
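 * Otherwise the range is rounded out to page boundaries below, so the
 * registered callback covers every page touched by [vaddr, vaddr + size).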
*/ if (size != -1) { saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)saddr; /* check for wraparound */ if (saddr + rsize < saddr) return (ENOMEM); } else { if (vaddr != 0) return (EINVAL); saddr = vaddr; rsize = size; } /* Allocate and initialize a callback entry */ cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); if (cb == NULL) return (EAGAIN); cb->ascb_func = cb_func; cb->ascb_arg = arg; cb->ascb_events = events; cb->ascb_saddr = saddr; cb->ascb_len = rsize; /* Add the entry to the list */ mutex_enter(&as->a_contents); current_head = as->a_callbacks; as->a_callbacks = cb; cb->ascb_next = current_head; /* * The call to this function may lose in a race with * a pertinent event - eg. a thread does long term memory locking * but before the callback is added another thread executes as_unmap. * A broadcast here resolves that. */ if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { AS_CLRUNMAPWAIT(as); cv_broadcast(&as->a_cv); } mutex_exit(&as->a_contents); return (0); } /* * Search the callback list for an entry which pertains to arg. * * This is called from within the client upon completion of the callback. * RETURN VALUES: * AS_CALLBACK_DELETED (callback entry found and deleted) * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this * entry will be made in as_do_callbacks) * * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED * set, it indicates that as_do_callbacks is processing this entry. The * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made * to unblock as_do_callbacks, in case it is blocked. * * CALLERS RESPONSIBILITY: If not calling from within the process context for * the specified as, the caller must guarantee persistence of the specified as * for the duration of this function (eg. pages being locked within the as * will guarantee persistence). */ uint_t as_delete_callback(struct as *as, void *arg) { struct as_callback **prevcb = &as->a_callbacks; struct as_callback *cb; uint_t rc = AS_CALLBACK_NOTFOUND; mutex_enter(&as->a_contents); for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { if (cb->ascb_arg != arg) continue; /* * If the events indicate AS_CALLBACK_CALLED, just clear * AS_ALL_EVENT in the events field and wakeup the thread * that may be waiting in as_do_callbacks. as_do_callbacks * will take care of removing this entry from the list. In * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise * (AS_CALLBACK_CALLED not set), just remove it from the * list, return the memory and return AS_CALLBACK_DELETED. */ if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { /* leave AS_CALLBACK_CALLED */ cb->ascb_events &= ~AS_ALL_EVENT; rc = AS_CALLBACK_DELETE_DEFERRED; cv_broadcast(&as->a_cv); } else { *prevcb = cb->ascb_next; kmem_free(cb, sizeof (struct as_callback)); rc = AS_CALLBACK_DELETED; } break; } mutex_exit(&as->a_contents); return (rc); } /* * Searches the as callback list for a matching entry. * Returns a pointer to the first matching callback, or NULL if * nothing is found. * This function never sleeps so it is ok to call it with more * locks held but the (required) a_contents mutex. * * See also comment on as_do_callbacks below. 
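 */

/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: how a
 * driver that holds long-term page locks might pair as_add_callback() with
 * as_delete_callback().  The xxdrv_* names, including xxdrv_unlock_pages(),
 * are hypothetical placeholders rather than interfaces defined elsewhere.
 */
#if 0
typedef struct xxdrv_state {
        struct as       *xx_as;         /* address space holding the pages */
        caddr_t         xx_addr;        /* base of the long-term locked range */
        size_t          xx_len;         /* length of that range */
} xxdrv_state_t;

extern void xxdrv_unlock_pages(xxdrv_state_t *);

/*
 * Called from as_do_callbacks() with the arguments used by
 * as_execute_callback(): the address space, the registered arg and the
 * event(s) that fired.  It must release the driver's page locks and then
 * delete the callback so the blocked as_unmap()/as_setprot() caller can
 * make progress.
 */
static void
xxdrv_unmap_cb(struct as *as, void *arg, uint_t events)
{
        xxdrv_state_t *xx = arg;

        xxdrv_unlock_pages(xx);
        (void) as_delete_callback(as, xx);      /* unblocks as_execute_callback */
}

static int
xxdrv_watch_range(xxdrv_state_t *xx)
{
        /* Register for unmap and setprot events over the locked range. */
        return (as_add_callback(xx->xx_as, xxdrv_unmap_cb, xx,
            AS_UNMAP_EVENT | AS_SETPROT_EVENT, xx->xx_addr, xx->xx_len,
            KM_SLEEP));
}

static void
xxdrv_unwatch_range(xxdrv_state_t *xx)
{
        /*
         * AS_CALLBACK_DELETE_DEFERRED means as_do_callbacks() is running
         * this callback right now and will free the entry itself.
         */
        (void) as_delete_callback(xx->xx_as, xx);
}
#endif

/*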
*/ static struct as_callback * as_find_callback(struct as *as, uint_t events, caddr_t event_addr, size_t event_len) { struct as_callback *cb; ASSERT(MUTEX_HELD(&as->a_contents)); for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { /* * If the callback has not already been called, then * check if events or address range pertains. An event_len * of zero means do an unconditional callback. */ if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || ((event_len != 0) && (((cb->ascb_events & events) == 0) || (event_addr + event_len < cb->ascb_saddr) || (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { continue; } break; } return (cb); } /* * Executes a given callback and removes it from the callback list for * this address space. * This function may sleep so the caller must drop all locks except * a_contents before calling this func. * * See also comments on as_do_callbacks below. */ static void as_execute_callback(struct as *as, struct as_callback *cb, uint_t events) { struct as_callback **prevcb; void *cb_arg; ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); cb->ascb_events |= AS_CALLBACK_CALLED; mutex_exit(&as->a_contents); (*cb->ascb_func)(as, cb->ascb_arg, events); mutex_enter(&as->a_contents); /* * the callback function is required to delete the callback * when the callback function determines it is OK for * this thread to continue. as_delete_callback will clear * the AS_ALL_EVENT in the events field when it is deleted. * If the callback function called as_delete_callback, * events will already be cleared and there will be no blocking. */ while ((cb->ascb_events & events) != 0) { cv_wait(&as->a_cv, &as->a_contents); } /* * This entry needs to be taken off the list. Normally, the * callback func itself does that, but unfortunately the list * may have changed while the callback was running because the * a_contents mutex was dropped and someone else other than the * callback func itself could have called as_delete_callback, * so we have to search to find this entry again. The entry * must have AS_CALLBACK_CALLED, and have the same 'arg'. */ cb_arg = cb->ascb_arg; prevcb = &as->a_callbacks; for (cb = as->a_callbacks; cb != NULL; prevcb = &cb->ascb_next, cb = *prevcb) { if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || (cb_arg != cb->ascb_arg)) { continue; } *prevcb = cb->ascb_next; kmem_free(cb, sizeof (struct as_callback)); break; } } /* * Check the callback list for a matching event and intersection of * address range. If there is a match invoke the callback. Skip an entry if: * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) * - not event of interest * - not address range of interest * * An event_len of zero indicates a request for an unconditional callback * (regardless of event), only the AS_CALLBACK_CALLED is checked. The * a_contents lock must be dropped before a callback, so only one callback * can be done before returning. Return -1 (true) if a callback was * executed and removed from the list, else return 0 (false). * * The logically separate parts, i.e. finding a matching callback and * executing a given callback have been separated into two functions * so that they can be called with different sets of locks held beyond * the always-required a_contents. as_find_callback does not sleep so * it is ok to call it if more locks than a_contents (i.e. the a_lock * rwlock) are held. as_execute_callback on the other hand may sleep * so all locks beyond a_contents must be dropped by the caller if one * does not want to end comatose. 
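 */

/*
 * The matching rules above, restated as a stand-alone predicate for
 * clarity.  This is an illustrative sketch only (as_callback_matches() is
 * not a function in this file) and is kept under #if 0.
 */
#if 0
static boolean_t
as_callback_matches(const struct as_callback *cb, uint_t events,
    caddr_t event_addr, size_t event_len)
{
        if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0)
                return (B_FALSE);               /* already being processed */
        if (event_len == 0)
                return (B_TRUE);                /* unconditional callback */
        if ((cb->ascb_events & events) == 0)
                return (B_FALSE);               /* no event of interest */
        if (event_addr + event_len < cb->ascb_saddr ||
            event_addr > cb->ascb_saddr + cb->ascb_len)
                return (B_FALSE);               /* no address overlap */
        return (B_TRUE);
}
#endif

/*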
*/ static int as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, size_t event_len) { struct as_callback *cb; if ((cb = as_find_callback(as, events, event_addr, event_len))) { as_execute_callback(as, cb, events); return (-1); } return (0); } /* * Search for the segment containing addr. If a segment containing addr * exists, that segment is returned. If no such segment exists, and * the list spans addresses greater than addr, then the first segment * whose base is greater than addr is returned; otherwise, NULL is * returned unless tail is true, in which case the last element of the * list is returned. * * a_seglast is used to cache the last found segment for repeated * searches to the same addr (which happens frequently). */ struct seg * as_findseg(struct as *as, caddr_t addr, int tail) { struct seg *seg = as->a_seglast; avl_index_t where; ASSERT(AS_LOCK_HELD(as)); if (seg != NULL && seg->s_base <= addr && addr < seg->s_base + seg->s_size) return (seg); seg = avl_find(&as->a_segtree, &addr, &where); if (seg != NULL) return (as->a_seglast = seg); seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); if (seg == NULL && tail) seg = avl_last(&as->a_segtree); return (as->a_seglast = seg); } #ifdef VERIFY_SEGLIST /* * verify that the linked list is coherent */ static void as_verify(struct as *as) { struct seg *seg, *seglast, *p, *n; uint_t nsegs = 0; if (do_as_verify == 0) return; seglast = as->a_seglast; for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { ASSERT(seg->s_as == as); p = AS_SEGPREV(as, seg); n = AS_SEGNEXT(as, seg); ASSERT(p == NULL || p->s_as == as); ASSERT(p == NULL || p->s_base < seg->s_base); ASSERT(n == NULL || n->s_base > seg->s_base); ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); if (seg == seglast) seglast = NULL; nsegs++; } ASSERT(seglast == NULL); ASSERT(avl_numnodes(&as->a_segtree) == nsegs); } #endif /* VERIFY_SEGLIST */ /* * Add a new segment to the address space. The avl_find() * may be expensive so we attempt to use last segment accessed * in as_gap() as an insertion point. */ int as_addseg(struct as *as, struct seg *newseg) { struct seg *seg; caddr_t addr; caddr_t eaddr; avl_index_t where; ASSERT(AS_WRITE_HELD(as)); as->a_updatedir = 1; /* inform /proc */ gethrestime(&as->a_updatetime); if (as->a_lastgaphl != NULL) { struct seg *hseg = NULL; struct seg *lseg = NULL; if (as->a_lastgaphl->s_base > newseg->s_base) { hseg = as->a_lastgaphl; lseg = AVL_PREV(&as->a_segtree, hseg); } else { lseg = as->a_lastgaphl; hseg = AVL_NEXT(&as->a_segtree, lseg); } if (hseg && lseg && lseg->s_base < newseg->s_base && hseg->s_base > newseg->s_base) { avl_insert_here(&as->a_segtree, newseg, lseg, AVL_AFTER); as->a_lastgaphl = NULL; as->a_seglast = newseg; return (0); } as->a_lastgaphl = NULL; } addr = newseg->s_base; eaddr = addr + newseg->s_size; seg = avl_find(&as->a_segtree, &addr, &where); if (seg == NULL) seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); if (seg == NULL) seg = avl_last(&as->a_segtree); if (seg != NULL) { caddr_t base = seg->s_base; /* * If top of seg is below the requested address, then * the insertion point is at the end of the linked list, * and seg points to the tail of the list. Otherwise, * the insertion point is immediately before seg. 
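 * Either way, a new segment that would overlap an existing one is
 * rejected below with -1.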
*/ if (base + seg->s_size > addr) { if (addr >= base || eaddr > base) { return (-1); /* overlapping segment */ } } } as->a_seglast = newseg; avl_insert(&as->a_segtree, newseg, where); #ifdef VERIFY_SEGLIST as_verify(as); #endif return (0); } struct seg * as_removeseg(struct as *as, struct seg *seg) { avl_tree_t *t; ASSERT(AS_WRITE_HELD(as)); as->a_updatedir = 1; /* inform /proc */ gethrestime(&as->a_updatetime); if (seg == NULL) return (NULL); t = &as->a_segtree; if (as->a_seglast == seg) as->a_seglast = NULL; as->a_lastgaphl = NULL; /* * if this segment is at an address higher than * a_lastgap, set a_lastgap to the next segment (NULL if last segment) */ if (as->a_lastgap && (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) as->a_lastgap = AVL_NEXT(t, seg); /* * remove the segment from the seg tree */ avl_remove(t, seg); #ifdef VERIFY_SEGLIST as_verify(as); #endif return (seg); } /* * Find a segment containing addr. */ struct seg * as_segat(struct as *as, caddr_t addr) { struct seg *seg = as->a_seglast; ASSERT(AS_LOCK_HELD(as)); if (seg != NULL && seg->s_base <= addr && addr < seg->s_base + seg->s_size) return (seg); seg = avl_find(&as->a_segtree, &addr, NULL); return (seg); } /* * Serialize all searches for holes in an address space to * prevent two or more threads from allocating the same virtual * address range. The address space must not be "read/write" * locked by the caller since we may block. */ void as_rangelock(struct as *as) { mutex_enter(&as->a_contents); while (AS_ISCLAIMGAP(as)) cv_wait(&as->a_cv, &as->a_contents); AS_SETCLAIMGAP(as); mutex_exit(&as->a_contents); } /* * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. */ void as_rangeunlock(struct as *as) { mutex_enter(&as->a_contents); AS_CLRCLAIMGAP(as); cv_signal(&as->a_cv); mutex_exit(&as->a_contents); } /* * compar segments (or just an address) by segment address range */ static int as_segcompar(const void *x, const void *y) { struct seg *a = (struct seg *)x; struct seg *b = (struct seg *)y; if (a->s_base < b->s_base) return (-1); if (a->s_base >= b->s_base + b->s_size) return (1); return (0); } void as_avlinit(struct as *as) { avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), offsetof(struct seg, s_tree)); avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), offsetof(struct watched_page, wp_link)); } /*ARGSUSED*/ static int as_constructor(void *buf, void *cdrarg, int kmflags) { struct as *as = buf; mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); as_avlinit(as); return (0); } /*ARGSUSED1*/ static void as_destructor(void *buf, void *cdrarg) { struct as *as = buf; avl_destroy(&as->a_segtree); mutex_destroy(&as->a_contents); cv_destroy(&as->a_cv); rw_destroy(&as->a_lock); } void as_init(void) { as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, as_constructor, as_destructor, NULL, NULL, NULL, 0); } /* * Allocate and initialize an address space data structure. * We call hat_alloc to allow any machine dependent * information in the hat structure to be initialized. 
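 */

/*
 * Minimal lifecycle sketch, kept under #if 0.  The xx_* functions and the
 * way the struct proc is wired up here are hypothetical placeholders;
 * as_dup() below is the in-file example of the same as_alloc()/as_free()
 * pairing.
 */
#if 0
static void
xx_attach_as(struct proc *p)
{
        struct as *as = as_alloc();     /* also creates the default hat */

        as->a_proc = p;         /* lets as_map() charge p's resource controls */
        p->p_as = as;
}

static void
xx_detach_as(struct proc *p)
{
        struct as *as = p->p_as;

        p->p_as = NULL;         /* hypothetical bookkeeping */
        as_free(as);            /* unmaps every segment, frees the hat and as */
}
#endif

/*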
*/ struct as * as_alloc(void) { struct as *as; as = kmem_cache_alloc(as_cache, KM_SLEEP); as->a_flags = 0; as->a_vbits = 0; as->a_hrm = NULL; as->a_seglast = NULL; as->a_size = 0; as->a_resvsize = 0; as->a_updatedir = 0; gethrestime(&as->a_updatetime); as->a_objectdir = NULL; as->a_sizedir = 0; as->a_userlimit = (caddr_t)USERLIMIT; as->a_lastgap = NULL; as->a_lastgaphl = NULL; as->a_callbacks = NULL; as->a_proc = NULL; AS_LOCK_ENTER(as, RW_WRITER); as->a_hat = hat_alloc(as); /* create hat for default system mmu */ AS_LOCK_EXIT(as); return (as); } /* * Free an address space data structure. * Need to free the hat first and then * all the segments on this as and finally * the space for the as struct itself. */ void as_free(struct as *as) { struct hat *hat = as->a_hat; struct seg *seg, *next; boolean_t free_started = B_FALSE; top: /* * Invoke ALL callbacks. as_do_callbacks will do one callback * per call, and not return (-1) until the callback has completed. * When as_do_callbacks returns zero, all callbacks have completed. */ mutex_enter(&as->a_contents); while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)) ; mutex_exit(&as->a_contents); AS_LOCK_ENTER(as, RW_WRITER); if (!free_started) { free_started = B_TRUE; hat_free_start(hat); } for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { int err; next = AS_SEGNEXT(as, seg); retry: err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); if (err == EAGAIN) { mutex_enter(&as->a_contents); if (as->a_callbacks) { AS_LOCK_EXIT(as); } else if (!AS_ISNOUNMAPWAIT(as)) { /* * Memory is currently locked. Wait for a * cv_signal that it has been unlocked, then * try the operation again. */ if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); } else { /* * We may have raced with * segvn_reclaim()/segspt_reclaim(). In this * case clean nounmapwait flag and retry since * softlockcnt in this segment may be already * 0. We don't drop as writer lock so our * number of retries without sleeping should * be very small. See segvn_reclaim() for * more comments. */ AS_CLRNOUNMAPWAIT(as); mutex_exit(&as->a_contents); goto retry; } mutex_exit(&as->a_contents); goto top; } else { /* * We do not expect any other error return at this * time. This is similar to an ASSERT in seg_unmap() */ ASSERT(err == 0); } } hat_free_end(hat); AS_LOCK_EXIT(as); /* /proc stuff */ ASSERT(avl_numnodes(&as->a_wpage) == 0); if (as->a_objectdir) { kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); as->a_objectdir = NULL; as->a_sizedir = 0; } /* * Free the struct as back to kmem. Assert it has no segments. */ ASSERT(avl_numnodes(&as->a_segtree) == 0); kmem_cache_free(as_cache, as); } int as_dup(struct as *as, struct proc *forkedproc) { struct as *newas; struct seg *seg, *newseg; size_t purgesize = 0; int error; AS_LOCK_ENTER(as, RW_WRITER); as_clearwatch(as); newas = as_alloc(); newas->a_userlimit = as->a_userlimit; newas->a_proc = forkedproc; AS_LOCK_ENTER(newas, RW_WRITER); (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { if (seg->s_flags & S_PURGE) { purgesize += seg->s_size; continue; } newseg = seg_alloc(newas, seg->s_base, seg->s_size); if (newseg == NULL) { AS_LOCK_EXIT(newas); as_setwatch(as); AS_LOCK_EXIT(as); as_free(newas); return (-1); } if ((error = SEGOP_DUP(seg, newseg)) != 0) { /* * We call seg_free() on the new seg * because the segment is not set up * completely; i.e. 
it has no ops. */ as_setwatch(as); AS_LOCK_EXIT(as); seg_free(newseg); AS_LOCK_EXIT(newas); as_free(newas); return (error); } if ((newseg->s_flags & S_HOLE) == 0) { newas->a_size += seg->s_size; } } newas->a_resvsize = as->a_resvsize - purgesize; error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); AS_LOCK_EXIT(newas); as_setwatch(as); AS_LOCK_EXIT(as); if (error != 0) { as_free(newas); return (error); } forkedproc->p_as = newas; return (0); } /* * Handle a ``fault'' at addr for size bytes. */ faultcode_t as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, enum fault_type type, enum seg_rw rw) { struct seg *seg; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ size_t ssize; faultcode_t res = 0; caddr_t addrsav; struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); retry: /* * Indicate that the lwp is not to be stopped while waiting for a * pagefault. This is to avoid deadlock while debugging a process * via /proc over NFS (in particular). */ if (lwp != NULL) lwp->lwp_nostop++; /* * same length must be used when we softlock and softunlock. We * don't support softunlocking lengths less than the original length * when there is largepage support. See seg_dev.c for more * comments. */ switch (type) { case F_SOFTLOCK: CPU_STATS_ADD_K(vm, softlock, 1); break; case F_SOFTUNLOCK: break; case F_PROT: CPU_STATS_ADD_K(vm, prot_fault, 1); break; case F_INVAL: CPU_STATS_ENTER_K(); CPU_STATS_ADDQ(CPU, vm, as_fault, 1); if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); break; } raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; /* * XXX -- Don't grab the as lock for segkmap. We should grab it for * correctness, but then we could be stuck holding this lock for * a LONG time if the fault needs to be resolved on a slow * filesystem, and then no-one will be able to exec new commands, * as exec'ing requires the write lock on the as. */ if (as == &kas && segkmap && segkmap->s_base <= raddr && raddr + size < segkmap->s_base + segkmap->s_size) { seg = segkmap; as_lock_held = 0; } else { AS_LOCK_ENTER(as, RW_READER); seg = as_segat(as, raddr); if (seg == NULL) { AS_LOCK_EXIT(as); if (lwp != NULL) lwp->lwp_nostop--; return (FC_NOMAP); } as_lock_held = 1; } addrsav = raddr; segsav = seg; for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { res = FC_NOMAP; break; } } if (raddr + rsize > seg->s_base + seg->s_size) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw); if (res != 0) break; } /* * If we were SOFTLOCKing and encountered a failure, * we must SOFTUNLOCK the range we already did. (Maybe we * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing * right here...) */ if (res != 0 && type == F_SOFTLOCK) { for (seg = segsav; addrsav < raddr; addrsav += ssize) { if (addrsav >= seg->s_base + seg->s_size) seg = AS_SEGNEXT(as, seg); ASSERT(seg != NULL); /* * Now call the fault routine again to perform the * unlock using S_OTHER instead of the rw variable * since we never got a chance to touch the pages. 
*/ if (raddr > seg->s_base + seg->s_size) ssize = seg->s_base + seg->s_size - addrsav; else ssize = raddr - addrsav; (void) SEGOP_FAULT(hat, seg, addrsav, ssize, F_SOFTUNLOCK, S_OTHER); } } if (as_lock_held) AS_LOCK_EXIT(as); if (lwp != NULL) lwp->lwp_nostop--; /* * If the lower levels returned EDEADLK for a fault, * It means that we should retry the fault. Let's wait * a bit also to let the deadlock causing condition clear. * This is part of a gross hack to work around a design flaw * in the ufs/sds logging code and should go away when the * logging code is re-designed to fix the problem. See bug * 4125102 for details of the problem. */ if (FC_ERRNO(res) == EDEADLK) { delay(deadlk_wait); res = 0; goto retry; } return (res); } /* * Asynchronous ``fault'' at addr for size bytes. */ faultcode_t as_faulta(struct as *as, caddr_t addr, size_t size) { struct seg *seg; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ faultcode_t res = 0; klwp_t *lwp = ttolwp(curthread); retry: /* * Indicate that the lwp is not to be stopped while waiting * for a pagefault. This is to avoid deadlock while debugging * a process via /proc over NFS (in particular). */ if (lwp != NULL) lwp->lwp_nostop++; raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; AS_LOCK_ENTER(as, RW_READER); seg = as_segat(as, raddr); if (seg == NULL) { AS_LOCK_EXIT(as); if (lwp != NULL) lwp->lwp_nostop--; return (FC_NOMAP); } for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { res = FC_NOMAP; break; } } res = SEGOP_FAULTA(seg, raddr); if (res != 0) break; } AS_LOCK_EXIT(as); if (lwp != NULL) lwp->lwp_nostop--; /* * If the lower levels returned EDEADLK for a fault, * It means that we should retry the fault. Let's wait * a bit also to let the deadlock causing condition clear. * This is part of a gross hack to work around a design flaw * in the ufs/sds logging code and should go away when the * logging code is re-designed to fix the problem. See bug * 4125102 for details of the problem. */ if (FC_ERRNO(res) == EDEADLK) { delay(deadlk_wait); res = 0; goto retry; } return (res); } /* * Set the virtual mapping for the interval from [addr : addr + size) * in address space `as' to have the specified protection. * It is ok for the range to cross over several segments, * as long as they are contiguous. */ int as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot) { struct seg *seg; struct as_callback *cb; size_t ssize; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ int error = 0, writer = 0; caddr_t saveraddr; size_t saversize; setprot_top: raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; if (raddr + rsize < raddr) /* check for wraparound */ return (ENOMEM); saveraddr = raddr; saversize = rsize; /* * Normally we only lock the as as a reader. But * if due to setprot the segment driver needs to split * a segment it will return IE_RETRY. Therefore we re-acquire * the as lock as a writer so the segment driver can change * the seg list. Also the segment driver will return IE_RETRY * after it has changed the segment list so we therefore keep * locking as a writer. Since these opeartions should be rare * want to only lock as a writer when necessary. 
*/ if (writer || avl_numnodes(&as->a_wpage) != 0) { AS_LOCK_ENTER(as, RW_WRITER); } else { AS_LOCK_ENTER(as, RW_READER); } as_clearwatchprot(as, raddr, rsize); seg = as_segat(as, raddr); if (seg == NULL) { as_setwatch(as); AS_LOCK_EXIT(as); return (ENOMEM); } for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { error = ENOMEM; break; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; retry: error = SEGOP_SETPROT(seg, raddr, ssize, prot); if (error == IE_NOMEM) { error = EAGAIN; break; } if (error == IE_RETRY) { AS_LOCK_EXIT(as); writer = 1; goto setprot_top; } if (error == EAGAIN) { /* * Make sure we have a_lock as writer. */ if (writer == 0) { AS_LOCK_EXIT(as); writer = 1; goto setprot_top; } /* * Memory is currently locked. It must be unlocked * before this operation can succeed through a retry. * The possible reasons for locked memory and * corresponding strategies for unlocking are: * (1) Normal I/O * wait for a signal that the I/O operation * has completed and the memory is unlocked. * (2) Asynchronous I/O * The aio subsystem does not unlock pages when * the I/O is completed. Those pages are unlocked * when the application calls aiowait/aioerror. * So, to prevent blocking forever, cv_broadcast() * is done to wake up aio_cleanup_thread. * Subsequently, segvn_reclaim will be called, and * that will do AS_CLRUNMAPWAIT() and wake us up. * (3) Long term page locking: * Drivers intending to have pages locked for a * period considerably longer than for normal I/O * (essentially forever) may have registered for a * callback so they may unlock these pages on * request. This is needed to allow this operation * to succeed. Each entry on the callback list is * examined. If the event or address range pertains * the callback is invoked (unless it already is in * progress). The a_contents lock must be dropped * before the callback, so only one callback can * be done at a time. Go to the top and do more * until zero is returned. If zero is returned, * either there were no callbacks for this event * or they were already in progress. */ mutex_enter(&as->a_contents); if (as->a_callbacks && (cb = as_find_callback(as, AS_SETPROT_EVENT, seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as); as_execute_callback(as, cb, AS_SETPROT_EVENT); } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); } else { /* * We may have raced with * segvn_reclaim()/segspt_reclaim(). In this * case clean nounmapwait flag and retry since * softlockcnt in this segment may be already * 0. We don't drop as writer lock so our * number of retries without sleeping should * be very small. See segvn_reclaim() for * more comments. */ AS_CLRNOUNMAPWAIT(as); mutex_exit(&as->a_contents); goto retry; } mutex_exit(&as->a_contents); goto setprot_top; } else if (error != 0) break; } if (error != 0) { as_setwatch(as); } else { as_setwatchprot(as, saveraddr, saversize, prot); } AS_LOCK_EXIT(as); return (error); } /* * Check to make sure that the interval [addr, addr + size) * in address space `as' has at least the specified protection. * It is ok for the range to cross over several segments, as long * as they are contiguous. 
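 */

/*
 * The rounding idiom used by as_setprot() above, as_checkprot() below and
 * most of the other entry points, pulled out for clarity.  This is an
 * illustrative sketch kept under #if 0; as_round_range() is not a real
 * helper in this file.
 */
#if 0
static void
as_round_range(caddr_t addr, size_t size, caddr_t *raddrp, size_t *rsizep)
{
        caddr_t raddr;
        size_t rsize;

        /* Truncate the start down and round the end up to page boundaries. */
        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        /* With 4K pages: addr 0x1234, size 0x10 -> raddr 0x1000, rsize 0x1000 */
        *raddrp = raddr;
        *rsizep = rsize;
}
#endif

/*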
*/ int as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot) { struct seg *seg; size_t ssize; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ int error = 0; raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; if (raddr + rsize < raddr) /* check for wraparound */ return (ENOMEM); /* * This is ugly as sin... * Normally, we only acquire the address space readers lock. * However, if the address space has watchpoints present, * we must acquire the writer lock on the address space for * the benefit of as_clearwatchprot() and as_setwatchprot(). */ if (avl_numnodes(&as->a_wpage) != 0) AS_LOCK_ENTER(as, RW_WRITER); else AS_LOCK_ENTER(as, RW_READER); as_clearwatchprot(as, raddr, rsize); seg = as_segat(as, raddr); if (seg == NULL) { as_setwatch(as); AS_LOCK_EXIT(as); return (ENOMEM); } for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { error = ENOMEM; break; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; error = SEGOP_CHECKPROT(seg, raddr, ssize, prot); if (error != 0) break; } as_setwatch(as); AS_LOCK_EXIT(as); return (error); } int as_unmap(struct as *as, caddr_t addr, size_t size) { struct seg *seg, *seg_next; struct as_callback *cb; caddr_t raddr, eaddr; size_t ssize, rsize = 0; int err; top: raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) & (uintptr_t)PAGEMASK); AS_LOCK_ENTER(as, RW_WRITER); as->a_updatedir = 1; /* inform /proc */ gethrestime(&as->a_updatetime); /* * Use as_findseg to find the first segment in the range, then * step through the segments in order, following s_next. */ as_clearwatchprot(as, raddr, eaddr - raddr); for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) { const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0); if (eaddr <= seg->s_base) break; /* eaddr was in a gap; all done */ /* this is implied by the test above */ ASSERT(raddr < eaddr); if (raddr < seg->s_base) raddr = seg->s_base; /* raddr was in a gap */ if (eaddr > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = eaddr - raddr; /* * Save next segment pointer since seg can be * destroyed during the segment unmap operation. */ seg_next = AS_SEGNEXT(as, seg); /* * We didn't count /dev/null mappings, so ignore them here. * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again, * we have to do this check here while we have seg.) */ rsize = 0; if (!SEG_IS_DEVNULL_MAPPING(seg) && !SEG_IS_PARTIAL_RESV(seg)) rsize = ssize; retry: err = SEGOP_UNMAP(seg, raddr, ssize); if (err == EAGAIN) { /* * Memory is currently locked. It must be unlocked * before this operation can succeed through a retry. * The possible reasons for locked memory and * corresponding strategies for unlocking are: * (1) Normal I/O * wait for a signal that the I/O operation * has completed and the memory is unlocked. * (2) Asynchronous I/O * The aio subsystem does not unlock pages when * the I/O is completed. Those pages are unlocked * when the application calls aiowait/aioerror. * So, to prevent blocking forever, cv_broadcast() * is done to wake up aio_cleanup_thread. * Subsequently, segvn_reclaim will be called, and * that will do AS_CLRUNMAPWAIT() and wake us up. 
* (3) Long term page locking: * Drivers intending to have pages locked for a * period considerably longer than for normal I/O * (essentially forever) may have registered for a * callback so they may unlock these pages on * request. This is needed to allow this operation * to succeed. Each entry on the callback list is * examined. If the event or address range pertains * the callback is invoked (unless it already is in * progress). The a_contents lock must be dropped * before the callback, so only one callback can * be done at a time. Go to the top and do more * until zero is returned. If zero is returned, * either there were no callbacks for this event * or they were already in progress. */ mutex_enter(&as->a_contents); if (as->a_callbacks && (cb = as_find_callback(as, AS_UNMAP_EVENT, seg->s_base, seg->s_size))) { AS_LOCK_EXIT(as); as_execute_callback(as, cb, AS_UNMAP_EVENT); } else if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) cv_broadcast(&as->a_cv); AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as); while (AS_ISUNMAPWAIT(as)) cv_wait(&as->a_cv, &as->a_contents); } else { /* * We may have raced with * segvn_reclaim()/segspt_reclaim(). In this * case clean nounmapwait flag and retry since * softlockcnt in this segment may be already * 0. We don't drop as writer lock so our * number of retries without sleeping should * be very small. See segvn_reclaim() for * more comments. */ AS_CLRNOUNMAPWAIT(as); mutex_exit(&as->a_contents); goto retry; } mutex_exit(&as->a_contents); goto top; } else if (err == IE_RETRY) { AS_LOCK_EXIT(as); goto top; } else if (err) { as_setwatch(as); AS_LOCK_EXIT(as); return (-1); } if (!is_hole) { as->a_size -= ssize; if (rsize) as->a_resvsize -= rsize; } raddr += ssize; } AS_LOCK_EXIT(as); return (0); } static int as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec, segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated) { uint_t szc, nszc, save_szcvec; int error; caddr_t a, eaddr; size_t pgsz = 0; const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL); ASSERT(AS_WRITE_HELD(as)); ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); ASSERT(IS_P2ALIGNED(size, PAGESIZE)); ASSERT(vn_a->vp == NULL || vn_a->amp == NULL); if (!do_off) { vn_a->offset = 0; } if (szcvec <= 1) { struct seg *seg, *segref; seg = segref = seg_alloc(as, addr, size); if (seg == NULL) { return (ENOMEM); } vn_a->szc = 0; error = (*crfp)(&seg, vn_a); if (error != 0) { VERIFY3P(seg, ==, segref); seg_free(seg); } else { as->a_size += size; as->a_resvsize += size; } return (error); } eaddr = addr + size; save_szcvec = szcvec; szcvec >>= 1; szc = 0; nszc = 0; while (szcvec) { if ((szcvec & 0x1) == 0) { nszc++; szcvec >>= 1; continue; } nszc++; pgsz = page_get_pagesize(nszc); a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); if (a != addr) { struct seg *seg, *segref; size_t segsize; ASSERT(a < eaddr); segsize = a - addr; seg = segref = seg_alloc(as, addr, segsize); if (seg == NULL) { return (ENOMEM); } vn_a->szc = szc; error = (*crfp)(&seg, vn_a); if (error != 0) { VERIFY3P(seg, ==, segref); seg_free(seg); return (error); } as->a_size += segsize; as->a_resvsize += segsize; *segcreated = B_TRUE; if (do_off) { vn_a->offset += segsize; } addr = a; } szc = nszc; szcvec >>= 1; } ASSERT(addr < eaddr); szcvec = save_szcvec | 1; /* add 8K pages */ while (szcvec) { a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); ASSERT(a >= addr); if (a != addr) { struct seg *seg, *segref; size_t segsize; segsize = a - addr; seg = segref = seg_alloc(as, addr, segsize); if (seg == NULL) { return (ENOMEM); 
} vn_a->szc = szc; error = (*crfp)(&seg, vn_a); if (error != 0) { VERIFY3P(seg, ==, segref); seg_free(seg); return (error); } as->a_size += segsize; as->a_resvsize += segsize; *segcreated = B_TRUE; if (do_off) { vn_a->offset += segsize; } addr = a; } szcvec &= ~(1 << szc); if (szcvec) { szc = highbit(szcvec) - 1; pgsz = page_get_pagesize(szc); } } ASSERT(addr == eaddr); return (0); } static int as_map_vnsegs(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated) { uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA); int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags, type, 0); int error; struct vattr va; u_offset_t eoff; size_t save_size = 0; extern size_t textrepl_size_thresh; ASSERT(AS_WRITE_HELD(as)); ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); ASSERT(IS_P2ALIGNED(size, PAGESIZE)); ASSERT(vn_a->vp != NULL); ASSERT(vn_a->amp == NULL); again: if (szcvec <= 1) { struct seg *seg, *segref; seg = segref = seg_alloc(as, addr, size); if (seg == NULL) { return (ENOMEM); } vn_a->szc = 0; error = (*crfp)(&seg, vn_a); if (error != 0) { VERIFY3P(seg, ==, segref); seg_free(seg); } else { as->a_size += size; as->a_resvsize += size; } return (error); } va.va_mask = AT_SIZE; if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) { szcvec = 0; goto again; } eoff = vn_a->offset & PAGEMASK; if (eoff >= va.va_size) { szcvec = 0; goto again; } eoff += size; if (btopr(va.va_size) < btopr(eoff)) { save_size = size; size = va.va_size - (vn_a->offset & PAGEMASK); size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t); szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags, type, 0); if (szcvec <= 1) { size = save_size; goto again; } } if (size > textrepl_size_thresh) { vn_a->flags |= _MAP_TEXTREPL; } error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a, segcreated); if (error != 0) { return (error); } if (save_size) { addr += size; size = save_size - size; szcvec = 0; goto again; } return (0); } /* * as_map_ansegs: shared or private anonymous memory. Note that the flags * passed to map_pgszvec cannot be MAP_INITDATA, for anon. */ static int as_map_ansegs(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated) { uint_t szcvec; uchar_t type = 0; ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE); if (vn_a->type == MAP_SHARED) { type = MAPPGSZC_SHM; } else if (vn_a->type == MAP_PRIVATE) { if (vn_a->szc == AS_MAP_HEAP) { type = MAPPGSZC_HEAP; } else if (vn_a->szc == AS_MAP_STACK) { type = MAPPGSZC_STACK; } else { type = MAPPGSZC_PRIVM; } } szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ? 
(uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE), (vn_a->flags & MAP_TEXT), type, 0); ASSERT(AS_WRITE_HELD(as)); ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); ASSERT(IS_P2ALIGNED(size, PAGESIZE)); ASSERT(vn_a->vp == NULL); return (as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a, segcreated)); } int as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, void *argsp) { AS_LOCK_ENTER(as, RW_WRITER); return (as_map_locked(as, addr, size, crfp, argsp)); } int as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, void *argsp) { caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ int error; boolean_t is_hole = B_FALSE; /* * The use of a_proc is preferred to handle the case where curproc is * a door_call server and is allocating memory in the client's (a_proc) * address space. * When creating a shared memory segment a_proc will be NULL so we * fallback to curproc in that case. */ struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc; struct segvn_crargs crargs; raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; /* * check for wrap around */ if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) { AS_LOCK_EXIT(as); return (ENOMEM); } as->a_updatedir = 1; /* inform /proc */ gethrestime(&as->a_updatetime); if (as != &kas) { /* * Ensure that the virtual size of the process will not exceed * the configured limit. Since seg_hole segments will later * set the S_HOLE flag indicating their status as a hole in the * AS, they are excluded from this check. */ if (as->a_size + rsize > (size_t)p->p_vmem_ctl && !AS_MAP_CHECK_SEGHOLE(crfp)) { AS_LOCK_EXIT(as); (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p, RCA_UNSAFE_ALL); return (ENOMEM); } } if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { boolean_t do_unmap = B_FALSE; crargs = *(struct segvn_crargs *)argsp; error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &do_unmap); if (error != 0) { AS_LOCK_EXIT(as); if (do_unmap) { (void) as_unmap(as, addr, size); } return (error); } } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) { boolean_t do_unmap = B_FALSE; crargs = *(struct segvn_crargs *)argsp; error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &do_unmap); if (error != 0) { AS_LOCK_EXIT(as); if (do_unmap) { (void) as_unmap(as, addr, size); } return (error); } } else { struct seg *seg, *segref; seg = segref = seg_alloc(as, addr, size); if (seg == NULL) { AS_LOCK_EXIT(as); return (ENOMEM); } /* * It is possible that the segment creation routine will free * 'seg' as part of a more advanced operation, such as when * segvn concatenates adjacent segments together. When this * occurs, the seg*_create routine must communicate the * resulting segment out via the 'struct seg **' parameter. * * If segment creation fails, it must not free the passed-in * segment, nor alter the argument pointer. */ error = (*crfp)(&seg, argsp); if (error != 0) { VERIFY3P(seg, ==, segref); seg_free(seg); AS_LOCK_EXIT(as); return (error); } /* * Check if the resulting segment represents a hole in the * address space, rather than contributing to the AS size. */ is_hole = ((seg->s_flags & S_HOLE) != 0); /* Add size now so as_unmap will work if as_ctl fails. */ if (!is_hole) { as->a_size += rsize; as->a_resvsize += rsize; } } as_setwatch(as); /* * Establish memory locks for the segment if the address space is * locked, provided it's not an explicit hole in the AS. 
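 * (AS_PGLCK is set by as_ctl(MC_LOCKAS) when MCL_FUTURE is requested;
 * honoring it here is what makes mappings created later come in locked.)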
*/ mutex_enter(&as->a_contents); if (AS_ISPGLCK(as) && !is_hole) { mutex_exit(&as->a_contents); AS_LOCK_EXIT(as); error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); if (error != 0) (void) as_unmap(as, addr, size); } else { mutex_exit(&as->a_contents); AS_LOCK_EXIT(as); } return (error); } /* * Delete all segments in the address space marked with S_PURGE. * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c). * These segments are deleted as a first step before calls to as_gap(), so * that they don't affect mmap() or shmat(). */ void as_purge(struct as *as) { struct seg *seg; struct seg *next_seg; /* * the setting of NEEDSPURGE is protect by as_rangelock(), so * no need to grab a_contents mutex for this check */ if ((as->a_flags & AS_NEEDSPURGE) == 0) return; AS_LOCK_ENTER(as, RW_WRITER); next_seg = NULL; seg = AS_SEGFIRST(as); while (seg != NULL) { next_seg = AS_SEGNEXT(as, seg); if (seg->s_flags & S_PURGE) SEGOP_UNMAP(seg, seg->s_base, seg->s_size); seg = next_seg; } AS_LOCK_EXIT(as); mutex_enter(&as->a_contents); as->a_flags &= ~AS_NEEDSPURGE; mutex_exit(&as->a_contents); } /* * Find a hole within [*basep, *basep + *lenp), which contains a mappable * range of addresses at least "minlen" long, where the base of the range is * at "off" phase from an "align" boundary and there is space for a * "redzone"-sized redzone on eithe rside of the range. Thus, * if align was 4M and off was 16k, the user wants a hole which will start * 16k into a 4M page. * * If flags specifies AH_HI, the hole will have the highest possible address * in the range. We use the as->a_lastgap field to figure out where to * start looking for a gap. * * Otherwise, the gap will have the lowest possible address. * * If flags specifies AH_CONTAIN, the hole will contain the address addr. * * If an adequate hole is found, *basep and *lenp are set to reflect the part of * the hole that is within range, and 0 is returned. On failure, -1 is returned. * * NOTE: This routine is not correct when base+len overflows caddr_t. */ int as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off) { caddr_t lobound = *basep; caddr_t hibound = lobound + *lenp; struct seg *lseg, *hseg; caddr_t lo, hi; int forward; caddr_t save_base; size_t save_len; size_t save_minlen; size_t save_redzone; int fast_path = 1; save_base = *basep; save_len = *lenp; save_minlen = minlen; save_redzone = redzone; /* * For the first pass/fast_path, just add align and redzone into * minlen since if we get an allocation, we can guarantee that it * will fit the alignment and redzone requested. * This increases the chance that hibound will be adjusted to * a_lastgap->s_base which will likely allow us to find an * acceptable hole in the address space quicker. * If we can't find a hole with this fast_path, then we look for * smaller holes in which the alignment and offset may allow * the allocation to fit. */ minlen += align; minlen += 2 * redzone; redzone = 0; AS_LOCK_ENTER(as, RW_READER); if (AS_SEGFIRST(as) == NULL) { if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR, align, redzone, off)) { AS_LOCK_EXIT(as); return (0); } else { AS_LOCK_EXIT(as); *basep = save_base; *lenp = save_len; return (-1); } } retry: /* * Set up to iterate over all the inter-segment holes in the given * direction. lseg is NULL for the lowest-addressed hole and hseg is * NULL for the highest-addressed hole. 
If moving backwards, we reset * sseg to denote the highest-addressed segment. */ forward = (flags & AH_DIR) == AH_LO; if (forward) { hseg = as_findseg(as, lobound, 1); lseg = AS_SEGPREV(as, hseg); } else { /* * If allocating at least as much as the last allocation, * use a_lastgap's base as a better estimate of hibound. */ if (as->a_lastgap && minlen >= as->a_lastgap->s_size && hibound >= as->a_lastgap->s_base) hibound = as->a_lastgap->s_base; hseg = as_findseg(as, hibound, 1); if (hseg->s_base + hseg->s_size < hibound) { lseg = hseg; hseg = NULL; } else { lseg = AS_SEGPREV(as, hseg); } } for (;;) { /* * Set lo and hi to the hole's boundaries. (We should really * use MAXADDR in place of hibound in the expression below, * but can't express it easily; using hibound in its place is * harmless.) */ lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; hi = (hseg == NULL) ? hibound : hseg->s_base; /* * If the iteration has moved past the interval from lobound * to hibound it's pointless to continue. */ if ((forward && lo > hibound) || (!forward && hi < lobound)) break; else if (lo > hibound || hi < lobound) goto cont; /* * Candidate hole lies at least partially within the allowable * range. Restrict it to fall completely within that range, * i.e., to [max(lo, lobound), min(hi, hibound)]. */ if (lo < lobound) lo = lobound; if (hi > hibound) hi = hibound; /* * Verify that the candidate hole is big enough and meets * hardware constraints. If the hole is too small, no need * to do the further checks since they will fail. */ *basep = lo; *lenp = hi - lo; if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, minlen, forward ? AH_LO : AH_HI, align, redzone, off) && ((flags & AH_CONTAIN) == 0 || (*basep <= addr && *basep + *lenp > addr))) { if (!forward) as->a_lastgap = hseg; if (hseg != NULL) as->a_lastgaphl = hseg; else as->a_lastgaphl = lseg; AS_LOCK_EXIT(as); return (0); } cont: /* * Move to the next hole. */ if (forward) { lseg = hseg; if (lseg == NULL) break; hseg = AS_SEGNEXT(as, hseg); } else { hseg = lseg; if (hseg == NULL) break; lseg = AS_SEGPREV(as, lseg); } } if (fast_path && (align != 0 || save_redzone != 0)) { fast_path = 0; minlen = save_minlen; redzone = save_redzone; goto retry; } *basep = save_base; *lenp = save_len; AS_LOCK_EXIT(as); return (-1); } /* * Find a hole of at least size minlen within [*basep, *basep + *lenp). * * If flags specifies AH_HI, the hole will have the highest possible address * in the range. We use the as->a_lastgap field to figure out where to * start looking for a gap. * * Otherwise, the gap will have the lowest possible address. * * If flags specifies AH_CONTAIN, the hole will contain the address addr. * * If an adequate hole is found, base and len are set to reflect the part of * the hole that is within range, and 0 is returned, otherwise, * -1 is returned. * * NOTE: This routine is not correct when base+len overflows caddr_t. */ int as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, caddr_t addr) { return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); } /* * Return the next range within [base, base + len) that is backed * with "real memory". Skip holes and non-seg_vn segments. * We're lazy and only return one segment at a time. 
*/ int as_memory(struct as *as, caddr_t *basep, size_t *lenp) { extern struct seg_ops segspt_shmops; /* needs a header file */ struct seg *seg; caddr_t addr, eaddr; caddr_t segend; AS_LOCK_ENTER(as, RW_READER); addr = *basep; eaddr = addr + *lenp; seg = as_findseg(as, addr, 0); if (seg != NULL) addr = MAX(seg->s_base, addr); for (;;) { if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { AS_LOCK_EXIT(as); return (EINVAL); } if (seg->s_ops == &segvn_ops) { segend = seg->s_base + seg->s_size; break; } /* * We do ISM by looking into the private data * to determine the real size of the segment. */ if (seg->s_ops == &segspt_shmops) { segend = seg->s_base + spt_realsize(seg); if (addr < segend) break; } seg = AS_SEGNEXT(as, seg); if (seg != NULL) addr = seg->s_base; } *basep = addr; if (segend > eaddr) *lenp = eaddr - addr; else *lenp = segend - addr; AS_LOCK_EXIT(as); return (0); } /* * Swap the pages associated with the address space as out to * secondary storage, returning the number of bytes actually * swapped. * * The value returned is intended to correlate well with the process's * memory requirements. Its usefulness for this purpose depends on * how well the segment-level routines do at returning accurate * information. */ size_t as_swapout(struct as *as) { struct seg *seg; size_t swpcnt = 0; /* * Kernel-only processes have given up their address * spaces. Of course, we shouldn't be attempting to * swap out such processes in the first place... */ if (as == NULL) return (0); AS_LOCK_ENTER(as, RW_READER); /* * Free all mapping resources associated with the address * space. The segment-level swapout routines capitalize * on this unmapping by scavanging pages that have become * unmapped here. */ hat_swapout(as->a_hat); /* * Call the swapout routines of all segments in the address * space to do the actual work, accumulating the amount of * space reclaimed. */ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { struct seg_ops *ov = seg->s_ops; /* * We have to check to see if the seg has * an ops vector because the seg may have * been in the middle of being set up when * the process was picked for swapout. */ if ((ov != NULL) && (ov->swapout != NULL)) swpcnt += SEGOP_SWAPOUT(seg); } AS_LOCK_EXIT(as); return (swpcnt); } /* * Determine whether data from the mappings in interval [addr, addr + size) * are in the primary memory (core) cache. 
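 */

/*
 * Caller-side sketch for as_incore() below, kept under #if 0.  The
 * mincore()-style consumer and the xx_count_resident() name are
 * hypothetical, and the test against SEG_PAGE_INCORE assumes the vector
 * convention from <vm/seg.h>.  One char of "vec" is filled in per page;
 * *sizep reports how many bytes were examined before a hole stopped the
 * walk.
 */
#if 0
static int
xx_count_resident(struct as *as, caddr_t addr, size_t len, size_t *residentp)
{
        size_t npages = btopr(len);
        char *vec = kmem_zalloc(npages, KM_SLEEP);
        size_t examined;
        int err;

        err = as_incore(as, addr, len, vec, &examined);
        if (err == 0) {
                size_t i, resident = 0;

                for (i = 0; i < npages; i++) {
                        if (vec[i] & SEG_PAGE_INCORE)
                                resident++;
                }
                *residentp = resident;
        }
        kmem_free(vec, npages);
        return (err);
}
#endif

/*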
*/ int as_incore(struct as *as, caddr_t addr, size_t size, char *vec, size_t *sizep) { struct seg *seg; size_t ssize; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ size_t isize; /* iteration size */ int error = 0; /* result, assume success */ *sizep = 0; raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; if (raddr + rsize < raddr) /* check for wraparound */ return (ENOMEM); AS_LOCK_ENTER(as, RW_READER); seg = as_segat(as, raddr); if (seg == NULL) { AS_LOCK_EXIT(as); return (-1); } for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { error = -1; break; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); if (isize != ssize) { error = -1; break; } vec += btopr(ssize); } AS_LOCK_EXIT(as); return (error); } static void as_segunlock(struct seg *seg, caddr_t addr, int attr, ulong_t *bitmap, size_t position, size_t npages) { caddr_t range_start; size_t pos1 = position; size_t pos2; size_t size; size_t end_pos = npages + position; while (bt_range(bitmap, &pos1, &pos2, end_pos)) { size = ptob((pos2 - pos1)); range_start = (caddr_t)((uintptr_t)addr + ptob(pos1 - position)); (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, (ulong_t *)NULL, (size_t)NULL); pos1 = pos2; } } static void as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, caddr_t raddr, size_t rsize) { struct seg *seg = as_segat(as, raddr); size_t ssize; while (rsize != 0) { if (raddr >= seg->s_base + seg->s_size) seg = AS_SEGNEXT(as, seg); if ((raddr + rsize) > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); rsize -= ssize; raddr += ssize; } } /* * Cache control operations over the interval [addr, addr + size) in * address space "as". */ /*ARGSUSED*/ int as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, uintptr_t arg, ulong_t *lock_map, size_t pos) { struct seg *seg; /* working segment */ caddr_t raddr; /* rounded down addr */ caddr_t initraddr; /* saved initial rounded down addr */ size_t rsize; /* rounded up size */ size_t initrsize; /* saved initial rounded up size */ size_t ssize; /* size of seg */ int error = 0; /* result */ size_t mlock_size; /* size of bitmap */ ulong_t *mlock_map; /* pointer to bitmap used */ /* to represent the locked */ /* pages. */ mlock_size = 0; mlock_map = NULL; retry: if (error == IE_RETRY) AS_LOCK_ENTER(as, RW_WRITER); else AS_LOCK_ENTER(as, RW_READER); /* * If these are address space lock/unlock operations, loop over * all segments in the address space, as appropriate. 
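 * MCL_FUTURE only sets AS_PGLCK here; it is as_map_locked() that honors
 * the flag later by MC_LOCKing each new mapping as it is created.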
*/ if (func == MC_LOCKAS) { size_t npages, idx; size_t rlen = 0; /* rounded as length */ idx = pos; if (arg & MCL_FUTURE) { mutex_enter(&as->a_contents); AS_SETPGLCK(as); mutex_exit(&as->a_contents); } if ((arg & MCL_CURRENT) == 0) { AS_LOCK_EXIT(as); return (0); } seg = AS_SEGFIRST(as); if (seg == NULL) { AS_LOCK_EXIT(as); return (0); } do { raddr = (caddr_t)((uintptr_t)seg->s_base & (uintptr_t)PAGEMASK); rlen += (((uintptr_t)(seg->s_base + seg->s_size) + PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; } while ((seg = AS_SEGNEXT(as, seg)) != NULL); mlock_size = BT_BITOUL(btopr(rlen)); if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * sizeof (ulong_t), KM_NOSLEEP)) == NULL) { AS_LOCK_EXIT(as); return (EAGAIN); } for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { if ((seg->s_flags & S_HOLE) != 0) { continue; } error = SEGOP_LOCKOP(seg, seg->s_base, seg->s_size, attr, MC_LOCK, mlock_map, pos); if (error != 0) break; pos += seg_pages(seg); } if (error) { for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { raddr = (caddr_t)((uintptr_t)seg->s_base & (uintptr_t)PAGEMASK); npages = seg_pages(seg); as_segunlock(seg, raddr, attr, mlock_map, idx, npages); idx += npages; } } kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); AS_LOCK_EXIT(as); goto lockerr; } else if (func == MC_UNLOCKAS) { mutex_enter(&as->a_contents); AS_CLRPGLCK(as); mutex_exit(&as->a_contents); for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { if ((seg->s_flags & S_HOLE) != 0) { continue; } error = SEGOP_LOCKOP(seg, seg->s_base, seg->s_size, attr, MC_UNLOCK, NULL, 0); if (error != 0) break; } AS_LOCK_EXIT(as); goto lockerr; } /* * Normalize addresses and sizes. */ initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - (size_t)raddr; if (raddr + rsize < raddr) { /* check for wraparound */ AS_LOCK_EXIT(as); return (ENOMEM); } /* * Get initial segment. */ if ((seg = as_segat(as, raddr)) == NULL) { AS_LOCK_EXIT(as); return (ENOMEM); } if (func == MC_LOCK) { mlock_size = BT_BITOUL(btopr(rsize)); if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * sizeof (ulong_t), KM_NOSLEEP)) == NULL) { AS_LOCK_EXIT(as); return (EAGAIN); } } /* * Loop over all segments. If a hole in the address range is * discovered, then fail. For each segment, perform the appropriate * control operation. */ while (rsize != 0) { /* * Make sure there's no hole, calculate the portion * of the next segment to be operated over. */ if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { if (func == MC_LOCK) { as_unlockerr(as, attr, mlock_map, initraddr, initrsize - rsize); kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); } AS_LOCK_EXIT(as); return (ENOMEM); } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) ssize = seg->s_base + seg->s_size - raddr; else ssize = rsize; /* * Dispatch on specific function. */ switch (func) { /* * Synchronize cached data from mappings with backing * objects. */ case MC_SYNC: if (error = SEGOP_SYNC(seg, raddr, ssize, attr, (uint_t)arg)) { AS_LOCK_EXIT(as); return (error); } break; /* * Lock pages in memory. */ case MC_LOCK: if (error = SEGOP_LOCKOP(seg, raddr, ssize, attr, func, mlock_map, pos)) { as_unlockerr(as, attr, mlock_map, initraddr, initrsize - rsize + ssize); kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); AS_LOCK_EXIT(as); goto lockerr; } break; /* * Unlock mapped pages. 
*/ case MC_UNLOCK: (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, (ulong_t *)NULL, (size_t)NULL); break; /* * Store VM advise for mapped pages in segment layer. */ case MC_ADVISE: error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); /* * Check for regular errors and special retry error */ if (error) { if (error == IE_RETRY) { /* * Need to acquire writers lock, so * have to drop readers lock and start * all over again */ AS_LOCK_EXIT(as); goto retry; } else if (error == IE_REATTACH) { /* * Find segment for current address * because current segment just got * split or concatenated */ seg = as_segat(as, raddr); if (seg == NULL) { AS_LOCK_EXIT(as); return (ENOMEM); } } else { /* * Regular error */ AS_LOCK_EXIT(as); return (error); } } break; case MC_INHERIT_ZERO: if (seg->s_ops->inherit == NULL) { error = ENOTSUP; } else { error = SEGOP_INHERIT(seg, raddr, ssize, SEGP_INH_ZERO); } if (error != 0) { AS_LOCK_EXIT(as); return (error); } break; /* * Can't happen. */ default: panic("as_ctl: bad operation %d", func); /*NOTREACHED*/ } rsize -= ssize; raddr += ssize; } if (func == MC_LOCK) kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); AS_LOCK_EXIT(as); return (0); lockerr: /* * If the lower levels returned EDEADLK for a segment lockop, * it means that we should retry the operation. Let's wait * a bit also to let the deadlock causing condition clear. * This is part of a gross hack to work around a design flaw * in the ufs/sds logging code and should go away when the * logging code is re-designed to fix the problem. See bug * 4125102 for details of the problem. */ if (error == EDEADLK) { delay(deadlk_wait); error = 0; goto retry; } return (error); } int fc_decode(faultcode_t fault_err) { int error = 0; switch (FC_CODE(fault_err)) { case FC_OBJERR: error = FC_ERRNO(fault_err); break; case FC_PROT: error = EACCES; break; default: error = EFAULT; break; } return (error); } /* * Pagelock pages from a range that spans more than 1 segment. Obtain shadow * lists from each segment and copy them to one contiguous shadow list (plist) * as expected by the caller. Save pointers to per segment shadow lists at * the tail of plist so that they can be used during as_pageunlock(). */ static int as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, caddr_t addr, size_t size, enum seg_rw rw) { caddr_t sv_addr = addr; size_t sv_size = size; struct seg *sv_seg = seg; ulong_t segcnt = 1; ulong_t cnt; size_t ssize; pgcnt_t npages = btop(size); page_t **plist; page_t **pl; int error; caddr_t eaddr; faultcode_t fault_err = 0; pgcnt_t pl_off; extern struct seg_ops segspt_shmops; ASSERT(AS_LOCK_HELD(as)); ASSERT(seg != NULL); ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); ASSERT(addr + size > seg->s_base + seg->s_size); ASSERT(IS_P2ALIGNED(size, PAGESIZE)); ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); /* * Count the number of segments covered by the range we are about to * lock. The segment count is used to size the shadow list we return * back to the caller. */ for (; size != 0; size -= ssize, addr += ssize) { if (addr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || addr != seg->s_base) { AS_LOCK_EXIT(as); return (EFAULT); } /* * Do a quick check if subsequent segments * will most likely support pagelock. 
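			 *
			 * Only anonymous segvn segments (those with no
			 * backing vnode) and SysV shared memory segments
			 * (segspt_shmops) are expected to handle L_PAGELOCK;
			 * anything else sends the whole request down the
			 * F_SOFTLOCK slow path.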
 */
			if (seg->s_ops == &segvn_ops) {
				vnode_t *vp;

				if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
				    vp != NULL) {
					AS_LOCK_EXIT(as);
					goto slow;
				}
			} else if (seg->s_ops != &segspt_shmops) {
				AS_LOCK_EXIT(as);
				goto slow;
			}
			segcnt++;
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
	}
	ASSERT(segcnt > 1);

	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);

	addr = sv_addr;
	size = sv_size;
	seg = sv_seg;

	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
		pl = &plist[npages + cnt];
		error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGELOCK, rw);
		if (error) {
			break;
		}
		ASSERT(plist[npages + cnt] != NULL);
		ASSERT(pl_off + btop(ssize) <= npages);
		bcopy(plist[npages + cnt], &plist[pl_off],
		    btop(ssize) * sizeof (page_t *));
		pl_off += btop(ssize);
	}

	if (size == 0) {
		AS_LOCK_EXIT(as);
		ASSERT(cnt == segcnt - 1);
		*ppp = plist;
		return (0);
	}

	/*
	 * One of the pagelock calls failed; the error type is in the error
	 * variable.  Unlock what we've locked so far and retry with
	 * F_SOFTLOCK if the error is either EFAULT or ENOTSUP.  Otherwise
	 * just return the error back to the caller.
	 */
	eaddr = addr;
	seg = sv_seg;
	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));

	if (error != ENOTSUP && error != EFAULT) {
		return (error);
	}

slow:
	/*
	 * If we are here because pagelock failed due to the need to cow-fault
	 * in the pages we want to lock, F_SOFTLOCK will do that job, and in
	 * the next as_pagelock() call for this address range pagelock will
	 * hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;
	return (0);
}

/*
 * Lock pages in a given address space and return the shadow list.  If
 * the list is NULL, the MMU mapping is also locked.
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
	    "as_pagelock_start: addr %p size %ld", addr, size);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * If the request crosses more than one segment, hand it off to
	 * as_pagelock_segs(), which builds one contiguous shadow list
	 * covering all of the segments (and itself falls back to
	 * as_fault() when it has to).
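	 *
	 * Either way the caller sees the same interface; a typical pairing
	 * looks roughly like this (illustration only; "uaddr", "len" and the
	 * surrounding error handling are hypothetical):
	 *
	 *	struct page **pplist;
	 *	int err;
	 *
	 *	if ((err = as_pagelock(as, &pplist, uaddr, len, S_WRITE)) != 0)
	 *		return (err);
	 *	... transfer data to or from the locked range ...
	 *	as_pageunlock(as, pplist, uaddr, len, S_WRITE);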
 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
	    "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);

	/*
	 * Try to lock pages and pass back the shadow list.
	 */
	err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
	AS_LOCK_EXIT(as);
	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed, either
	 * because this segment has no pagelock support or because the pages
	 * need to be cow-faulted in.  If a fault is needed, F_SOFTLOCK will
	 * do that job for this as_pagelock() call, and in the next
	 * as_pagelock() call for the same address range pagelock will
	 * hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
	return (0);
}

/*
 * Unlock pages locked by as_pagelock_segs().  Retrieve the per-segment
 * shadow lists from the end of plist and call the pageunlock interface
 * for each segment.  Drop the as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}
	ASSERT(cnt > 0);
	AS_LOCK_EXIT(as);

	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}

/*
 * Unlock pages in a given address range.
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	struct seg *seg;
	size_t rsize;
	caddr_t raddr;

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
	    "as_pageunlock_start: addr %p size %ld", addr, size);

	/*
	 * If the shadow list is NULL, as_pagelock() fell back to as_fault().
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
	    "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
		return;
	}
	AS_LOCK_EXIT(as);
	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END,
"as_pageunlock_end"); } int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, boolean_t wait) { struct seg *seg; size_t ssize; caddr_t raddr; /* rounded down addr */ size_t rsize; /* rounded up size */ int error = 0; size_t pgsz = page_get_pagesize(szc); setpgsz_top: if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { return (EINVAL); } raddr = addr; rsize = size; if (raddr + rsize < raddr) /* check for wraparound */ return (ENOMEM); AS_LOCK_ENTER(as, RW_WRITER); as_clearwatchprot(as, raddr, rsize); seg = as_segat(as, raddr); if (seg == NULL) { as_setwatch(as); AS_LOCK_EXIT(as); return (ENOMEM); } for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { error = ENOMEM; break; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) { ssize = seg->s_base + seg->s_size - raddr; } else { ssize = rsize; } retry: error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); if (error == IE_NOMEM) { error = EAGAIN; break; } if (error == IE_RETRY) { AS_LOCK_EXIT(as); goto setpgsz_top; } if (error == ENOTSUP) { error = EINVAL; break; } if (wait && (error == EAGAIN)) { /* * Memory is currently locked. It must be unlocked * before this operation can succeed through a retry. * The possible reasons for locked memory and * corresponding strategies for unlocking are: * (1) Normal I/O * wait for a signal that the I/O operation * has completed and the memory is unlocked. * (2) Asynchronous I/O * The aio subsystem does not unlock pages when * the I/O is completed. Those pages are unlocked * when the application calls aiowait/aioerror. * So, to prevent blocking forever, cv_broadcast() * is done to wake up aio_cleanup_thread. * Subsequently, segvn_reclaim will be called, and * that will do AS_CLRUNMAPWAIT() and wake us up. * (3) Long term page locking: * This is not relevant for as_setpagesize() * because we cannot change the page size for * driver memory. The attempt to do so will * fail with a different error than EAGAIN so * there's no need to trigger as callbacks like * as_unmap, as_setprot or as_free would do. */ mutex_enter(&as->a_contents); if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) { cv_broadcast(&as->a_cv); } AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as); while (AS_ISUNMAPWAIT(as)) { cv_wait(&as->a_cv, &as->a_contents); } } else { /* * We may have raced with * segvn_reclaim()/segspt_reclaim(). In this * case clean nounmapwait flag and retry since * softlockcnt in this segment may be already * 0. We don't drop as writer lock so our * number of retries without sleeping should * be very small. See segvn_reclaim() for * more comments. */ AS_CLRNOUNMAPWAIT(as); mutex_exit(&as->a_contents); goto retry; } mutex_exit(&as->a_contents); goto setpgsz_top; } else if (error != 0) { break; } } as_setwatch(as); AS_LOCK_EXIT(as); return (error); } /* * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments * in its chunk where s_szc is less than the szc we want to set. 
*/ static int as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, int *retry) { struct seg *seg; size_t ssize; int error; ASSERT(AS_WRITE_HELD(as)); seg = as_segat(as, raddr); if (seg == NULL) { panic("as_iset3_default_lpsize: no seg"); } for (; rsize != 0; rsize -= ssize, raddr += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { panic("as_iset3_default_lpsize: as changed"); } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) { ssize = seg->s_base + seg->s_size - raddr; } else { ssize = rsize; } if (szc > seg->s_szc) { error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); /* Only retry on EINVAL segments that have no vnode. */ if (error == EINVAL) { vnode_t *vp = NULL; if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && (SEGOP_GETVP(seg, raddr, &vp) != 0 || vp == NULL)) { *retry = 1; } else { *retry = 0; } } if (error) { return (error); } } } return (0); } /* * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the * pagesize on each segment in its range, but if any fails with EINVAL, * then it reduces the pagesizes to the next size in the bitmap and * retries as_iset3_default_lpsize(). The reason why the code retries * smaller allowed sizes on EINVAL is because (a) the anon offset may not * match the bigger sizes, and (b) it's hard to get this offset (to begin * with) to pass to map_pgszcvec(). */ static int as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, uint_t szcvec) { int error; int retry; ASSERT(AS_WRITE_HELD(as)); for (;;) { error = as_iset3_default_lpsize(as, addr, size, szc, &retry); if (error == EINVAL && retry) { szcvec &= ~(1 << szc); if (szcvec <= 1) { return (EINVAL); } szc = highbit(szcvec) - 1; } else { return (error); } } } /* * as_iset1_default_lpsize() breaks its chunk into areas where existing * segments have a smaller szc than we want to set. For each such area, * it calls as_iset2_default_lpsize() */ static int as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, uint_t szcvec) { struct seg *seg; size_t ssize; caddr_t setaddr = raddr; size_t setsize = 0; int set; int error; ASSERT(AS_WRITE_HELD(as)); seg = as_segat(as, raddr); if (seg == NULL) { panic("as_iset1_default_lpsize: no seg"); } if (seg->s_szc < szc) { set = 1; } else { set = 0; } for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { if (raddr >= seg->s_base + seg->s_size) { seg = AS_SEGNEXT(as, seg); if (seg == NULL || raddr != seg->s_base) { panic("as_iset1_default_lpsize: as changed"); } if (seg->s_szc >= szc && set) { ASSERT(setsize != 0); error = as_iset2_default_lpsize(as, setaddr, setsize, szc, szcvec); if (error) { return (error); } set = 0; } else if (seg->s_szc < szc && !set) { setaddr = raddr; setsize = 0; set = 1; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) { ssize = seg->s_base + seg->s_size - raddr; } else { ssize = rsize; } } error = 0; if (set) { ASSERT(setsize != 0); error = as_iset2_default_lpsize(as, setaddr, setsize, szc, szcvec); } return (error); } /* * as_iset_default_lpsize() breaks its chunk according to the size code bitmap * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each * chunk to as_iset1_default_lpsize(). */ static int as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, int type) { int rtype = (type & MAP_SHARED) ? 
	    MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
	    flags, rtype, 1);
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	size_t pgsz;
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	szcvec &= ~1;
	if (szcvec <= 1) {	/* skip if base page size */
		return (0);
	}

	/* Get the pagesize of the first larger page size. */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);
	nszc = szc;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(szc > 0);
			ASSERT(a < eaddr);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			ASSERT(szc > 0);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

/*
 * Set the default large page size for the range.  Called via memcntl with
 * page size set to 0.  as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;
	size_t rsize;
	size_t ssize;
	int rtype, rflags;
	int stype, sflags;
	int error;
	caddr_t setaddr;
	size_t setsize;
	int segvn;

	if (size == 0)
		return (0);

	AS_LOCK_ENTER(as, RW_WRITER);
again:
	error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	if (seg->s_ops == &segvn_ops) {
		rtype = SEGOP_GETTYPE(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
		segvn = 1;
	} else {
		segvn = 0;
	}
	setaddr = raddr;
	setsize = 0;

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
			if (seg->s_ops == &segvn_ops) {
				stype = SEGOP_GETTYPE(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
				    rtype != stype)) {
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);
					if (error) {
						break;
					}
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
				} else if (!segvn) {
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
					segvn = 1;
				}
			} else if (segvn) {
				/*
				 * The next segment is not segvn.
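				 * Flush the chunk collected so far before
				 * dropping out of segvn mode.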
*/ ASSERT(setsize != 0); error = as_iset_default_lpsize(as, setaddr, setsize, rflags, rtype); if (error) { break; } segvn = 0; } } if ((raddr + rsize) > (seg->s_base + seg->s_size)) { ssize = seg->s_base + seg->s_size - raddr; } else { ssize = rsize; } } if (error == 0 && segvn) { /* The last chunk when rsize == 0. */ ASSERT(setsize != 0); error = as_iset_default_lpsize(as, setaddr, setsize, rflags, rtype); } if (error == IE_RETRY) { goto again; } else if (error == IE_NOMEM) { error = EAGAIN; } else if (error == ENOTSUP) { error = EINVAL; } else if (error == EAGAIN) { mutex_enter(&as->a_contents); if (!AS_ISNOUNMAPWAIT(as)) { if (AS_ISUNMAPWAIT(as) == 0) { cv_broadcast(&as->a_cv); } AS_SETUNMAPWAIT(as); AS_LOCK_EXIT(as); while (AS_ISUNMAPWAIT(as)) { cv_wait(&as->a_cv, &as->a_contents); } mutex_exit(&as->a_contents); AS_LOCK_ENTER(as, RW_WRITER); } else { /* * We may have raced with * segvn_reclaim()/segspt_reclaim(). In this case * clean nounmapwait flag and retry since softlockcnt * in this segment may be already 0. We don't drop as * writer lock so our number of retries without * sleeping should be very small. See segvn_reclaim() * for more comments. */ AS_CLRNOUNMAPWAIT(as); mutex_exit(&as->a_contents); } goto again; } as_setwatch(as); AS_LOCK_EXIT(as); return (error); } /* * Setup all of the uninitialized watched pages that we can. */ void as_setwatch(struct as *as) { struct watched_page *pwp; struct seg *seg; caddr_t vaddr; uint_t prot; int err, retrycnt; if (avl_numnodes(&as->a_wpage) == 0) return; ASSERT(AS_WRITE_HELD(as)); for (pwp = avl_first(&as->a_wpage); pwp != NULL; pwp = AVL_NEXT(&as->a_wpage, pwp)) { retrycnt = 0; retry: vaddr = pwp->wp_vaddr; if (pwp->wp_oprot != 0 || /* already set up */ (seg = as_segat(as, vaddr)) == NULL || SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) continue; pwp->wp_oprot = prot; if (pwp->wp_read) prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); if (pwp->wp_write) prot &= ~PROT_WRITE; if (pwp->wp_exec) prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); if (err == IE_RETRY) { pwp->wp_oprot = 0; ASSERT(retrycnt == 0); retrycnt++; goto retry; } } pwp->wp_prot = prot; } } /* * Clear all of the watched pages in the address space. */ void as_clearwatch(struct as *as) { struct watched_page *pwp; struct seg *seg; caddr_t vaddr; uint_t prot; int err, retrycnt; if (avl_numnodes(&as->a_wpage) == 0) return; ASSERT(AS_WRITE_HELD(as)); for (pwp = avl_first(&as->a_wpage); pwp != NULL; pwp = AVL_NEXT(&as->a_wpage, pwp)) { retrycnt = 0; retry: vaddr = pwp->wp_vaddr; if (pwp->wp_oprot == 0 || /* not set up */ (seg = as_segat(as, vaddr)) == NULL) continue; if ((prot = pwp->wp_oprot) != pwp->wp_prot) { err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); if (err == IE_RETRY) { ASSERT(retrycnt == 0); retrycnt++; goto retry; } } pwp->wp_oprot = 0; pwp->wp_prot = 0; } } /* * Force a new setup for all the watched pages in the range. 
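 *
 * This runs when the underlying protections of a watched range change
 * (e.g. via as_setprot()): the watch protections are recomputed from the
 * new base protection, and wp_oprot is updated to remember it.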
*/ static void as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) { struct watched_page *pwp; struct watched_page tpw; caddr_t eaddr = addr + size; caddr_t vaddr; struct seg *seg; int err, retrycnt; uint_t wprot; avl_index_t where; if (avl_numnodes(&as->a_wpage) == 0) return; ASSERT(AS_WRITE_HELD(as)); tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); while (pwp != NULL && pwp->wp_vaddr < eaddr) { retrycnt = 0; vaddr = pwp->wp_vaddr; wprot = prot; if (pwp->wp_read) wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); if (pwp->wp_write) wprot &= ~PROT_WRITE; if (pwp->wp_exec) wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { retry: seg = as_segat(as, vaddr); if (seg == NULL) { panic("as_setwatchprot: no seg"); /*NOTREACHED*/ } err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); if (err == IE_RETRY) { ASSERT(retrycnt == 0); retrycnt++; goto retry; } } pwp->wp_oprot = prot; pwp->wp_prot = wprot; pwp = AVL_NEXT(&as->a_wpage, pwp); } } /* * Clear all of the watched pages in the range. */ static void as_clearwatchprot(struct as *as, caddr_t addr, size_t size) { caddr_t eaddr = addr + size; struct watched_page *pwp; struct watched_page tpw; uint_t prot; struct seg *seg; int err, retrycnt; avl_index_t where; if (avl_numnodes(&as->a_wpage) == 0) return; tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); ASSERT(AS_WRITE_HELD(as)); while (pwp != NULL && pwp->wp_vaddr < eaddr) { if ((prot = pwp->wp_oprot) != 0) { retrycnt = 0; if (prot != pwp->wp_prot) { retry: seg = as_segat(as, pwp->wp_vaddr); if (seg == NULL) continue; err = SEGOP_SETPROT(seg, pwp->wp_vaddr, PAGESIZE, prot); if (err == IE_RETRY) { ASSERT(retrycnt == 0); retrycnt++; goto retry; } } pwp->wp_oprot = 0; pwp->wp_prot = 0; } pwp = AVL_NEXT(&as->a_wpage, pwp); } } void as_signal_proc(struct as *as, k_siginfo_t *siginfo) { struct proc *p; mutex_enter(&pidlock); for (p = practive; p; p = p->p_next) { if (p->p_as == as) { mutex_enter(&p->p_lock); if (p->p_as == as) sigaddq(p, NULL, siginfo, KM_NOSLEEP); mutex_exit(&p->p_lock); } } mutex_exit(&pidlock); } /* * return memory object ID */ int as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) { struct seg *seg; int sts; AS_LOCK_ENTER(as, RW_READER); seg = as_segat(as, addr); if (seg == NULL) { AS_LOCK_EXIT(as); return (EFAULT); } /* * catch old drivers which may not support getmemid */ if (seg->s_ops->getmemid == NULL) { AS_LOCK_EXIT(as); return (ENODEV); } sts = SEGOP_GETMEMID(seg, addr, memidp); AS_LOCK_EXIT(as); return (sts); }
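
/*
 * A sketch of the intended as_getmemid() usage (illustrative only; the
 * calling context, "uaddr" and the error handling are hypothetical):
 *
 *	memid_t memid;
 *
 *	if (as_getmemid(p->p_as, uaddr, &memid) != 0)
 *		return (EFAULT);
 *	... use memid as a stable identity for the backing object ...
 */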