/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2021 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>

/*
 * Page handling structures.  This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.  The per-page structures point to the actual pages
 * and contain pointers to the user address for each mapped page.
 *
 * All data is protected by p->p_sc_lock.  Since this lock is
 * held while waiting for memory, schedctl_shared_alloc() should
 * not be called while holding p_lock.
 */

typedef struct sc_page_ctl {
	struct sc_page_ctl *spc_next;
	sc_shared_t	*spc_base;	/* base of kernel page */
	sc_shared_t	*spc_end;	/* end of usable space */
	ulong_t		*spc_map;	/* bitmap of allocated space on page */
	size_t		spc_space;	/* amount of space on page */
	caddr_t		spc_uaddr;	/* user-level address of the page */
	struct anon_map	*spc_amp;	/* anonymous memory structure */
} sc_page_ctl_t;

static size_t	sc_pagesize;	/* size of usable space on page */
static size_t	sc_bitmap_len;	/* # of bits in allocation bitmap */
static size_t	sc_bitmap_words; /* # of words in allocation bitmap */

/* Context ops */
static void	schedctl_save(void *);
static void	schedctl_restore(void *);
static void	schedctl_fork(void *, void *);

/* Functions for handling shared pages */
static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int	schedctl_getpage(struct anon_map **, caddr_t *);
static void	schedctl_freepage(struct anon_map *, caddr_t);

static const struct ctxop_template schedctl_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= schedctl_save,
	.ct_restore	= schedctl_restore,
	.ct_fork	= schedctl_fork,
};
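/*
 * Sizing example (illustrative only; the real values depend on PAGESIZE and
 * on sizeof (sc_shared_t) for the running kernel): with a 4K page and a
 * hypothetical 96-byte sc_shared_t, schedctl_init() below computes
 *
 *	sc_pagesize	= 4096 - (4096 % 96)	 = 4032 usable bytes
 *	sc_bitmap_len	= 4032 / 96		 = 42 slots per page
 *	sc_bitmap_words	= howmany(42, BT_NBIPUL) = 1 (64-bit kernel)
 *
 * so each shared page can hand out 42 sc_shared_t structures, tracked by a
 * one-word allocation bitmap.
 */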
/*
 * System call interface to scheduler activations.
 * This always operates on the current lwp.
 */
caddr_t
schedctl(void)
{
	kthread_t	*t = curthread;
	sc_shared_t	*ssp;
	uintptr_t	uaddr;
	int		error;

	if (t->t_schedctl == NULL) {
		/*
		 * Allocate and initialize the shared structure.
		 */
		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
			return ((caddr_t)(uintptr_t)set_errno(error));
		bzero(ssp, sizeof (*ssp));

		ctxop_install(t, &schedctl_ctxop_tpl, ssp);

		thread_lock(t);	/* protect against ts_tick and ts_update */
		t->t_schedctl = ssp;
		t->t_sc_uaddr = uaddr;
		ssp->sc_cid = t->t_cid;
		ssp->sc_cpri = t->t_cpri;
		ssp->sc_priority = DISP_PRIO(t);
		thread_unlock(t);
	}

	return ((caddr_t)t->t_sc_uaddr);
}

/*
 * Clean up scheduler activations state associated with an exiting
 * (or execing) lwp.  t is always the current thread.
 */
void
schedctl_lwp_cleanup(kthread_t *t)
{
	sc_shared_t	*ssp = t->t_schedctl;
	proc_t		*p = ttoproc(t);
	sc_page_ctl_t	*pagep;
	index_t		index;

	ASSERT(MUTEX_NOT_HELD(&p->p_lock));

	thread_lock(t);		/* protect against ts_tick and ts_update */
	t->t_schedctl = NULL;
	t->t_sc_uaddr = 0;
	thread_unlock(t);

	/*
	 * Remove the context op to avoid the final call to
	 * schedctl_save when switching away from this lwp.
	 */
	(void) ctxop_remove(t, &schedctl_ctxop_tpl, ssp);

	/*
	 * Do not unmap the shared page until the process exits.
	 * User-level library code relies on this for adaptive mutex locking.
	 */
	mutex_enter(&p->p_sc_lock);
	ssp->sc_state = SC_FREE;
	pagep = schedctl_page_lookup(ssp);
	index = (index_t)(ssp - pagep->spc_base);
	BT_CLEAR(pagep->spc_map, index);
	pagep->spc_space += sizeof (sc_shared_t);
	mutex_exit(&p->p_sc_lock);
}

/*
 * Clean up the list of schedctl shared pages for the process.
 * Called from exec() and exit() system calls.
 */
void
schedctl_proc_cleanup(void)
{
	proc_t *p = curproc;
	sc_page_ctl_t *pagep;
	sc_page_ctl_t *next;

	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
	ASSERT(curthread->t_schedctl == NULL);

	/*
	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
	 */
	pagep = p->p_pagep;
	p->p_pagep = NULL;
	while (pagep != NULL) {
		ASSERT(pagep->spc_space == sc_pagesize);
		next = pagep->spc_next;
		/*
		 * Unmap the user space and free the mapping structure.
		 */
		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
		kmem_free(pagep, sizeof (sc_page_ctl_t));
		pagep = next;
	}
}

/*
 * Called by resume just before switching away from the current thread.
 * Save new thread state.
 */
static void
schedctl_save(void *arg)
{
	sc_shared_t *ssp = arg;

	ssp->sc_state = curthread->t_state;
}

/*
 * Called by resume after switching to the current thread.
 * Save new thread state and CPU.
 */
static void
schedctl_restore(void *arg)
{
	sc_shared_t *ssp = arg;

	ssp->sc_state = SC_ONPROC;
	ssp->sc_cpu = CPU->cpu_id;
}

/*
 * On fork, remove inherited mappings from the child's address space.
 * The child's threads must call schedctl() to get new shared mappings.
 */
static void
schedctl_fork(void *parent, void *child)
{
	kthread_t *pt = parent, *ct = child;
	proc_t *pp = ttoproc(pt);
	proc_t *cp = ttoproc(ct);
	sc_page_ctl_t *pagep;

	ASSERT(ct->t_schedctl == NULL);

	/*
	 * Do this only once, whether we are doing fork1() or forkall().
	 * Don't do it at all if the child process is a child of vfork()
	 * because a child of vfork() borrows the parent's address space.
	 */
	if (pt != curthread || (cp->p_flag & SVFORK))
		return;

	mutex_enter(&pp->p_sc_lock);
	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
	mutex_exit(&pp->p_sc_lock);
}
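/*
 * Preemption control.
 *
 * Illustrative (non-normative) sketch of the user-level side of the
 * sc_preemptctl protocol manipulated by the functions below, in terms of
 * the public schedctl_init(3SCHED) interfaces; the real libc implementation
 * may differ in detail:
 *
 *	schedctl_t *scp = schedctl_init();	// maps this lwp's shared data
 *
 *	schedctl_start(scp);	// sets sc_nopreempt around a short
 *				// critical section (e.g. while holding
 *				// a spin lock)
 *	...
 *	schedctl_stop(scp);	// clears sc_nopreempt and yields the CPU
 *				// if the kernel set sc_yield meanwhile
 *
 * While sc_nopreempt is set, the TS/IA dispatcher code (ts_preempt(),
 * ts_tick()) defers preemption for a short time and sets sc_yield so that
 * the thread gives up the CPU promptly once it leaves the critical section.
 */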
/*
 * Returns non-zero if the specified thread shouldn't be preempted at this
 * time.  Called by ts_preempt(), ts_tick(), and ts_update().
 */
int
schedctl_get_nopreempt(kthread_t *t)
{
	ASSERT(THREAD_LOCK_HELD(t));
	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
}

/*
 * Sets the value of the nopreempt field for the specified thread.
 * Called by ts_preempt() to clear the field on preemption.
 */
void
schedctl_set_nopreempt(kthread_t *t, short val)
{
	ASSERT(THREAD_LOCK_HELD(t));
	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
}

/*
 * Sets the value of the yield field for the specified thread.
 * Called by ts_preempt() and ts_tick() to set the field, and
 * ts_yield() to clear it.
 * The kernel never looks at this field so we don't need a
 * schedctl_get_yield() function.
 */
void
schedctl_set_yield(kthread_t *t, short val)
{
	ASSERT(THREAD_LOCK_HELD(t));
	t->t_schedctl->sc_preemptctl.sc_yield = val;
}

/*
 * Sets the values of the cid and priority fields for the specified thread.
 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
 */
void
schedctl_set_cidpri(kthread_t *t)
{
	sc_shared_t *tdp = t->t_schedctl;

	if (tdp != NULL) {
		tdp->sc_cid = t->t_cid;
		tdp->sc_cpri = t->t_cpri;
		tdp->sc_priority = DISP_PRIO(t);
	}
}

/*
 * Returns non-zero if the specified thread has requested that all
 * signals be blocked.  Called by signal-related code that tests
 * the signal mask of a thread that may not be the current thread
 * and where the process's p_lock cannot be acquired.
 */
int
schedctl_sigblock(kthread_t *t)
{
	sc_shared_t *tdp = t->t_schedctl;

	if (tdp != NULL)
		return (tdp->sc_sigblock);
	return (0);
}

/*
 * If the sc_sigblock field is set for the specified thread, set its signal
 * mask to block all maskable signals, then clear the sc_sigblock field.  This
 * accomplishes what user-level code requested to be done when it set
 * tdp->sc_shared->sc_sigblock non-zero.
 *
 * This is generally called by signal-related code in the current thread.  In
 * order to call against a thread other than curthread, p_lock for the
 * containing process must be held.  Even then, the caller is not protected
 * from races with the thread in question updating its own fields.  It is the
 * responsibility of the caller to perform additional synchronization.
 */
void
schedctl_finish_sigblock(kthread_t *t)
{
	sc_shared_t *tdp = t->t_schedctl;

	ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));

	if (tdp != NULL && tdp->sc_sigblock) {
		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
		t->t_hold.__sigbits[2] = FILLSET2 & ~CANTMASK2;
		tdp->sc_sigblock = 0;
	}
}

/*
 * Return non-zero if the current thread has declared that it has
 * a cancellation pending and that cancellation is not disabled.
 * If SIGCANCEL is blocked, we must be going over the wire in an
 * NFS transaction (sigintr() was called); return zero in this case.
 */
int
schedctl_cancel_pending(void)
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp != NULL &&
	    (tdp->sc_flgs & SC_CANCEL_FLG) &&
	    !tdp->sc_sigblock &&
	    !sigismember(&curthread->t_hold, SIGCANCEL))
		return (1);
	return (0);
}
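/*
 * Illustrative (non-normative) sketch of the sc_sigblock fast path used by
 * schedctl_sigblock() and schedctl_finish_sigblock() above: user-level code
 * can ask for all of its signals to be treated as blocked by storing to the
 * shared field, avoiding a sigprocmask() system call on every entry to a
 * critical region.  Names other than sc_sigblock are illustrative only.
 *
 *	volatile sc_shared_t *scp = ...;	// per-thread schedctl data
 *
 *	scp->sc_sigblock = 1;	// "treat my signals as blocked"
 *	... short region that must not be interrupted by a signal handler ...
 *	// Leaving the region is handled by libc in cooperation with
 *	// schedctl_finish_sigblock(), which converts the hint into a real
 *	// t_hold mask if the kernel examines signals while the hint is set.
 *
 * Note that schedctl_cancel_pending() above also honors the hint: it reports
 * no pending cancellation while sc_sigblock is set.
 */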
/*
 * Inform libc that the kernel returned EINTR from some system call
 * due to there being a cancellation pending (SC_CANCEL_FLG set or
 * we received an SI_LWP SIGCANCEL while in a system call), rather
 * than because of some other signal.  User-level code can try to
 * recover from receiving other signals, but it can't recover from
 * being cancelled.
 */
void
schedctl_cancel_eintr(void)
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp != NULL)
		tdp->sc_flgs |= SC_EINTR_FLG;
}

/*
 * Return non-zero if the current thread has declared that
 * it is calling into the kernel to park, else return zero.
 */
int
schedctl_is_park(void)
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp != NULL)
		return ((tdp->sc_flgs & SC_PARK_FLG) != 0);

	/*
	 * If we're here and there is no shared memory (how could
	 * that happen?) then just assume we really are here to park.
	 */
	return (1);
}

/*
 * Declare thread is parking.
 *
 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
 * in order to declare that the thread is calling into the kernel to park.
 *
 * This interface exists ONLY to support older versions of libthread which
 * are not aware of the SC_PARK_FLG flag.  Such older versions need to be
 * modified or emulated to call lwpsys_park(4, ...) instead of
 * lwpsys_park(0, ...).  This will invoke schedctl_set_park() before
 * lwp_park() to declare that the thread is parking.
 */
void
schedctl_set_park(void)
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp != NULL)
		tdp->sc_flgs |= SC_PARK_FLG;
}

/*
 * Clear the parking flag on return from parking in the kernel.
 */
void
schedctl_unpark(void)
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp != NULL)
		tdp->sc_flgs &= ~SC_PARK_FLG;
}

/*
 * Page handling code.
 */

void
schedctl_init(void)
{
	/*
	 * Amount of page that can hold sc_shared_t structures.  If
	 * sizeof (sc_shared_t) is a power of 2, this should just be
	 * PAGESIZE.
	 */
	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));

	/*
	 * Allocation bitmap is one bit per struct on a page.
	 */
	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
}

static int
schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
{
	proc_t		*p = curproc;
	sc_page_ctl_t	*pagep;
	sc_shared_t	*ssp;
	caddr_t		base;
	index_t		index;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
	mutex_enter(&p->p_sc_lock);

	/*
	 * Try to find space for the new data in existing pages
	 * within the process's list of shared pages.
	 */
	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
		if (pagep->spc_space != 0)
			break;

	if (pagep != NULL)
		base = pagep->spc_uaddr;
	else {
		struct anon_map	*amp;
		caddr_t		kaddr;

		/*
		 * No room, need to allocate a new page.  Also set up
		 * a mapping to the kernel address space for the new
		 * page and lock it in memory.
		 */
		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
			mutex_exit(&p->p_sc_lock);
			return (error);
		}
		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
			schedctl_freepage(amp, kaddr);
			mutex_exit(&p->p_sc_lock);
			return (error);
		}

		/*
		 * Allocate and initialize the page control structure.
		 */
		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
		pagep->spc_amp = amp;
		pagep->spc_base = (sc_shared_t *)kaddr;
		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
		pagep->spc_uaddr = base;

		pagep->spc_map = kmem_zalloc(
		    sizeof (ulong_t) * sc_bitmap_words, KM_SLEEP);
		pagep->spc_space = sc_pagesize;

		pagep->spc_next = p->p_pagep;
		p->p_pagep = pagep;
	}

	/*
	 * Got a page, now allocate space for the data.  There should
	 * be space unless something's wrong.
	 */
	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
	ASSERT(index != -1);

	/*
	 * Get the location with pointer arithmetic (spc_base is of type
	 * sc_shared_t *) and mark the slot as allocated.
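	 *
	 * For illustration (struct size hypothetical): with index 3 and a
	 * 96-byte sc_shared_t, ssp points 288 bytes into the kernel page,
	 * and the user address computed below is base + 288, i.e. the same
	 * offset within the process's mapping of that page.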
	 */
	ssp = pagep->spc_base + index;
	BT_SET(pagep->spc_map, index);
	pagep->spc_space -= sizeof (sc_shared_t);
	mutex_exit(&p->p_sc_lock);

	/*
	 * Return kernel and user addresses.
	 */
	*kaddrp = ssp;
	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
	return (0);
}

/*
 * Find the page control structure corresponding to a kernel address.
 */
static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t *ssp)
{
	proc_t *p = curproc;
	sc_page_ctl_t *pagep;

	ASSERT(MUTEX_HELD(&p->p_sc_lock));
	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
			return (pagep);
	}
	return (NULL);		/* This "can't happen".  Should we panic? */
}

/*
 * This function is called when a page needs to be mapped into a
 * process's address space.  Allocate the user address space and
 * set up the mapping to the page.  Assumes the page has already
 * been allocated and locked in memory via schedctl_getpage.
 */
static int
schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
{
	caddr_t addr = NULL;
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int error;

	as_rangelock(as);
	/* pass address of kernel mapping as offset to avoid VAC conflicts */
	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
	if (addr == NULL) {
		as_rangeunlock(as);
		return (ENOMEM);
	}

	/*
	 * Use segvn to set up the mapping to the page.
	 */
	vn_a.vp = NULL;
	vn_a.offset = 0;
	vn_a.cred = NULL;
	vn_a.type = MAP_SHARED;
	vn_a.prot = vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.amp = amp;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;
	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
	as_rangeunlock(as);

	if (error)
		return (error);
	*uaddrp = addr;
	return (0);
}

/*
 * Allocate a new page from anonymous memory.  Also, create a kernel
 * mapping to the page and lock the page in memory.
 */
static int
schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
{
	struct anon_map *amp;
	caddr_t kaddr;

	/*
	 * Set up anonymous memory struct.  No swap reservation is
	 * needed since the page will be locked into memory.
	 */
	amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);

	/*
	 * Allocate the page.
	 */
	kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
	    KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
	if (kaddr == NULL) {
		amp->refcnt--;
		anonmap_free(amp);
		return (ENOMEM);
	}

	/*
	 * The page is left SE_SHARED locked so that it won't be
	 * paged out or relocated (KPD_LOCKED above).
	 */

	*newamp = amp;
	*newaddr = kaddr;
	return (0);
}

/*
 * Take the necessary steps to allow a page to be released.
 * This is called when the process is doing exit() or exec().
 * There should be no accesses to the page after this.
 * The kernel mapping of the page is released and the page is unlocked.
 */
static void
schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
{
	/*
	 * Release the lock on the page and remove the kernel mapping.
	 */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	segkp_release(segkp, kaddr);

	/*
	 * Decrement the refcnt so the anon_map structure will be freed.
	 */
	if (--amp->refcnt == 0) {
		/*
		 * The current process no longer has the page mapped, so
		 * we have to free everything rather than letting as_free
		 * do the work.
		 */
		anonmap_purge(amp);
		anon_free(amp->ahp, 0, PAGESIZE);
		ANON_LOCK_EXIT(&amp->a_rwlock);
		anonmap_free(amp);
	} else {
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
}
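/*
 * Illustrative (non-normative) sketch of the adaptive-mutex use mentioned in
 * schedctl_lwp_cleanup() above, which is why shared pages stay mapped until
 * the process exits: a user-level mutex implementation can keep a pointer to
 * the lock owner's shared data and spin only while the owner is actually
 * running on a CPU (sc_state == SC_ONPROC, maintained by schedctl_restore()
 * and schedctl_save()).  All names other than sc_shared_t fields are
 * illustrative.
 *
 *	volatile sc_shared_t *owner_scp = mutex_owner_schedctl(mp);
 *
 *	while (mutex_is_held(mp)) {
 *		if (owner_scp == NULL || owner_scp->sc_state != SC_ONPROC)
 *			return (block_on_mutex(mp));	// owner off CPU
 *		spin_pause();				// owner on CPU: spin
 *	}
 */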