/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/callo.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/kmem_impl.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/sysmacros.h>
#include <sys/sdt.h>

int callout_init_done;				/* useful during boot */

/*
 * Callout tables. See timeout(9F) for details.
 */
static int callout_threads;			/* callout normal threads */
static hrtime_t callout_debug_hrtime;		/* debugger entry time */
static int callout_chunk;			/* callout heap chunk size */
static int callout_min_reap;			/* callout minimum reap count */
static int callout_tolerance;			/* callout hires tolerance */
static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
static clock_t callout_max_ticks;		/* max interval */
static hrtime_t callout_longterm;		/* longterm nanoseconds */
static ulong_t callout_counter_low;		/* callout ID increment */
static ulong_t callout_table_bits;		/* number of table bits in ID */
static ulong_t callout_table_mask;		/* mask for the table bits */

static callout_cache_t *callout_caches;		/* linked list of caches */

#pragma align 64(callout_table)
static callout_table_t *callout_table;		/* global callout table array */

/*
 * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
 * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
 * via taskq, to a thread that executes at PIL 0 - so we end up running
 * 'normal' callouts at PIL 0.
 */
static volatile int callout_realtime_level = CY_LOW_LEVEL;
static volatile int callout_normal_level = CY_LOCK_LEVEL;

static char *callout_kstat_names[] = {
	"callout_timeouts",
	"callout_timeouts_pending",
	"callout_untimeouts_unexpired",
	"callout_untimeouts_executing",
	"callout_untimeouts_expired",
	"callout_expirations",
	"callout_allocations",
	"callout_cleanups",
};

static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);

#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cprev = NULL;				\
	cp->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = cp;			\
	else						\
		cp->cnext->cprev = cp;			\
	hashp->ch_head = cp;				\
}

#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cnext = NULL;				\
	cp->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = cp;			\
	else						\
		cp->cprev->cnext = cp;			\
	hashp->ch_tail = cp;				\
}

#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if (cp->cnext == NULL)				\
		hashp->ch_tail = cp->cprev;		\
	else						\
		cp->cnext->cprev = cp->cprev;		\
	if (cp->cprev == NULL)				\
		hashp->ch_head = cp->cnext;		\
	else						\
		cp->cprev->cnext = cp->cnext;		\
}

/*
 * These definitions help us queue callouts and callout lists.
Here is * the queueing rationale: * * - callouts are queued in a FIFO manner in the ID hash table. * TCP timers are typically cancelled in the same order that they * were issued. The FIFO queueing shortens the search for a callout * during untimeout(). * * - callouts are queued in a FIFO manner in their callout lists. * This ensures that the callouts are executed in the same order that * they were queued. This is fair. Plus, it helps to make each * callout expiration timely. It also favors cancellations. * * - callout lists are queued in the following manner in the callout * hash table buckets: * * - appended, if the callout list is a 1-nanosecond resolution * callout list. When a callout is created, we first look for * a callout list that has the same expiration so we can avoid * allocating a callout list and inserting the expiration into * the heap. However, we do not want to look at 1-nanosecond * resolution callout lists as we will seldom find a match in * them. Keeping these callout lists in the rear of the hash * buckets allows us to skip these during the lookup. * * - inserted at the beginning, if the callout list is not a * 1-nanosecond resolution callout list. This also has the * side-effect of keeping the long term timers away from the * front of the buckets. * * - callout lists are queued in a FIFO manner in the expired callouts * list. This ensures that callout lists are executed in the order * of expiration. */ #define CALLOUT_APPEND(ct, cp) \ CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \ cp, c_idnext, c_idprev); \ CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev) #define CALLOUT_DELETE(ct, cp) \ CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \ cp, c_idnext, c_idprev); \ CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev) #define CALLOUT_LIST_INSERT(hash, cl) \ CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev) #define CALLOUT_LIST_APPEND(hash, cl) \ CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev) #define CALLOUT_LIST_DELETE(hash, cl) \ CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev) #define CALLOUT_LIST_BEFORE(cl, nextcl) \ { \ (cl)->cl_prev = (nextcl)->cl_prev; \ (cl)->cl_next = (nextcl); \ (nextcl)->cl_prev = (cl); \ if (cl->cl_prev != NULL) \ cl->cl_prev->cl_next = cl; \ } /* * For normal callouts, there is a deadlock scenario if two callouts that * have an inter-dependency end up on the same callout list. To break the * deadlock, you need two taskq threads running in parallel. We compute * the number of taskq threads here using a bunch of conditions to make * it optimal for the common case. This is an ugly hack, but one that is * necessary (sigh). */ #define CALLOUT_THRESHOLD 100000000 #define CALLOUT_EXEC_COMPUTE(ct, nextexp, exec) \ { \ callout_list_t *cl; \ \ cl = ct->ct_expired.ch_head; \ if (cl == NULL) { \ /* \ * If the expired list is NULL, there is nothing to \ * process. \ */ \ exec = 0; \ } else if ((cl->cl_next == NULL) && \ (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) { \ /* \ * If there is only one callout list and it contains \ * only one callout, there is no need for two threads. \ */ \ exec = 1; \ } else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) { \ /* \ * If the next expiration of the cyclic is way out into \ * the future, we need two threads. \ */ \ exec = 2; \ } else { \ /* \ * We have multiple callouts to process. But the cyclic \ * will fire in the near future. So, we only need one \ * thread for now. \ */ \ exec = 1; \ } \ } /* * Macro to swap two heap items. 
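 * It is used by callout_upheap() and callout_downheap() below to restore
 * the heap property after inserting or deleting an expiration.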
*/ #define CALLOUT_SWAP(h1, h2) \ { \ callout_heap_t tmp; \ \ tmp = *h1; \ *h1 = *h2; \ *h2 = tmp; \ } /* * Macro to free a callout list. */ #define CALLOUT_LIST_FREE(ct, cl) \ { \ cl->cl_next = ct->ct_lfree; \ ct->ct_lfree = cl; \ cl->cl_flags |= CALLOUT_LIST_FLAG_FREE; \ } /* * Macro to free a callout. */ #define CALLOUT_FREE(ct, cl) \ { \ cp->c_idnext = ct->ct_free; \ ct->ct_free = cp; \ cp->c_xid |= CALLOUT_ID_FREE; \ } /* * Allocate a callout structure. We try quite hard because we * can't sleep, and if we can't do the allocation, we're toast. * Failing all, we try a KM_PANIC allocation. Note that we never * deallocate a callout. See untimeout() for the reasoning. */ static callout_t * callout_alloc(callout_table_t *ct) { size_t size; callout_t *cp; ASSERT(MUTEX_HELD(&ct->ct_mutex)); mutex_exit(&ct->ct_mutex); cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP); if (cp == NULL) { size = sizeof (callout_t); cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC); } cp->c_xid = 0; cp->c_executor = NULL; cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL); cp->c_waiting = 0; mutex_enter(&ct->ct_mutex); ct->ct_allocations++; return (cp); } /* * Allocate a callout list structure. We try quite hard because we * can't sleep, and if we can't do the allocation, we're toast. * Failing all, we try a KM_PANIC allocation. Note that we never * deallocate a callout list. */ static void callout_list_alloc(callout_table_t *ct) { size_t size; callout_list_t *cl; ASSERT(MUTEX_HELD(&ct->ct_mutex)); mutex_exit(&ct->ct_mutex); cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP); if (cl == NULL) { size = sizeof (callout_list_t); cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC); } bzero(cl, sizeof (callout_list_t)); mutex_enter(&ct->ct_mutex); CALLOUT_LIST_FREE(ct, cl); } /* * Find a callout list that corresponds to an expiration and matching flags. */ static callout_list_t * callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash) { callout_list_t *cl; int clflags; ASSERT(MUTEX_HELD(&ct->ct_mutex)); if (flags & CALLOUT_LIST_FLAG_NANO) { /* * This is a 1-nanosecond resolution callout. We will rarely * find a match for this. So, bail out. */ return (NULL); } clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME); for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) { /* * If we have reached a 1-nanosecond resolution callout list, * we don't have much hope of finding a match in this hash * bucket. So, just bail out. */ if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) return (NULL); if ((cl->cl_expiration == expiration) && ((cl->cl_flags & clflags) == (flags & clflags))) return (cl); } return (NULL); } /* * Add a new callout list into a callout table's queue in sorted order by * expiration. */ static int callout_queue_add(callout_table_t *ct, callout_list_t *cl) { callout_list_t *nextcl; hrtime_t expiration; expiration = cl->cl_expiration; nextcl = ct->ct_queue.ch_head; if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) { CALLOUT_LIST_INSERT(ct->ct_queue, cl); return (1); } while (nextcl != NULL) { if (expiration < nextcl->cl_expiration) { CALLOUT_LIST_BEFORE(cl, nextcl); return (0); } nextcl = nextcl->cl_next; } CALLOUT_LIST_APPEND(ct->ct_queue, cl); return (0); } /* * Insert a callout list into a callout table's queue and reprogram the queue * cyclic if needed. */ static void callout_queue_insert(callout_table_t *ct, callout_list_t *cl) { cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED; /* * Add the callout to the callout queue. 
If it ends up at the head, * the cyclic needs to be reprogrammed as we have an earlier * expiration. * * Also, during the CPR suspend phase, do not reprogram the cyclic. * We don't want any callout activity. When the CPR resume phase is * entered, the cyclic will be programmed for the earliest expiration * in the queue. */ if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0)) (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration); } /* * Delete and handle all past expirations in a callout table's queue. */ static hrtime_t callout_queue_delete(callout_table_t *ct) { callout_list_t *cl; hrtime_t now; ASSERT(MUTEX_HELD(&ct->ct_mutex)); now = gethrtime(); while ((cl = ct->ct_queue.ch_head) != NULL) { if (cl->cl_expiration > now) break; cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED; CALLOUT_LIST_DELETE(ct->ct_queue, cl); CALLOUT_LIST_APPEND(ct->ct_expired, cl); } /* * If this callout queue is empty or callouts have been suspended, * just return. */ if ((cl == NULL) || (ct->ct_suspend > 0)) return (CY_INFINITY); (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration); return (cl->cl_expiration); } static hrtime_t callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange) { callout_list_t *firstcl, *cl; hrtime_t expiration, now; int clflags; callout_hash_t temp; ASSERT(MUTEX_HELD(&ct->ct_mutex)); firstcl = ct->ct_queue.ch_head; if (firstcl == NULL) return (CY_INFINITY); /* * We walk the callout queue. If we encounter a hrestime entry that * must be removed, we clean it out. Otherwise, we apply any * adjustments needed to it. Because of the latter, we need to * recreate the list as we go along. */ temp = ct->ct_queue; ct->ct_queue.ch_head = NULL; ct->ct_queue.ch_tail = NULL; clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE); now = gethrtime(); while ((cl = temp.ch_head) != NULL) { CALLOUT_LIST_DELETE(temp, cl); /* * Delete the callout and expire it, if one of the following * is true: * - the callout has expired * - the callout is an absolute hrestime one and * there has been a system time change */ if ((cl->cl_expiration <= now) || (timechange && ((cl->cl_flags & clflags) == clflags))) { cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED; CALLOUT_LIST_APPEND(ct->ct_expired, cl); continue; } /* * Apply adjustments, if any. Adjustments are applied after * the system returns from KMDB or OBP. They are only applied * to relative callout lists. */ if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) { expiration = cl->cl_expiration + delta; if (expiration <= 0) expiration = CY_INFINITY; cl->cl_expiration = expiration; } (void) callout_queue_add(ct, cl); } /* * We need to return the expiration to help program the cyclic. * If there are expired callouts, the cyclic needs to go off * immediately. If the queue has become empty, then we return infinity. * Else, we return the expiration of the earliest callout in the queue. */ if (ct->ct_expired.ch_head != NULL) return (gethrtime()); cl = ct->ct_queue.ch_head; if (cl == NULL) return (CY_INFINITY); return (cl->cl_expiration); } /* * Initialize a callout table's heap, if necessary. Preallocate some free * entries so we don't have to check for NULL elsewhere. */ static void callout_heap_init(callout_table_t *ct) { size_t size; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap == NULL); ct->ct_heap_num = 0; ct->ct_heap_max = callout_chunk; size = sizeof (callout_heap_t) * callout_chunk; ct->ct_heap = kmem_alloc(size, KM_SLEEP); } /* * Reallocate the heap. Return 0 if the heap is still full at the end of it. 
* Return 1 otherwise. Note that the heap only expands, it never contracts. */ static int callout_heap_expand(callout_table_t *ct) { size_t max, size, osize; callout_heap_t *heap; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num <= ct->ct_heap_max); while (ct->ct_heap_num == ct->ct_heap_max) { max = ct->ct_heap_max; mutex_exit(&ct->ct_mutex); osize = sizeof (callout_heap_t) * max; size = sizeof (callout_heap_t) * (max + callout_chunk); heap = kmem_alloc(size, KM_NOSLEEP); mutex_enter(&ct->ct_mutex); if (heap == NULL) { /* * We could not allocate memory. If we can free up * some entries, that would be great. */ if (ct->ct_nreap > 0) (void) callout_heap_process(ct, 0, 0); /* * If we still have no space in the heap, inform the * caller. */ if (ct->ct_heap_num == ct->ct_heap_max) return (0); return (1); } if (max < ct->ct_heap_max) { /* * Someone beat us to the allocation. Free what we * just allocated and proceed. */ kmem_free(heap, size); continue; } bcopy(ct->ct_heap, heap, osize); kmem_free(ct->ct_heap, osize); ct->ct_heap = heap; ct->ct_heap_max = size / sizeof (callout_heap_t); } return (1); } /* * Move an expiration from the bottom of the heap to its correct place * in the heap. If we reached the root doing this, return 1. Else, * return 0. */ static int callout_upheap(callout_table_t *ct) { int current, parent; callout_heap_t *heap, *hcurrent, *hparent; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num >= 1); if (ct->ct_heap_num == 1) { return (1); } heap = ct->ct_heap; current = ct->ct_heap_num - 1; for (;;) { parent = CALLOUT_HEAP_PARENT(current); hparent = &heap[parent]; hcurrent = &heap[current]; /* * We have an expiration later than our parent; we're done. */ if (hcurrent->ch_expiration >= hparent->ch_expiration) { return (0); } /* * We need to swap with our parent, and continue up the heap. */ CALLOUT_SWAP(hparent, hcurrent); /* * If we just reached the root, we're done. */ if (parent == 0) { return (1); } current = parent; } /*NOTREACHED*/ } /* * Insert a new heap item into a callout table's heap. */ static void callout_heap_insert(callout_table_t *ct, callout_list_t *cl) { ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num < ct->ct_heap_max); cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED; /* * First, copy the expiration and callout list pointer to the bottom * of the heap. */ ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration; ct->ct_heap[ct->ct_heap_num].ch_list = cl; ct->ct_heap_num++; /* * Now, perform an upheap operation. If we reached the root, then * the cyclic needs to be reprogrammed as we have an earlier * expiration. * * Also, during the CPR suspend phase, do not reprogram the cyclic. * We don't want any callout activity. When the CPR resume phase is * entered, the cyclic will be programmed for the earliest expiration * in the heap. */ if (callout_upheap(ct) && (ct->ct_suspend == 0)) (void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration); } /* * Move an expiration from the top of the heap to its correct place * in the heap. */ static void callout_downheap(callout_table_t *ct) { int current, left, right, nelems; callout_heap_t *heap, *hleft, *hright, *hcurrent; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num >= 1); heap = ct->ct_heap; current = 0; nelems = ct->ct_heap_num; for (;;) { /* * If we don't have a left child (i.e., we're a leaf), we're * done. 
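 *
 * (The heap is an implicit binary tree laid out in the ct_heap array;
 * CALLOUT_HEAP_PARENT(), CALLOUT_HEAP_LEFT() and CALLOUT_HEAP_RIGHT()
 * presumably use the conventional index arithmetic, parent(i) = (i - 1) / 2,
 * left(i) = 2 * i + 1, right(i) = 2 * i + 2, so a node without a left
 * child cannot have a right child either.)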
*/ if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems) return; hleft = &heap[left]; hcurrent = &heap[current]; right = CALLOUT_HEAP_RIGHT(current); /* * Even if we don't have a right child, we still need to compare * our expiration against that of our left child. */ if (right >= nelems) goto comp_left; hright = &heap[right]; /* * We have both a left and a right child. We need to compare * the expiration of the children to determine which * expires earlier. */ if (hright->ch_expiration < hleft->ch_expiration) { /* * Our right child is the earlier of our children. * We'll now compare our expiration to its expiration. * If ours is the earlier one, we're done. */ if (hcurrent->ch_expiration <= hright->ch_expiration) return; /* * Our right child expires earlier than we do; swap * with our right child, and descend right. */ CALLOUT_SWAP(hright, hcurrent); current = right; continue; } comp_left: /* * Our left child is the earlier of our children (or we have * no right child). We'll now compare our expiration * to its expiration. If ours is the earlier one, we're done. */ if (hcurrent->ch_expiration <= hleft->ch_expiration) return; /* * Our left child expires earlier than we do; swap with our * left child, and descend left. */ CALLOUT_SWAP(hleft, hcurrent); current = left; } } /* * Delete and handle all past expirations in a callout table's heap. */ static hrtime_t callout_heap_delete(callout_table_t *ct) { hrtime_t now, expiration, next; callout_list_t *cl; callout_heap_t *heap; int hash; ASSERT(MUTEX_HELD(&ct->ct_mutex)); if (CALLOUT_CLEANUP(ct)) { /* * There are too many heap elements pointing to empty callout * lists. Clean them out. */ (void) callout_heap_process(ct, 0, 0); } now = gethrtime(); heap = ct->ct_heap; while (ct->ct_heap_num > 0) { expiration = heap->ch_expiration; hash = CALLOUT_CLHASH(expiration); cl = heap->ch_list; ASSERT(expiration == cl->cl_expiration); if (cl->cl_callouts.ch_head == NULL) { /* * If the callout list is empty, reap it. * Decrement the reap count. */ CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); CALLOUT_LIST_FREE(ct, cl); ct->ct_nreap--; } else { /* * If the root of the heap expires in the future, * bail out. */ if (expiration > now) break; /* * Move the callout list for this expiration to the * list of expired callout lists. It will be processed * by the callout executor. */ cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED; CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); CALLOUT_LIST_APPEND(ct->ct_expired, cl); } /* * Now delete the root. This is done by swapping the root with * the last item in the heap and downheaping the item. */ ct->ct_heap_num--; if (ct->ct_heap_num > 0) { heap[0] = heap[ct->ct_heap_num]; callout_downheap(ct); } } /* * If this callout table is empty or callouts have been suspended, * just return. The cyclic has already been programmed to * infinity by the cyclic subsystem. */ if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0)) return (CY_INFINITY); /* * If the top expirations are within callout_tolerance of each other, * delay the cyclic expire so that they can be processed together. * This is to prevent high resolution timers from swamping the system * with cyclic activity. */ if (ct->ct_heap_num > 2) { next = expiration + callout_tolerance; if ((heap[1].ch_expiration < next) || (heap[2].ch_expiration < next)) expiration = next; } (void) cyclic_reprogram(ct->ct_cyclic, expiration); return (expiration); } /* * There are some situations when the entire heap is walked and processed. * This function is called to do the processing. 
These are the situations: * * 1. When the reap count reaches its threshold, the heap has to be cleared * of all empty callout lists. * * 2. When the system enters and exits KMDB/OBP, all entries in the heap * need to be adjusted by the interval spent in KMDB/OBP. * * 3. When system time is changed, the heap has to be scanned for * absolute hrestime timers. These need to be removed from the heap * and expired immediately. * * In cases 2 and 3, it is a good idea to do 1 as well since we are * scanning the heap anyway. * * If the root gets changed and/or callout lists are expired, return the * new expiration to the caller so it can reprogram the cyclic accordingly. */ static hrtime_t callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange) { callout_heap_t *heap; callout_list_t *cl; hrtime_t expiration, now; int i, hash, clflags; ulong_t num; ASSERT(MUTEX_HELD(&ct->ct_mutex)); if (ct->ct_heap_num == 0) return (CY_INFINITY); if (ct->ct_nreap > 0) ct->ct_cleanups++; heap = ct->ct_heap; /* * We walk the heap from the top to the bottom. If we encounter * a heap item that points to an empty callout list, we clean * it out. If we encounter a hrestime entry that must be removed, * again we clean it out. Otherwise, we apply any adjustments needed * to an element. * * During the walk, we also compact the heap from the bottom and * reconstruct the heap using upheap operations. This is very * efficient if the number of elements to be cleaned is greater than * or equal to half the heap. This is the common case. * * Even in the non-common case, the upheap operations should be short * as the entries below generally tend to be bigger than the entries * above. */ num = ct->ct_heap_num; ct->ct_heap_num = 0; clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE); now = gethrtime(); for (i = 0; i < num; i++) { cl = heap[i].ch_list; /* * If the callout list is empty, delete the heap element and * free the callout list. */ if (cl->cl_callouts.ch_head == NULL) { hash = CALLOUT_CLHASH(cl->cl_expiration); CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); CALLOUT_LIST_FREE(ct, cl); continue; } /* * Delete the heap element and expire the callout list, if * one of the following is true: * - the callout list has expired * - the callout list is an absolute hrestime one and * there has been a system time change */ if ((cl->cl_expiration <= now) || (timechange && ((cl->cl_flags & clflags) == clflags))) { hash = CALLOUT_CLHASH(cl->cl_expiration); cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED; CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); CALLOUT_LIST_APPEND(ct->ct_expired, cl); continue; } /* * Apply adjustments, if any. Adjustments are applied after * the system returns from KMDB or OBP. They are only applied * to relative callout lists. */ if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) { hash = CALLOUT_CLHASH(cl->cl_expiration); CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); expiration = cl->cl_expiration + delta; if (expiration <= 0) expiration = CY_INFINITY; heap[i].ch_expiration = expiration; cl->cl_expiration = expiration; hash = CALLOUT_CLHASH(cl->cl_expiration); if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) { CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl); } else { CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl); } } heap[ct->ct_heap_num] = heap[i]; ct->ct_heap_num++; (void) callout_upheap(ct); } ct->ct_nreap = 0; /* * We need to return the expiration to help program the cyclic. * If there are expired callouts, the cyclic needs to go off * immediately. 
If the heap has become empty, then we return infinity. * Else, return the expiration of the earliest callout in the heap. */ if (ct->ct_expired.ch_head != NULL) return (gethrtime()); if (ct->ct_heap_num == 0) return (CY_INFINITY); return (heap->ch_expiration); } /* * Common function used to create normal and realtime callouts. * * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So, * there is one restriction on a realtime callout handler - it should not * directly or indirectly acquire cpu_lock. CPU offline waits for pending * cyclic handlers to complete while holding cpu_lock. So, if a realtime * callout handler were to try to get cpu_lock, there would be a deadlock * during CPU offline. */ callout_id_t timeout_generic(int type, void (*func)(void *), void *arg, hrtime_t expiration, hrtime_t resolution, int flags) { callout_table_t *ct; callout_t *cp; callout_id_t id; callout_list_t *cl; hrtime_t now, interval; int hash, clflags; ASSERT(resolution > 0); ASSERT(func != NULL); /* * We get the current hrtime right upfront so that latencies in * this function do not affect the accuracy of the callout. */ now = gethrtime(); /* * We disable kernel preemption so that we remain on the same CPU * throughout. If we needed to reprogram the callout table's cyclic, * we can avoid X-calls if we are on the same CPU. * * Note that callout_alloc() releases and reacquires the callout * table mutex. While reacquiring the mutex, it is possible for us * to go to sleep and later migrate to another CPU. This should be * pretty rare, though. */ kpreempt_disable(); ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)]; mutex_enter(&ct->ct_mutex); if (ct->ct_cyclic == CYCLIC_NONE) { mutex_exit(&ct->ct_mutex); /* * The callout table has not yet been initialized fully. * So, put this one on the boot callout table which is * always initialized. */ ct = &callout_boot_ct[type]; mutex_enter(&ct->ct_mutex); } if (CALLOUT_CLEANUP(ct)) { /* * There are too many heap elements pointing to empty callout * lists. Clean them out. Since cleanup is only done once * in a while, no need to reprogram the cyclic if the root * of the heap gets cleaned out. */ (void) callout_heap_process(ct, 0, 0); } if ((cp = ct->ct_free) == NULL) cp = callout_alloc(ct); else ct->ct_free = cp->c_idnext; cp->c_func = func; cp->c_arg = arg; /* * Compute the expiration hrtime. */ if (flags & CALLOUT_FLAG_ABSOLUTE) { interval = expiration - now; } else { interval = expiration; expiration += now; } if (resolution > 1) { /* * Align expiration to the specified resolution. */ if (flags & CALLOUT_FLAG_ROUNDUP) expiration += resolution - 1; expiration = (expiration / resolution) * resolution; } if (expiration <= 0) { /* * expiration hrtime overflow has occurred. Just set the * expiration to infinity. 
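 * An expiration that far out cannot be represented as an hrtime anyway,
 * so treating it as 'never' is the closest meaningful behavior.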
*/ expiration = CY_INFINITY; } /* * Assign an ID to this callout */ if (flags & CALLOUT_FLAG_32BIT) { if (interval > callout_longterm) { id = (ct->ct_long_id - callout_counter_low); id |= CALLOUT_COUNTER_HIGH; ct->ct_long_id = id; } else { id = (ct->ct_short_id - callout_counter_low); id |= CALLOUT_COUNTER_HIGH; ct->ct_short_id = id; } } else { id = (ct->ct_gen_id - callout_counter_low); if ((id & CALLOUT_COUNTER_HIGH) == 0) { id |= CALLOUT_COUNTER_HIGH; id += CALLOUT_GENERATION_LOW; } ct->ct_gen_id = id; } cp->c_xid = id; clflags = 0; if (flags & CALLOUT_FLAG_ABSOLUTE) clflags |= CALLOUT_LIST_FLAG_ABSOLUTE; if (flags & CALLOUT_FLAG_HRESTIME) clflags |= CALLOUT_LIST_FLAG_HRESTIME; if (resolution == 1) clflags |= CALLOUT_LIST_FLAG_NANO; hash = CALLOUT_CLHASH(expiration); again: /* * Try to see if a callout list already exists for this expiration. */ cl = callout_list_get(ct, expiration, clflags, hash); if (cl == NULL) { /* * Check the free list. If we don't find one, we have to * take the slow path and allocate from kmem. */ if ((cl = ct->ct_lfree) == NULL) { callout_list_alloc(ct); /* * In the above call, we drop the lock, allocate and * reacquire the lock. So, we could have been away * for a while. In the meantime, someone could have * inserted a callout list with the same expiration. * Plus, the heap could have become full. So, the best * course is to repeat the steps. This should be an * infrequent event. */ goto again; } ct->ct_lfree = cl->cl_next; cl->cl_expiration = expiration; cl->cl_flags = clflags; /* * Check if we have enough space in the heap to insert one * expiration. If not, expand the heap. */ if (ct->ct_heap_num == ct->ct_heap_max) { if (callout_heap_expand(ct) == 0) { /* * Could not expand the heap. Just queue it. */ callout_queue_insert(ct, cl); goto out; } /* * In the above call, we drop the lock, allocate and * reacquire the lock. So, we could have been away * for a while. In the meantime, someone could have * inserted a callout list with the same expiration. * But we will not go back and check for it as this * should be a really infrequent event. There is no * point. */ } if (clflags & CALLOUT_LIST_FLAG_NANO) { CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl); } else { CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl); } /* * This is a new expiration. So, insert it into the heap. * This will also reprogram the cyclic, if the expiration * propagated to the root of the heap. */ callout_heap_insert(ct, cl); } else { /* * If the callout list was empty, untimeout_generic() would * have incremented a reap count. Decrement the reap count * as we are going to insert a callout into this list. */ if (cl->cl_callouts.ch_head == NULL) ct->ct_nreap--; } out: cp->c_list = cl; CALLOUT_APPEND(ct, cp); ct->ct_timeouts++; ct->ct_timeouts_pending++; mutex_exit(&ct->ct_mutex); kpreempt_enable(); TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT, "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration, cp); return (id); } timeout_id_t timeout(void (*func)(void *), void *arg, clock_t delta) { ulong_t id; /* * Make sure the callout runs at least 1 tick in the future. */ if (delta <= 0) delta = 1; else if (delta > callout_max_ticks) delta = callout_max_ticks; id = (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY); return ((timeout_id_t)id); } /* * Convenience function that creates a normal callout with default parameters * and returns a full ID. 
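 *
 * Purely for illustration (hypothetical driver code, not part of this
 * file), a caller uses these wrappers much like the legacy
 * timeout(9F)/untimeout(9F) pair:
 *
 *	callout_id_t id;
 *
 *	id = timeout_default(my_poll, sc, drv_usectohz(100000));
 *	...
 *	(void) untimeout_default(id, 0);
 *
 * Here my_poll() is the caller's handler and sc its argument (both made
 * up for this example), drv_usectohz(9F) converts 100ms into the tick
 * delta, and passing nowait == 0 to untimeout_default() makes the cancel
 * wait if the handler happens to be executing.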
*/ callout_id_t timeout_default(void (*func)(void *), void *arg, clock_t delta) { callout_id_t id; /* * Make sure the callout runs at least 1 tick in the future. */ if (delta <= 0) delta = 1; else if (delta > callout_max_ticks) delta = callout_max_ticks; id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta), nsec_per_tick, 0); return (id); } timeout_id_t realtime_timeout(void (*func)(void *), void *arg, clock_t delta) { ulong_t id; /* * Make sure the callout runs at least 1 tick in the future. */ if (delta <= 0) delta = 1; else if (delta > callout_max_ticks) delta = callout_max_ticks; id = (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY); return ((timeout_id_t)id); } /* * Convenience function that creates a realtime callout with default parameters * and returns a full ID. */ callout_id_t realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta) { callout_id_t id; /* * Make sure the callout runs at least 1 tick in the future. */ if (delta <= 0) delta = 1; else if (delta > callout_max_ticks) delta = callout_max_ticks; id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta), nsec_per_tick, 0); return (id); } hrtime_t untimeout_generic(callout_id_t id, int nowait) { callout_table_t *ct; callout_t *cp; callout_id_t xid; callout_list_t *cl; int hash, flags; callout_id_t bogus; ct = &callout_table[CALLOUT_ID_TO_TABLE(id)]; hash = CALLOUT_IDHASH(id); mutex_enter(&ct->ct_mutex); /* * Search the ID hash table for the callout. */ for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) { xid = cp->c_xid; /* * Match the ID and generation number. */ if ((xid & CALLOUT_ID_MASK) != id) continue; if ((xid & CALLOUT_EXECUTING) == 0) { hrtime_t expiration; /* * Delete the callout. If the callout list becomes * NULL, we don't remove it from the table. This is * so it can be reused. If the empty callout list * corresponds to the top of the the callout heap, we * don't reprogram the table cyclic here. This is in * order to avoid lots of X-calls to the CPU associated * with the callout table. */ cl = cp->c_list; expiration = cl->cl_expiration; CALLOUT_DELETE(ct, cp); CALLOUT_FREE(ct, cp); ct->ct_untimeouts_unexpired++; ct->ct_timeouts_pending--; /* * If the callout list has become empty, there are 3 * possibilities. If it is present: * - in the heap, it needs to be cleaned along * with its heap entry. Increment a reap count. * - in the callout queue, free it. * - in the expired list, free it. */ if (cl->cl_callouts.ch_head == NULL) { flags = cl->cl_flags; if (flags & CALLOUT_LIST_FLAG_HEAPED) { ct->ct_nreap++; } else if (flags & CALLOUT_LIST_FLAG_QUEUED) { CALLOUT_LIST_DELETE(ct->ct_queue, cl); CALLOUT_LIST_FREE(ct, cl); } else { CALLOUT_LIST_DELETE(ct->ct_expired, cl); CALLOUT_LIST_FREE(ct, cl); } } mutex_exit(&ct->ct_mutex); expiration -= gethrtime(); TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT, "untimeout:ID %lx hrtime left %llx", id, expiration); return (expiration < 0 ? 0 : expiration); } ct->ct_untimeouts_executing++; /* * The callout we want to delete is currently executing. * The DDI states that we must wait until the callout * completes before returning, so we block on c_done until the * callout ID changes (to the old ID if it's on the freelist, * or to a new callout ID if it's in use). This implicitly * assumes that callout structures are persistent (they are). */ if (cp->c_executor == curthread) { /* * The timeout handler called untimeout() on itself. * Stupid, but legal. 
We can't wait for the timeout * to complete without deadlocking, so we just return. */ mutex_exit(&ct->ct_mutex); TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF, "untimeout_self:ID %x", id); return (-1); } if (nowait == 0) { /* * We need to wait. Indicate that we are waiting by * incrementing c_waiting. This prevents the executor * from doing a wakeup on c_done if there are no * waiters. */ while (cp->c_xid == xid) { cp->c_waiting = 1; cv_wait(&cp->c_done, &ct->ct_mutex); } } mutex_exit(&ct->ct_mutex); TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING, "untimeout_executing:ID %lx", id); return (-1); } ct->ct_untimeouts_expired++; mutex_exit(&ct->ct_mutex); TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID, "untimeout_bogus_id:ID %lx", id); /* * We didn't find the specified callout ID. This means either * (1) the callout already fired, or (2) the caller passed us * a bogus value. Perform a sanity check to detect case (2). */ bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH); if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0)) panic("untimeout: impossible timeout id %llx", (unsigned long long)id); return (-1); } clock_t untimeout(timeout_id_t id_arg) { hrtime_t hleft; clock_t tleft; callout_id_t id; id = (ulong_t)id_arg; hleft = untimeout_generic(id, 0); if (hleft < 0) tleft = -1; else if (hleft == 0) tleft = 0; else tleft = NSEC_TO_TICK(hleft); return (tleft); } /* * Convenience function to untimeout a timeout with a full ID with default * parameters. */ clock_t untimeout_default(callout_id_t id, int nowait) { hrtime_t hleft; clock_t tleft; hleft = untimeout_generic(id, nowait); if (hleft < 0) tleft = -1; else if (hleft == 0) tleft = 0; else tleft = NSEC_TO_TICK(hleft); return (tleft); } /* * Expire all the callouts queued in the specified callout list. */ static void callout_list_expire(callout_table_t *ct, callout_list_t *cl) { callout_t *cp, *cnext; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(cl != NULL); for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) { /* * Multiple executor threads could be running at the same * time. If this callout is already being executed, * go on to the next one. */ if (cp->c_xid & CALLOUT_EXECUTING) { cnext = cp->c_clnext; continue; } /* * Indicate to untimeout() that a callout is * being expired by the executor. */ cp->c_xid |= CALLOUT_EXECUTING; cp->c_executor = curthread; mutex_exit(&ct->ct_mutex); DTRACE_PROBE1(callout__start, callout_t *, cp); (*cp->c_func)(cp->c_arg); DTRACE_PROBE1(callout__end, callout_t *, cp); mutex_enter(&ct->ct_mutex); ct->ct_expirations++; ct->ct_timeouts_pending--; /* * Indicate completion for c_done. */ cp->c_xid &= ~CALLOUT_EXECUTING; cp->c_executor = NULL; cnext = cp->c_clnext; /* * Delete callout from ID hash table and the callout * list, return to freelist, and tell any untimeout() that * cares that we're done. */ CALLOUT_DELETE(ct, cp); CALLOUT_FREE(ct, cp); if (cp->c_waiting) { cp->c_waiting = 0; cv_broadcast(&cp->c_done); } } } /* * Execute all expired callout lists for a callout table. */ static void callout_expire(callout_table_t *ct) { callout_list_t *cl, *clnext; ASSERT(MUTEX_HELD(&ct->ct_mutex)); for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) { /* * Expire all the callouts in this callout list. */ callout_list_expire(ct, cl); clnext = cl->cl_next; if (cl->cl_callouts.ch_head == NULL) { /* * Free the callout list. */ CALLOUT_LIST_DELETE(ct->ct_expired, cl); CALLOUT_LIST_FREE(ct, cl); } } } /* * The cyclic handlers below process callouts in two steps: * * 1. 
Find all expired callout lists and queue them in a separate * list of expired callouts. * 2. Execute the expired callout lists. * * This is done for two reasons: * * 1. We want to quickly find the next earliest expiration to program * the cyclic to and reprogram it. We can do this right at the end * of step 1. * 2. The realtime cyclic handler expires callouts in place. However, * for normal callouts, callouts are expired by a taskq thread. * So, it is simpler and more robust to have the taskq thread just * do step 2. */ /* * Realtime callout cyclic handlers. */ void callout_realtime(callout_table_t *ct) { mutex_enter(&ct->ct_mutex); (void) callout_heap_delete(ct); callout_expire(ct); mutex_exit(&ct->ct_mutex); } void callout_queue_realtime(callout_table_t *ct) { mutex_enter(&ct->ct_mutex); (void) callout_queue_delete(ct); callout_expire(ct); mutex_exit(&ct->ct_mutex); } void callout_execute(callout_table_t *ct) { mutex_enter(&ct->ct_mutex); callout_expire(ct); mutex_exit(&ct->ct_mutex); } /* * Normal callout cyclic handlers. */ void callout_normal(callout_table_t *ct) { int i, exec; hrtime_t exp; mutex_enter(&ct->ct_mutex); exp = callout_heap_delete(ct); CALLOUT_EXEC_COMPUTE(ct, exp, exec); mutex_exit(&ct->ct_mutex); for (i = 0; i < exec; i++) { ASSERT(ct->ct_taskq != NULL); (void) taskq_dispatch(ct->ct_taskq, (task_func_t *)callout_execute, ct, TQ_NOSLEEP); } } void callout_queue_normal(callout_table_t *ct) { int i, exec; hrtime_t exp; mutex_enter(&ct->ct_mutex); exp = callout_queue_delete(ct); CALLOUT_EXEC_COMPUTE(ct, exp, exec); mutex_exit(&ct->ct_mutex); for (i = 0; i < exec; i++) { ASSERT(ct->ct_taskq != NULL); (void) taskq_dispatch(ct->ct_taskq, (task_func_t *)callout_execute, ct, TQ_NOSLEEP); } } /* * Suspend callout processing. */ static void callout_suspend(void) { int t, f; callout_table_t *ct; /* * Traverse every callout table in the system and suspend callout * processing. * * We need to suspend all the tables (including the inactive ones) * so that if a table is made active while the suspend is still on, * the table remains suspended. */ for (f = 0; f < max_ncpus; f++) { for (t = 0; t < CALLOUT_NTYPES; t++) { ct = &callout_table[CALLOUT_TABLE(t, f)]; mutex_enter(&ct->ct_mutex); ct->ct_suspend++; if (ct->ct_cyclic == CYCLIC_NONE) { mutex_exit(&ct->ct_mutex); continue; } if (ct->ct_suspend == 1) { (void) cyclic_reprogram(ct->ct_cyclic, CY_INFINITY); (void) cyclic_reprogram(ct->ct_qcyclic, CY_INFINITY); } mutex_exit(&ct->ct_mutex); } } } /* * Resume callout processing. */ static void callout_resume(hrtime_t delta, int timechange) { hrtime_t hexp, qexp; int t, f; callout_table_t *ct; /* * Traverse every callout table in the system and resume callout * processing. For active tables, perform any hrtime adjustments * necessary. */ for (f = 0; f < max_ncpus; f++) { for (t = 0; t < CALLOUT_NTYPES; t++) { ct = &callout_table[CALLOUT_TABLE(t, f)]; mutex_enter(&ct->ct_mutex); if (ct->ct_cyclic == CYCLIC_NONE) { ct->ct_suspend--; mutex_exit(&ct->ct_mutex); continue; } /* * If a delta is specified, adjust the expirations in * the heap by delta. Also, if the caller indicates * a timechange, process that. This step also cleans * out any empty callout lists that might happen to * be there. 
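 *
 * For example, if the system just spent five seconds in KMDB, delta is
 * roughly five seconds, so every relative expiration in the heap and in
 * the queue is pushed out by that much and the time spent in the
 * debugger is not charged against pending timeouts.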
*/ hexp = callout_heap_process(ct, delta, timechange); qexp = callout_queue_process(ct, delta, timechange); ct->ct_suspend--; if (ct->ct_suspend == 0) { (void) cyclic_reprogram(ct->ct_cyclic, hexp); (void) cyclic_reprogram(ct->ct_qcyclic, qexp); } mutex_exit(&ct->ct_mutex); } } } /* * Callback handler used by CPR to stop and resume callouts. * The cyclic subsystem saves and restores hrtime during CPR. * That is why callout_resume() is called with a 0 delta. * Although hrtime is the same, hrestime (system time) has * progressed during CPR. So, we have to indicate a time change * to expire the absolute hrestime timers. */ /*ARGSUSED*/ static boolean_t callout_cpr_callb(void *arg, int code) { if (code == CB_CODE_CPR_CHKPT) callout_suspend(); else callout_resume(0, 1); return (B_TRUE); } /* * Callback handler invoked when the debugger is entered or exited. */ /*ARGSUSED*/ static boolean_t callout_debug_callb(void *arg, int code) { hrtime_t delta; /* * When the system enters the debugger. make a note of the hrtime. * When it is resumed, compute how long the system was in the * debugger. This interval should not be counted for callouts. */ if (code == 0) { callout_suspend(); callout_debug_hrtime = gethrtime(); } else { delta = gethrtime() - callout_debug_hrtime; callout_resume(delta, 0); } return (B_TRUE); } /* * Move the absolute hrestime callouts to the expired list. Then program the * table's cyclic to expire immediately so that the callouts can be executed * immediately. */ static void callout_hrestime_one(callout_table_t *ct) { hrtime_t hexp, qexp; mutex_enter(&ct->ct_mutex); if (ct->ct_cyclic == CYCLIC_NONE) { mutex_exit(&ct->ct_mutex); return; } /* * Walk the heap and process all the absolute hrestime entries. */ hexp = callout_heap_process(ct, 0, 1); qexp = callout_queue_process(ct, 0, 1); if (ct->ct_suspend == 0) { (void) cyclic_reprogram(ct->ct_cyclic, hexp); (void) cyclic_reprogram(ct->ct_qcyclic, qexp); } mutex_exit(&ct->ct_mutex); } /* * This function is called whenever system time (hrestime) is changed * explicitly. All the HRESTIME callouts must be expired at once. */ /*ARGSUSED*/ void callout_hrestime(void) { int t, f; callout_table_t *ct; /* * Traverse every callout table in the system and process the hrestime * callouts therein. * * We look at all the tables because we don't know which ones were * onlined and offlined in the past. The offlined tables may still * have active cyclics processing timers somewhere. */ for (f = 0; f < max_ncpus; f++) { for (t = 0; t < CALLOUT_NTYPES; t++) { ct = &callout_table[CALLOUT_TABLE(t, f)]; callout_hrestime_one(ct); } } } /* * Create the hash tables for this callout table. */ static void callout_hash_init(callout_table_t *ct) { size_t size; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL)); size = sizeof (callout_hash_t) * CALLOUT_BUCKETS; ct->ct_idhash = kmem_zalloc(size, KM_SLEEP); ct->ct_clhash = kmem_zalloc(size, KM_SLEEP); } /* * Create per-callout table kstats. 
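 * The kstats are created under the "unix" module with the table index
 * as the instance, so they can be examined from userland with something
 * like "kstat -m unix -n callout" (kstat(1M)).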
*/ static void callout_kstat_init(callout_table_t *ct) { callout_stat_type_t stat; kstat_t *ct_kstats; int ndx; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_kstats == NULL); ndx = ct - callout_table; ct_kstats = kstat_create("unix", ndx, "callout", "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL); if (ct_kstats == NULL) { cmn_err(CE_WARN, "kstat_create for callout table %p failed", (void *)ct); } else { ct_kstats->ks_data = ct->ct_kstat_data; for (stat = 0; stat < CALLOUT_NUM_STATS; stat++) kstat_named_init(&ct->ct_kstat_data[stat], callout_kstat_names[stat], KSTAT_DATA_INT64); ct->ct_kstats = ct_kstats; kstat_install(ct_kstats); } } static void callout_cyclic_init(callout_table_t *ct) { cyc_handler_t hdlr; cyc_time_t when; processorid_t seqid; int t; cyclic_id_t cyclic, qcyclic; ASSERT(MUTEX_HELD(&ct->ct_mutex)); t = ct->ct_type; seqid = CALLOUT_TABLE_SEQID(ct); /* * Create the taskq thread if the table type is normal. * Realtime tables are handled at PIL1 by a softint * handler. */ if (t == CALLOUT_NORMAL) { ASSERT(ct->ct_taskq == NULL); /* * Each callout thread consumes exactly one * task structure while active. Therefore, * prepopulating with 2 * callout_threads tasks * ensures that there's at least one task per * thread that's either scheduled or on the * freelist. In turn, this guarantees that * taskq_dispatch() will always either succeed * (because there's a free task structure) or * be unnecessary (because "callout_excute(ct)" * has already scheduled). */ ct->ct_taskq = taskq_create_instance("callout_taskq", seqid, callout_threads, maxclsyspri, 2 * callout_threads, 2 * callout_threads, TASKQ_PREPOPULATE | TASKQ_CPR_SAFE); } /* * callouts can only be created in a table whose * cyclic has been initialized. */ ASSERT(ct->ct_heap_num == 0); /* * Drop the mutex before creating the callout cyclics. cyclic_add() * could potentially expand the cyclic heap. We don't want to be * holding the callout table mutex in that case. Note that this * function is called during CPU online. cpu_lock is held at this * point. So, only one thread can be executing the cyclic add logic * below at any time. */ mutex_exit(&ct->ct_mutex); /* * Create the callout table cyclics. * * The realtime cyclic handler executes at low PIL. The normal cyclic * handler executes at lock PIL. This is because there are cases * where code can block at PIL > 1 waiting for a normal callout handler * to unblock it directly or indirectly. If the normal cyclic were to * be executed at low PIL, it could get blocked out by the waiter * and cause a deadlock. */ ASSERT(ct->ct_cyclic == CYCLIC_NONE); if (t == CALLOUT_REALTIME) { hdlr.cyh_level = callout_realtime_level; hdlr.cyh_func = (cyc_func_t)callout_realtime; } else { hdlr.cyh_level = callout_normal_level; hdlr.cyh_func = (cyc_func_t)callout_normal; } hdlr.cyh_arg = ct; when.cyt_when = CY_INFINITY; when.cyt_interval = CY_INFINITY; cyclic = cyclic_add(&hdlr, &when); if (t == CALLOUT_REALTIME) hdlr.cyh_func = (cyc_func_t)callout_queue_realtime; else hdlr.cyh_func = (cyc_func_t)callout_queue_normal; qcyclic = cyclic_add(&hdlr, &when); mutex_enter(&ct->ct_mutex); ct->ct_cyclic = cyclic; ct->ct_qcyclic = qcyclic; } void callout_cpu_online(cpu_t *cp) { lgrp_handle_t hand; callout_cache_t *cache; char s[KMEM_CACHE_NAMELEN]; callout_table_t *ct; processorid_t seqid; int t; ASSERT(MUTEX_HELD(&cpu_lock)); /* * Locate the cache corresponding to the onlined CPU's lgroup. * Note that access to callout_caches is protected by cpu_lock. 
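 * The caches are kept per lgroup rather than per CPU so that callout
 * and callout list structures come from memory close to the CPUs that
 * use them, without paying for a separate pair of caches on every CPU.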
*/ hand = lgrp_plat_cpu_to_hand(cp->cpu_id); for (cache = callout_caches; cache != NULL; cache = cache->cc_next) { if (cache->cc_hand == hand) break; } /* * If not found, create one. The caches are never destroyed. */ if (cache == NULL) { cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP); cache->cc_hand = hand; (void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx", (long)hand); cache->cc_cache = kmem_cache_create(s, sizeof (callout_t), CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); (void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx", (long)hand); cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t), CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); cache->cc_next = callout_caches; callout_caches = cache; } seqid = cp->cpu_seqid; for (t = 0; t < CALLOUT_NTYPES; t++) { ct = &callout_table[CALLOUT_TABLE(t, seqid)]; mutex_enter(&ct->ct_mutex); /* * Store convinience pointers to the kmem caches * in the callout table. These assignments should always be * done as callout tables can map to different physical * CPUs each time. */ ct->ct_cache = cache->cc_cache; ct->ct_lcache = cache->cc_lcache; /* * We use the heap pointer to check if stuff has been * initialized for this callout table. */ if (ct->ct_heap == NULL) { callout_heap_init(ct); callout_hash_init(ct); callout_kstat_init(ct); callout_cyclic_init(ct); } mutex_exit(&ct->ct_mutex); /* * Move the cyclics to this CPU by doing a bind. */ cyclic_bind(ct->ct_cyclic, cp, NULL); cyclic_bind(ct->ct_qcyclic, cp, NULL); } } void callout_cpu_offline(cpu_t *cp) { callout_table_t *ct; processorid_t seqid; int t; ASSERT(MUTEX_HELD(&cpu_lock)); seqid = cp->cpu_seqid; for (t = 0; t < CALLOUT_NTYPES; t++) { ct = &callout_table[CALLOUT_TABLE(t, seqid)]; /* * Unbind the cyclics. This will allow the cyclic subsystem * to juggle the cyclics during CPU offline. */ cyclic_bind(ct->ct_cyclic, NULL, NULL); cyclic_bind(ct->ct_qcyclic, NULL, NULL); } } /* * This is called to perform per-CPU initialization for slave CPUs at * boot time. */ void callout_mp_init(void) { cpu_t *cp; size_t min, max; if (callout_chunk == CALLOUT_CHUNK) { /* * No one has specified a chunk in /etc/system. We need to * compute it here based on the number of online CPUs and * available physical memory. */ min = CALLOUT_MIN_HEAP_SIZE; max = ptob(physmem / CALLOUT_MEM_FRACTION); if (min > max) min = max; callout_chunk = min / sizeof (callout_heap_t); callout_chunk /= ncpus_online; callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK); } mutex_enter(&cpu_lock); cp = cpu_active; do { callout_cpu_online(cp); } while ((cp = cp->cpu_next_onln) != cpu_active); mutex_exit(&cpu_lock); } /* * Initialize all callout tables. Called at boot time just before clkstart(). */ void callout_init(void) { int f, t; size_t size; int table_id; callout_table_t *ct; long bits, fanout; uintptr_t buf; /* * Initialize callout globals. 
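 *
 * For example, on a system with max_ncpus == 64, the loop below yields
 * bits == 6, so callout_table_bits becomes CALLOUT_TYPE_BITS + 6 and
 * callout_table_mask has that many low-order bits set.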
*/ bits = 0; for (fanout = 1; (fanout < max_ncpus); fanout <<= 1) bits++; callout_table_bits = CALLOUT_TYPE_BITS + bits; callout_table_mask = (1 << callout_table_bits) - 1; callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT; callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS); callout_max_ticks = CALLOUT_MAX_TICKS; if (callout_min_reap == 0) callout_min_reap = CALLOUT_MIN_REAP; if (callout_tolerance <= 0) callout_tolerance = CALLOUT_TOLERANCE; if (callout_threads <= 0) callout_threads = CALLOUT_THREADS; if (callout_chunk <= 0) callout_chunk = CALLOUT_CHUNK; else callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK); /* * Allocate all the callout tables based on max_ncpus. We have chosen * to do boot-time allocation instead of dynamic allocation because: * * - the size of the callout tables is not too large. * - there are race conditions involved in making this dynamic. * - the hash tables that go with the callout tables consume * most of the memory and they are only allocated in * callout_cpu_online(). * * Each CPU has two tables that are consecutive in the array. The first * one is for realtime callouts and the second one is for normal ones. * * We do this alignment dance to make sure that callout table * structures will always be on a cache line boundary. */ size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus; size += CALLOUT_ALIGN; buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP); callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN); size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS; /* * Now, initialize the tables for all the CPUs. */ for (f = 0; f < max_ncpus; f++) { for (t = 0; t < CALLOUT_NTYPES; t++) { table_id = CALLOUT_TABLE(t, f); ct = &callout_table[table_id]; ct->ct_type = t; mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL); /* * Precompute the base IDs for long and short-term * legacy IDs. This makes ID generation during * timeout() fast. */ ct->ct_short_id = CALLOUT_SHORT_ID(table_id); ct->ct_long_id = CALLOUT_LONG_ID(table_id); /* * Precompute the base ID for generation-based IDs. * Note that when the first ID gets allocated, the * ID will wrap. This will cause the generation * number to be incremented to 1. */ ct->ct_gen_id = CALLOUT_SHORT_ID(table_id); /* * Initialize the cyclics as NONE. This will get set * during CPU online. This is so that partially * populated systems will only have the required * number of cyclics, not more. */ ct->ct_cyclic = CYCLIC_NONE; ct->ct_qcyclic = CYCLIC_NONE; ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP); } } /* * Add the callback for CPR. This is called during checkpoint * resume to suspend and resume callouts. */ (void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT, "callout_cpr"); (void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER, "callout_debug"); /* * Call the per-CPU initialization function for the boot CPU. This * is done here because the function is not called automatically for * the boot CPU from the CPU online/offline hooks. Note that the * CPU lock is taken here because of convention. */ mutex_enter(&cpu_lock); callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)]; callout_cpu_online(CPU); mutex_exit(&cpu_lock); /* heads-up to boot-time clients that timeouts now available */ callout_init_done = 1; }