xref: /illumos-gate/usr/src/uts/common/os/clock_tick.c (revision 1b7f7204)
12850d85bSmv /*
22850d85bSmv  * CDDL HEADER START
32850d85bSmv  *
42850d85bSmv  * The contents of this file are subject to the terms of the
52850d85bSmv  * Common Development and Distribution License (the "License").
62850d85bSmv  * You may not use this file except in compliance with the License.
72850d85bSmv  *
82850d85bSmv  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92850d85bSmv  * or http://www.opensolaris.org/os/licensing.
102850d85bSmv  * See the License for the specific language governing permissions
112850d85bSmv  * and limitations under the License.
122850d85bSmv  *
132850d85bSmv  * When distributing Covered Code, include this CDDL HEADER in each
142850d85bSmv  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152850d85bSmv  * If applicable, add the following below this CDDL HEADER, with the
162850d85bSmv  * fields enclosed by brackets "[]" replaced with your own identifying
172850d85bSmv  * information: Portions Copyright [yyyy] [name of copyright owner]
182850d85bSmv  *
192850d85bSmv  * CDDL HEADER END
202850d85bSmv  */
212850d85bSmv 
222850d85bSmv /*
2307247649SMadhavan Venkataraman  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
242850d85bSmv  * Use is subject to license terms.
252850d85bSmv  */
262850d85bSmv 
272850d85bSmv #include <sys/thread.h>
282850d85bSmv #include <sys/proc.h>
292850d85bSmv #include <sys/task.h>
302850d85bSmv #include <sys/cmn_err.h>
312850d85bSmv #include <sys/class.h>
322850d85bSmv #include <sys/sdt.h>
332850d85bSmv #include <sys/atomic.h>
342850d85bSmv #include <sys/cpu.h>
352850d85bSmv #include <sys/clock_tick.h>
36d3d50737SRafael Vanoni #include <sys/clock_impl.h>
372850d85bSmv #include <sys/sysmacros.h>
382850d85bSmv #include <vm/rm.h>
392850d85bSmv 
402850d85bSmv /*
412850d85bSmv  * This file contains the implementation of clock tick accounting for threads.
422850d85bSmv  * Every tick, user threads running on various CPUs are located and charged
432850d85bSmv  * with a tick to account for their use of CPU time.
442850d85bSmv  *
452850d85bSmv  * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
462850d85bSmv  * accounting for all the threads in the system. Tick accounting is done in
472850d85bSmv  * two phases:
482850d85bSmv  *
492850d85bSmv  * Tick scheduling	Done in clock_tick_schedule(). In this phase, cross
502850d85bSmv  *			calls are scheduled to multiple CPUs to perform
512850d85bSmv  *			multi-threaded tick accounting. The CPUs are chosen
522850d85bSmv  *			on a rotational basis so as to distribute the tick
532850d85bSmv  *			accounting load evenly across all CPUs.
542850d85bSmv  *
552850d85bSmv  * Tick execution	Done in clock_tick_execute(). In this phase, tick
562850d85bSmv  *			accounting is actually performed by softint handlers
572850d85bSmv  *			on multiple CPUs.
582850d85bSmv  *
592850d85bSmv  * This implementation gives us a multi-threaded tick processing facility that
602850d85bSmv  * is suitable for configurations with a large number of CPUs. On smaller
612850d85bSmv  * configurations it may be desirable to let the processing be single-threaded
622850d85bSmv  * and just allow clock() to do it as it has been done traditionally. To
632850d85bSmv  * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
642850d85bSmv  * that desire multi-threading should set this variable to something
652850d85bSmv  * appropriate. A recommended value may be found in clock_tick.h. At boot time,
662850d85bSmv  * if the number of CPUs is greater than clock_tick_threshold, multi-threading
672850d85bSmv  * kicks in. Note that this is a decision made at boot time. If more CPUs
682850d85bSmv  * are dynamically added later on to exceed the threshold, no attempt is made
692850d85bSmv  * to switch to multi-threaded. Similarly, if CPUs are removed dynamically
702850d85bSmv  * no attempt is made to switch to single-threaded. This is to keep the
712850d85bSmv  * implementation simple. Also note that the threshold can be changed for a
722850d85bSmv  * specific customer configuration via /etc/system.
732850d85bSmv  *
742850d85bSmv  * The boot time decision is reflected in clock_tick_single_threaded.
752850d85bSmv  */
762850d85bSmv 
772850d85bSmv /*
782850d85bSmv  * clock_tick_threshold
792850d85bSmv  *	If the number of CPUs at boot time exceeds this threshold,
802850d85bSmv  *	multi-threaded tick accounting kicks in.
812850d85bSmv  *
822850d85bSmv  * clock_tick_ncpus
832850d85bSmv  *	The number of CPUs in a set. Each set is scheduled for tick execution
842850d85bSmv  *	on a separate processor.
852850d85bSmv  *
862850d85bSmv  * clock_tick_single_threaded
872850d85bSmv  *	Indicates whether or not tick accounting is single threaded.
882850d85bSmv  *
892850d85bSmv  * clock_tick_total_cpus
902850d85bSmv  *	Total number of online CPUs.
912850d85bSmv  *
922850d85bSmv  * clock_tick_cpus
932850d85bSmv  *	Array of online CPU pointers.
942850d85bSmv  *
952850d85bSmv  * clock_tick_cpu
962850d85bSmv  *	Per-CPU, cache-aligned data structures to facilitate multi-threading.
972850d85bSmv  *
982850d85bSmv  * clock_tick_active
992850d85bSmv  *	Counter that indicates the number of active tick processing softints
1002850d85bSmv  *	in the system.
1012850d85bSmv  *
1022850d85bSmv  * clock_tick_pending
1032850d85bSmv  *	Number of pending ticks that need to be accounted by the softint
1042850d85bSmv  *	handlers.
1052850d85bSmv  *
1062850d85bSmv  * clock_tick_lock
1072850d85bSmv  *	Mutex to synchronize between clock_tick_schedule() and
1082850d85bSmv  *	CPU online/offline.
1092850d85bSmv  *
1102850d85bSmv  * clock_cpu_id
1112850d85bSmv  *	CPU id of the clock() CPU. Used to detect when the clock CPU
1122850d85bSmv  *	is offlined.
1132850d85bSmv  *
1142850d85bSmv  * clock_tick_online_cpuset
1152850d85bSmv  *	CPU set of all online processors that can be X-called.
1162850d85bSmv  *
1172850d85bSmv  * clock_tick_proc_max
1182850d85bSmv  *	Each process is allowed to accumulate a few ticks before checking
1192850d85bSmv  *	for the task CPU time resource limit. We lower the number of calls
1202850d85bSmv  *	to rctl_test() to make tick accounting more scalable. The tradeoff
1212850d85bSmv  *	is that the limit may not get enforced in a timely manner. This is
1222850d85bSmv  *	typically not a problem.
1232850d85bSmv  *
1242850d85bSmv  * clock_tick_set
1252850d85bSmv  *	Per-set structures. Each structure contains the range of CPUs
1262850d85bSmv  *	to be processed for the set.
1272850d85bSmv  *
1282850d85bSmv  * clock_tick_nsets;
1292850d85bSmv  *	Number of sets.
1302850d85bSmv  *
1312850d85bSmv  * clock_tick_scan
1322850d85bSmv  *	Where to begin the scan for single-threaded mode. In multi-threaded,
1332850d85bSmv  *	the clock_tick_set itself contains a field for this.
1342850d85bSmv  */
1352850d85bSmv int			clock_tick_threshold;
1362850d85bSmv int			clock_tick_ncpus;
1372850d85bSmv int			clock_tick_single_threaded;
1382850d85bSmv int			clock_tick_total_cpus;
1392850d85bSmv cpu_t			*clock_tick_cpus[NCPU];
1402850d85bSmv clock_tick_cpu_t	*clock_tick_cpu[NCPU];
1412850d85bSmv ulong_t			clock_tick_active;
1422850d85bSmv int			clock_tick_pending;
1432850d85bSmv kmutex_t		clock_tick_lock;
1442850d85bSmv processorid_t		clock_cpu_id;
1452850d85bSmv cpuset_t		clock_tick_online_cpuset;
1462850d85bSmv clock_t			clock_tick_proc_max;
1472850d85bSmv clock_tick_set_t	*clock_tick_set;
1482850d85bSmv int			clock_tick_nsets;
1492850d85bSmv int			clock_tick_scan;
15007247649SMadhavan Venkataraman ulong_t			clock_tick_intr;
1512850d85bSmv 
1522850d85bSmv static uint_t	clock_tick_execute(caddr_t, caddr_t);
1532850d85bSmv static void	clock_tick_execute_common(int, int, int, clock_t, int);
1542850d85bSmv 
1552850d85bSmv #define	CLOCK_TICK_ALIGN	64	/* cache alignment */
1562850d85bSmv 
1572850d85bSmv /*
1582850d85bSmv  * Clock tick initialization is done in two phases:
1592850d85bSmv  *
1602850d85bSmv  * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
1612850d85bSmv  *    up single-threading so the clock() can begin to do its job.
1622850d85bSmv  *
1632850d85bSmv  * 2. After the slave CPUs are initialized at boot time, we know the number
1642850d85bSmv  *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
1652850d85bSmv  *    required.
1662850d85bSmv  */
1672850d85bSmv void
clock_tick_init_pre(void)1682850d85bSmv clock_tick_init_pre(void)
1692850d85bSmv {
1702850d85bSmv 	clock_tick_cpu_t	*ctp;
1712850d85bSmv 	int			i, n;
1722850d85bSmv 	clock_tick_set_t	*csp;
1732850d85bSmv 	uintptr_t		buf;
1742850d85bSmv 	size_t			size;
1752850d85bSmv 
1762850d85bSmv 	clock_tick_single_threaded = 1;
1772850d85bSmv 
1782850d85bSmv 	size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
1792850d85bSmv 	buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
1802850d85bSmv 	buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);
1812850d85bSmv 
1822850d85bSmv 	/*
1832850d85bSmv 	 * Perform initialization in case multi-threading is chosen later.
1842850d85bSmv 	 */
18507247649SMadhavan Venkataraman 	if (&create_softint != NULL) {
18607247649SMadhavan Venkataraman 		clock_tick_intr = create_softint(LOCK_LEVEL,
18707247649SMadhavan Venkataraman 		    clock_tick_execute, (caddr_t)NULL);
18807247649SMadhavan Venkataraman 	}
1892850d85bSmv 	for (i = 0; i < NCPU; i++, buf += size) {
1902850d85bSmv 		ctp = (clock_tick_cpu_t *)buf;
1912850d85bSmv 		clock_tick_cpu[i] = ctp;
1922850d85bSmv 		mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
1932850d85bSmv 		if (&create_softint != NULL) {
19407247649SMadhavan Venkataraman 			ctp->ct_intr = clock_tick_intr;
1952850d85bSmv 		}
1962850d85bSmv 		ctp->ct_pending = 0;
1972850d85bSmv 	}
1982850d85bSmv 
1992850d85bSmv 	mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);
2002850d85bSmv 
2012850d85bSmv 	/*
2022850d85bSmv 	 * Compute clock_tick_ncpus here. We need it to compute the
2032850d85bSmv 	 * maximum number of tick sets we need to support.
2042850d85bSmv 	 */
2052850d85bSmv 	ASSERT(clock_tick_ncpus >= 0);
2062850d85bSmv 	if (clock_tick_ncpus == 0)
2072850d85bSmv 		clock_tick_ncpus = CLOCK_TICK_NCPUS;
2082850d85bSmv 	if (clock_tick_ncpus > max_ncpus)
2092850d85bSmv 		clock_tick_ncpus = max_ncpus;
2102850d85bSmv 
2112850d85bSmv 	/*
2122850d85bSmv 	 * Allocate and initialize the tick sets.
2132850d85bSmv 	 */
2142850d85bSmv 	n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus;
2152850d85bSmv 	clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
2162850d85bSmv 	for (i = 0; i < n; i++) {
2172850d85bSmv 		csp = &clock_tick_set[i];
2182850d85bSmv 		csp->ct_start = i * clock_tick_ncpus;
2192850d85bSmv 		csp->ct_scan = csp->ct_start;
2202850d85bSmv 		csp->ct_end = csp->ct_start;
2212850d85bSmv 	}
2222850d85bSmv }
2232850d85bSmv 
2242850d85bSmv void
clock_tick_init_post(void)2252850d85bSmv clock_tick_init_post(void)
2262850d85bSmv {
2272850d85bSmv 	/*
2282850d85bSmv 	 * If a platform does not provide create_softint() and invoke_softint(),
2292850d85bSmv 	 * then we assume single threaded.
2302850d85bSmv 	 */
2312850d85bSmv 	if (&invoke_softint == NULL)
2322850d85bSmv 		clock_tick_threshold = 0;
2332850d85bSmv 
2342850d85bSmv 	ASSERT(clock_tick_threshold >= 0);
2352850d85bSmv 
2362850d85bSmv 	if (clock_tick_threshold == 0)
2372850d85bSmv 		clock_tick_threshold = max_ncpus;
2382850d85bSmv 
2392850d85bSmv 	/*
2402850d85bSmv 	 * If a platform does not specify a threshold or if the number of CPUs
2412850d85bSmv 	 * at boot time does not exceed the threshold, tick accounting remains
2422850d85bSmv 	 * single-threaded.
2432850d85bSmv 	 */
2442850d85bSmv 	if (ncpus <= clock_tick_threshold) {
2452850d85bSmv 		clock_tick_ncpus = max_ncpus;
2462850d85bSmv 		clock_tick_proc_max = 1;
2472850d85bSmv 		return;
2482850d85bSmv 	}
2492850d85bSmv 
2502850d85bSmv 	/*
2512850d85bSmv 	 * OK. Multi-thread tick processing. If a platform has not specified
2522850d85bSmv 	 * the CPU set size for multi-threading, then use the default value.
2532850d85bSmv 	 * This value has been arrived through measurements on large
2542850d85bSmv 	 * configuration systems.
2552850d85bSmv 	 */
2562850d85bSmv 	clock_tick_single_threaded = 0;
2572850d85bSmv 	if (clock_tick_proc_max == 0) {
2582850d85bSmv 		clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
2592850d85bSmv 		if (hires_tick)
2602850d85bSmv 			clock_tick_proc_max *= 10;
2612850d85bSmv 	}
2622850d85bSmv }
2632850d85bSmv 
2642850d85bSmv static void
clock_tick_schedule_one(clock_tick_set_t * csp,int pending,processorid_t cid)2652850d85bSmv clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
2662850d85bSmv {
2672850d85bSmv 	clock_tick_cpu_t	*ctp;
2682850d85bSmv 
2692850d85bSmv 	ASSERT(&invoke_softint != NULL);
27007247649SMadhavan Venkataraman 
27107247649SMadhavan Venkataraman 	atomic_inc_ulong(&clock_tick_active);
27207247649SMadhavan Venkataraman 
2732850d85bSmv 	/*
2742850d85bSmv 	 * Schedule tick accounting for a set of CPUs.
2752850d85bSmv 	 */
2762850d85bSmv 	ctp = clock_tick_cpu[cid];
2772850d85bSmv 	mutex_enter(&ctp->ct_lock);
278*1b7f7204SRafael Vanoni 	ctp->ct_lbolt = LBOLT_NO_ACCOUNT;
2792850d85bSmv 	ctp->ct_pending += pending;
2802850d85bSmv 	ctp->ct_start = csp->ct_start;
2812850d85bSmv 	ctp->ct_end = csp->ct_end;
2822850d85bSmv 	ctp->ct_scan = csp->ct_scan;
2832850d85bSmv 	mutex_exit(&ctp->ct_lock);
2842850d85bSmv 
2852850d85bSmv 	invoke_softint(cid, ctp->ct_intr);
2862850d85bSmv 	/*
2872850d85bSmv 	 * Return without waiting for the softint to finish.
2882850d85bSmv 	 */
2892850d85bSmv }
2902850d85bSmv 
2912850d85bSmv static void
clock_tick_process(cpu_t * cp,clock_t mylbolt,int pending)2922850d85bSmv clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
2932850d85bSmv {
2942850d85bSmv 	kthread_t	*t;
2952850d85bSmv 	kmutex_t	*plockp;
2962850d85bSmv 	int		notick, intr;
2972850d85bSmv 	klwp_id_t	lwp;
2982850d85bSmv 
2992850d85bSmv 	/*
3002850d85bSmv 	 * The locking here is rather tricky. thread_free_prevent()
3012850d85bSmv 	 * prevents the thread returned from being freed while we
3022850d85bSmv 	 * are looking at it. We can then check if the thread
3032850d85bSmv 	 * is exiting and get the appropriate p_lock if it
3042850d85bSmv 	 * is not.  We have to be careful, though, because
3052850d85bSmv 	 * the _process_ can still be freed while we've
3062850d85bSmv 	 * prevented thread free.  To avoid touching the
3072850d85bSmv 	 * proc structure we put a pointer to the p_lock in the
3082850d85bSmv 	 * thread structure.  The p_lock is persistent so we
3092850d85bSmv 	 * can acquire it even if the process is gone.  At that
3102850d85bSmv 	 * point we can check (again) if the thread is exiting
3112850d85bSmv 	 * and either drop the lock or do the tick processing.
3122850d85bSmv 	 */
3132850d85bSmv 	t = cp->cpu_thread;	/* Current running thread */
3142850d85bSmv 	if (CPU == cp) {
3152850d85bSmv 		/*
3162850d85bSmv 		 * 't' will be the tick processing thread on this
3172850d85bSmv 		 * CPU.  Use the pinned thread (if any) on this CPU
3182850d85bSmv 		 * as the target of the clock tick.
3192850d85bSmv 		 */
3202850d85bSmv 		if (t->t_intr != NULL)
3212850d85bSmv 			t = t->t_intr;
3222850d85bSmv 	}
3232850d85bSmv 
3242850d85bSmv 	/*
3252850d85bSmv 	 * We use thread_free_prevent to keep the currently running
3262850d85bSmv 	 * thread from being freed or recycled while we're
3272850d85bSmv 	 * looking at it.
3282850d85bSmv 	 */
3292850d85bSmv 	thread_free_prevent(t);
3302850d85bSmv 	/*
3312850d85bSmv 	 * We cannot hold the cpu_lock to prevent the
3322850d85bSmv 	 * cpu_active from changing in the clock interrupt.
3332850d85bSmv 	 * As long as we don't block (or don't get pre-empted)
3342850d85bSmv 	 * the cpu_list will not change (all threads are paused
3352850d85bSmv 	 * before list modification).
3362850d85bSmv 	 */
3372850d85bSmv 	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
3382850d85bSmv 		thread_free_allow(t);
3392850d85bSmv 		return;
3402850d85bSmv 	}
3412850d85bSmv 
3422850d85bSmv 	/*
3432850d85bSmv 	 * Make sure the thread is still on the CPU.
3442850d85bSmv 	 */
3452850d85bSmv 	if ((t != cp->cpu_thread) &&
3462850d85bSmv 	    ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
3472850d85bSmv 		/*
3482850d85bSmv 		 * We could not locate the thread. Skip this CPU. Race
3492850d85bSmv 		 * conditions while performing these checks are benign.
3502850d85bSmv 		 * These checks are not perfect and they don't need
3512850d85bSmv 		 * to be.
3522850d85bSmv 		 */
3532850d85bSmv 		thread_free_allow(t);
3542850d85bSmv 		return;
3552850d85bSmv 	}
3562850d85bSmv 
3572850d85bSmv 	intr = t->t_flag & T_INTR_THREAD;
3582850d85bSmv 	lwp = ttolwp(t);
3592850d85bSmv 	if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
3602850d85bSmv 		/*
3612850d85bSmv 		 * Thread is exiting (or uninteresting) so don't
3622850d85bSmv 		 * do tick processing.
3632850d85bSmv 		 */
3642850d85bSmv 		thread_free_allow(t);
3652850d85bSmv 		return;
3662850d85bSmv 	}
3672850d85bSmv 
3682850d85bSmv 	/*
3692850d85bSmv 	 * OK, try to grab the process lock.  See
3702850d85bSmv 	 * comments above for why we're not using
3712850d85bSmv 	 * ttoproc(t)->p_lockp here.
3722850d85bSmv 	 */
3732850d85bSmv 	plockp = t->t_plockp;
3742850d85bSmv 	mutex_enter(plockp);
3752850d85bSmv 	/* See above comment. */
3762850d85bSmv 	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
3772850d85bSmv 		mutex_exit(plockp);
3782850d85bSmv 		thread_free_allow(t);
3792850d85bSmv 		return;
3802850d85bSmv 	}
3812850d85bSmv 
3822850d85bSmv 	/*
3832850d85bSmv 	 * The thread may have exited between when we
3842850d85bSmv 	 * checked above, and when we got the p_lock.
3852850d85bSmv 	 */
3862850d85bSmv 	if (t->t_proc_flag & TP_LWPEXIT) {
3872850d85bSmv 		mutex_exit(plockp);
3882850d85bSmv 		thread_free_allow(t);
3892850d85bSmv 		return;
3902850d85bSmv 	}
3912850d85bSmv 
3922850d85bSmv 	/*
3932850d85bSmv 	 * Either we have the p_lock for the thread's process,
3942850d85bSmv 	 * or we don't care about the thread structure any more.
3952850d85bSmv 	 * Either way we can allow thread free.
3962850d85bSmv 	 */
3972850d85bSmv 	thread_free_allow(t);
3982850d85bSmv 
3992850d85bSmv 	/*
4002850d85bSmv 	 * If we haven't done tick processing for this
4012850d85bSmv 	 * lwp, then do it now. Since we don't hold the
4022850d85bSmv 	 * lwp down on a CPU it can migrate and show up
4032850d85bSmv 	 * more than once, hence the lbolt check. mylbolt
4042850d85bSmv 	 * is copied at the time of tick scheduling to prevent
4052850d85bSmv 	 * lbolt mismatches.
4062850d85bSmv 	 *
4072850d85bSmv 	 * Also, make sure that it's okay to perform the
4082850d85bSmv 	 * tick processing before calling clock_tick.
4092850d85bSmv 	 * Setting notick to a TRUE value (ie. not 0)
4102850d85bSmv 	 * results in tick processing not being performed for
4112850d85bSmv 	 * that thread.
4122850d85bSmv 	 */
4132850d85bSmv 	notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
4142850d85bSmv 	    (cp->cpu_dispthread == cp->cpu_idle_thread));
4152850d85bSmv 
4162850d85bSmv 	if ((!notick) && (t->t_lbolt < mylbolt)) {
4172850d85bSmv 		t->t_lbolt = mylbolt;
4182850d85bSmv 		clock_tick(t, pending);
4192850d85bSmv 	}
4202850d85bSmv 
4212850d85bSmv 	mutex_exit(plockp);
4222850d85bSmv }
4232850d85bSmv 
4242850d85bSmv void
clock_tick_schedule(int one_sec)4252850d85bSmv clock_tick_schedule(int one_sec)
4262850d85bSmv {
4272850d85bSmv 	ulong_t			active;
4282850d85bSmv 	int			i, end;
4292850d85bSmv 	clock_tick_set_t	*csp;
4302850d85bSmv 	cpu_t			*cp;
4312850d85bSmv 
4322850d85bSmv 	if (clock_cpu_id != CPU->cpu_id)
4332850d85bSmv 		clock_cpu_id = CPU->cpu_id;
4342850d85bSmv 
4352850d85bSmv 	if (clock_tick_single_threaded) {
4362850d85bSmv 		/*
4372850d85bSmv 		 * Each tick cycle, start the scan from a different
4382850d85bSmv 		 * CPU for the sake of fairness.
4392850d85bSmv 		 */
4402850d85bSmv 		end = clock_tick_total_cpus;
4412850d85bSmv 		clock_tick_scan++;
4422850d85bSmv 		if (clock_tick_scan >= end)
4432850d85bSmv 			clock_tick_scan = 0;
4442850d85bSmv 
445d3d50737SRafael Vanoni 		clock_tick_execute_common(0, clock_tick_scan, end,
446*1b7f7204SRafael Vanoni 		    LBOLT_NO_ACCOUNT, 1);
4472850d85bSmv 
4482850d85bSmv 		return;
4492850d85bSmv 	}
4502850d85bSmv 
4512850d85bSmv 	/*
4522850d85bSmv 	 * If the previous invocation of handlers is not yet finished, then
4532850d85bSmv 	 * simply increment a pending count and return. Eventually when they
4542850d85bSmv 	 * finish, the pending count is passed down to the next set of
4552850d85bSmv 	 * handlers to process. This way, ticks that have already elapsed
4562850d85bSmv 	 * in the past are handled as quickly as possible to minimize the
4572850d85bSmv 	 * chances of threads getting away before their pending ticks are
4582850d85bSmv 	 * accounted. The other benefit is that if the pending count is
4592850d85bSmv 	 * more than one, it can be handled by a single invocation of
4602850d85bSmv 	 * clock_tick(). This is a good optimization for large configuration
4612850d85bSmv 	 * busy systems where tick accounting can get backed up for various
4622850d85bSmv 	 * reasons.
4632850d85bSmv 	 */
4642850d85bSmv 	clock_tick_pending++;
4652850d85bSmv 
4662850d85bSmv 	active = clock_tick_active;
4672850d85bSmv 	active = atomic_cas_ulong(&clock_tick_active, active, active);
4682850d85bSmv 	if (active)
4692850d85bSmv 		return;
4702850d85bSmv 
4712850d85bSmv 	/*
4722850d85bSmv 	 * We want to handle the clock CPU here. If we
4732850d85bSmv 	 * scheduled the accounting for the clock CPU to another
4742850d85bSmv 	 * processor, that processor will find only the clock() thread
4752850d85bSmv 	 * running and not account for any user thread below it. Also,
4762850d85bSmv 	 * we want to handle this before we block on anything and allow
4772850d85bSmv 	 * the pinned thread below the current thread to escape.
4782850d85bSmv 	 */
479*1b7f7204SRafael Vanoni 	clock_tick_process(CPU, LBOLT_NO_ACCOUNT, clock_tick_pending);
4802850d85bSmv 
4812850d85bSmv 	mutex_enter(&clock_tick_lock);
4822850d85bSmv 
4832850d85bSmv 	/*
4842850d85bSmv 	 * Schedule each set on a separate processor.
4852850d85bSmv 	 */
4862850d85bSmv 	cp = clock_cpu_list;
4872850d85bSmv 	for (i = 0; i < clock_tick_nsets; i++) {
4882850d85bSmv 		csp = &clock_tick_set[i];
4892850d85bSmv 
4902850d85bSmv 		/*
4912850d85bSmv 		 * Pick the next online CPU in list for scheduling tick
4922850d85bSmv 		 * accounting. The clock_tick_lock is held by the caller.
4932850d85bSmv 		 * So, CPU online/offline cannot muck with this while
4942850d85bSmv 		 * we are picking our CPU to X-call.
4952850d85bSmv 		 */
4962850d85bSmv 		if (cp == CPU)
4972850d85bSmv 			cp = cp->cpu_next_onln;
4982850d85bSmv 
4992850d85bSmv 		/*
5002850d85bSmv 		 * Each tick cycle, start the scan from a different
5012850d85bSmv 		 * CPU for the sake of fairness.
5022850d85bSmv 		 */
5032850d85bSmv 		csp->ct_scan++;
5042850d85bSmv 		if (csp->ct_scan >= csp->ct_end)
5052850d85bSmv 			csp->ct_scan = csp->ct_start;
5062850d85bSmv 
5072850d85bSmv 		clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);
5082850d85bSmv 
5092850d85bSmv 		cp = cp->cpu_next_onln;
5102850d85bSmv 	}
5112850d85bSmv 
5122850d85bSmv 	if (one_sec) {
5132850d85bSmv 		/*
5142850d85bSmv 		 * Move the CPU pointer around every second. This is so
5152850d85bSmv 		 * all the CPUs can be X-called in a round-robin fashion
5162850d85bSmv 		 * to evenly distribute the X-calls. We don't do this
5172850d85bSmv 		 * at a faster rate than this because we don't want
5182850d85bSmv 		 * to affect cache performance negatively.
5192850d85bSmv 		 */
5202850d85bSmv 		clock_cpu_list = clock_cpu_list->cpu_next_onln;
5212850d85bSmv 	}
5222850d85bSmv 
5232850d85bSmv 	mutex_exit(&clock_tick_lock);
5242850d85bSmv 
5252850d85bSmv 	clock_tick_pending = 0;
5262850d85bSmv }
5272850d85bSmv 
5282850d85bSmv static void
clock_tick_execute_common(int start,int scan,int end,clock_t mylbolt,int pending)5292850d85bSmv clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
5302850d85bSmv 	int pending)
5312850d85bSmv {
5322850d85bSmv 	cpu_t		*cp;
5332850d85bSmv 	int		i;
5342850d85bSmv 
5352850d85bSmv 	ASSERT((start <= scan) && (scan <= end));
5362850d85bSmv 
5372850d85bSmv 	/*
5382850d85bSmv 	 * Handle the thread on current CPU first. This is to prevent a
5392850d85bSmv 	 * pinned thread from escaping if we ever block on something.
5402850d85bSmv 	 * Note that in the single-threaded mode, this handles the clock
5412850d85bSmv 	 * CPU.
5422850d85bSmv 	 */
5432850d85bSmv 	clock_tick_process(CPU, mylbolt, pending);
5442850d85bSmv 
5452850d85bSmv 	/*
5462850d85bSmv 	 * Perform tick accounting for the threads running on
5472850d85bSmv 	 * the scheduled CPUs.
5482850d85bSmv 	 */
5492850d85bSmv 	for (i = scan; i < end; i++) {
5502850d85bSmv 		cp = clock_tick_cpus[i];
5512850d85bSmv 		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
5522850d85bSmv 			continue;
5532850d85bSmv 		clock_tick_process(cp, mylbolt, pending);
5542850d85bSmv 	}
5552850d85bSmv 
5562850d85bSmv 	for (i = start; i < scan; i++) {
5572850d85bSmv 		cp = clock_tick_cpus[i];
5582850d85bSmv 		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
5592850d85bSmv 			continue;
5602850d85bSmv 		clock_tick_process(cp, mylbolt, pending);
5612850d85bSmv 	}
5622850d85bSmv }
5632850d85bSmv 
5642850d85bSmv /*ARGSUSED*/
5652850d85bSmv static uint_t
clock_tick_execute(caddr_t arg1,caddr_t arg2)5662850d85bSmv clock_tick_execute(caddr_t arg1, caddr_t arg2)
5672850d85bSmv {
5682850d85bSmv 	clock_tick_cpu_t	*ctp;
5692850d85bSmv 	int			start, scan, end, pending;
5702850d85bSmv 	clock_t			mylbolt;
5712850d85bSmv 
5722850d85bSmv 	/*
5732850d85bSmv 	 * We could have raced with cpu offline. We don't want to
5742850d85bSmv 	 * process anything on an offlined CPU. If we got blocked
5752850d85bSmv 	 * on anything, we may not get scheduled when we wakeup
5762850d85bSmv 	 * later on.
5772850d85bSmv 	 */
5782850d85bSmv 	if (!CLOCK_TICK_XCALL_SAFE(CPU))
57907247649SMadhavan Venkataraman 		goto out;
5802850d85bSmv 
58107247649SMadhavan Venkataraman 	ctp = clock_tick_cpu[CPU->cpu_id];
5822850d85bSmv 
5832850d85bSmv 	mutex_enter(&ctp->ct_lock);
5842850d85bSmv 	pending = ctp->ct_pending;
5852850d85bSmv 	if (pending == 0) {
5862850d85bSmv 		/*
5872850d85bSmv 		 * If a CPU is busy at LOCK_LEVEL, then an invocation
5882850d85bSmv 		 * of this softint may be queued for some time. In that case,
5892850d85bSmv 		 * clock_tick_active will not be incremented.
5902850d85bSmv 		 * clock_tick_schedule() will then assume that the previous
5912850d85bSmv 		 * invocation is done and post a new softint. The first one
5922850d85bSmv 		 * that gets in will reset the pending count so the
5932850d85bSmv 		 * second one is a noop.
5942850d85bSmv 		 */
5952850d85bSmv 		mutex_exit(&ctp->ct_lock);
5962850d85bSmv 		goto out;
5972850d85bSmv 	}
5982850d85bSmv 	ctp->ct_pending = 0;
5992850d85bSmv 	start = ctp->ct_start;
6002850d85bSmv 	end = ctp->ct_end;
6012850d85bSmv 	scan = ctp->ct_scan;
6022850d85bSmv 	mylbolt = ctp->ct_lbolt;
6032850d85bSmv 	mutex_exit(&ctp->ct_lock);
6042850d85bSmv 
6052850d85bSmv 	clock_tick_execute_common(start, scan, end, mylbolt, pending);
6062850d85bSmv 
6072850d85bSmv out:
6082850d85bSmv 	/*
6092850d85bSmv 	 * Signal completion to the clock handler.
6102850d85bSmv 	 */
6112850d85bSmv 	atomic_dec_ulong(&clock_tick_active);
6122850d85bSmv 
6132850d85bSmv 	return (1);
6142850d85bSmv }
6152850d85bSmv 
6162850d85bSmv /*ARGSUSED*/
6172850d85bSmv static int
clock_tick_cpu_setup(cpu_setup_t what,int cid,void * arg)6182850d85bSmv clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
6192850d85bSmv {
6202850d85bSmv 	cpu_t			*cp, *ncp;
6212850d85bSmv 	int			i, set;
6222850d85bSmv 	clock_tick_set_t	*csp;
6232850d85bSmv 
6242850d85bSmv 	/*
6252850d85bSmv 	 * This function performs some computations at CPU offline/online
6262850d85bSmv 	 * time. The computed values are used during tick scheduling and
6272850d85bSmv 	 * execution phases. This avoids having to compute things on
6282850d85bSmv 	 * an every tick basis. The other benefit is that we perform the
6292850d85bSmv 	 * computations only for onlined CPUs (not offlined ones). As a
6302850d85bSmv 	 * result, no tick processing is attempted for offlined CPUs.
6312850d85bSmv 	 *
6322850d85bSmv 	 * Also, cpu_offline() calls this function before checking for
6332850d85bSmv 	 * active interrupt threads. This allows us to avoid posting
6342850d85bSmv 	 * cross calls to CPUs that are being offlined.
6352850d85bSmv 	 */
6362850d85bSmv 
6372850d85bSmv 	cp = cpu[cid];
6382850d85bSmv 
6392850d85bSmv 	mutex_enter(&clock_tick_lock);
6402850d85bSmv 
6412850d85bSmv 	switch (what) {
6422850d85bSmv 	case CPU_ON:
6432850d85bSmv 		clock_tick_cpus[clock_tick_total_cpus] = cp;
6442850d85bSmv 		set = clock_tick_total_cpus / clock_tick_ncpus;
6452850d85bSmv 		csp = &clock_tick_set[set];
6462850d85bSmv 		csp->ct_end++;
6472850d85bSmv 		clock_tick_total_cpus++;
6482850d85bSmv 		clock_tick_nsets =
6492850d85bSmv 		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
6502850d85bSmv 		    clock_tick_ncpus;
6512850d85bSmv 		CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
6522850d85bSmv 		membar_sync();
6532850d85bSmv 		break;
6542850d85bSmv 
6552850d85bSmv 	case CPU_OFF:
6562850d85bSmv 		if (&sync_softint != NULL)
6572850d85bSmv 			sync_softint(clock_tick_online_cpuset);
6582850d85bSmv 		CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
6592850d85bSmv 		clock_tick_total_cpus--;
6602850d85bSmv 		clock_tick_cpus[clock_tick_total_cpus] = NULL;
6612850d85bSmv 		clock_tick_nsets =
6622850d85bSmv 		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
6632850d85bSmv 		    clock_tick_ncpus;
6642850d85bSmv 		set = clock_tick_total_cpus / clock_tick_ncpus;
6652850d85bSmv 		csp = &clock_tick_set[set];
6662850d85bSmv 		csp->ct_end--;
6672850d85bSmv 
6682850d85bSmv 		i = 0;
6692850d85bSmv 		ncp = cpu_active;
6702850d85bSmv 		do {
6712850d85bSmv 			if (cp == ncp)
6722850d85bSmv 				continue;
6732850d85bSmv 			clock_tick_cpus[i] = ncp;
6742850d85bSmv 			i++;
6752850d85bSmv 		} while ((ncp = ncp->cpu_next_onln) != cpu_active);
6762850d85bSmv 		ASSERT(i == clock_tick_total_cpus);
6772850d85bSmv 		membar_sync();
6782850d85bSmv 		break;
6792850d85bSmv 
6802850d85bSmv 	default:
6812850d85bSmv 		break;
6822850d85bSmv 	}
6832850d85bSmv 
6842850d85bSmv 	mutex_exit(&clock_tick_lock);
6852850d85bSmv 
6862850d85bSmv 	return (0);
6872850d85bSmv }
6882850d85bSmv 
6892850d85bSmv 
6902850d85bSmv void
clock_tick_mp_init(void)6912850d85bSmv clock_tick_mp_init(void)
6922850d85bSmv {
6932850d85bSmv 	cpu_t	*cp;
6942850d85bSmv 
6952850d85bSmv 	mutex_enter(&cpu_lock);
6962850d85bSmv 
6972850d85bSmv 	cp = cpu_active;
6982850d85bSmv 	do {
6992850d85bSmv 		(void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
7002850d85bSmv 	} while ((cp = cp->cpu_next_onln) != cpu_active);
7012850d85bSmv 
7022850d85bSmv 	register_cpu_setup_func(clock_tick_cpu_setup, NULL);
7032850d85bSmv 
7042850d85bSmv 	mutex_exit(&cpu_lock);
7052850d85bSmv }
706