/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
#include <sys/smt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t *olddispq;
	dispq_t *newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);
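
/*
 * Note on the resize protocol implemented by these three functions:
 * disp_dq_alloc() grabs the larger arrays while it is still safe to
 * sleep, disp_dq_assign() installs them (it runs with the CPUs paused,
 * so it must not block), and disp_dq_free() releases the old arrays
 * once the CPUs are running again.  See cpu_dispqalloc() for the full
 * sequence.
 */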

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is measured in clock ticks.
 */
#define	RECHOOSE_INTERVAL 3
int	rechoose_interval = RECHOOSE_INTERVAL;

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, in order
 * to reduce migrations.
 *
 * nosteal_nsec should be set by platform code, via
 * cmp_set_nosteal_interval(), to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}
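
/*
 * Worked example (illustrative numbers only): if the highest priority
 * reported by any scheduling class is 109 and LOCK_LEVEL is 10, then
 * disp_setup() sizes everything for 109 + 1 + 10 = 120 global
 * priorities, reserving the levels above the classes for interrupt
 * threads.
 */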

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the size
	 * of the dispq may have changed.  disp_setup() handles that.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be released until
	 * they are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL, NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * All of the memory must be freed after starting the cpus because
	 * we can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
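	/*
	 * Note: disp_q gets one dispq_t per global priority, while the
	 * active-queue bitmap gets one bit per priority, rounded up to a
	 * whole number of ulongs (BT_NBIPUL bits apiece).
	 */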
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}
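
/*
 * Note that disp_kp_free() frees only the queue arrays; the disp_t
 * itself belongs to the caller (compare disp_cpu_fini() above, which
 * frees the disp_t separately when one was allocated).
 */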

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
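
	/*
	 * Note: disp_nrunnable is re-read through this volatile pointer
	 * on each pass of the loop below, so work appearing on our own
	 * queue ends the walk as early as possible.
	 */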

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork(),
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues. If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * This thread has already been chosen to run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
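
/*
 * Note: if the disp_ratify() call above fails, a better thread appeared
 * on the local or kp queue while we were dequeueing; the candidate has
 * already been put back on the front of its queue, and disp() simply
 * starts over at reschedule.
 */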

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Set up and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if it
			 * is remaining on the CPU.
			 * There is, however, a window between where the thread
			 * placed itself on a run queue and where it selects
			 * itself in disp(), during which a third party (e.g.
			 * clock() doing tick processing) may have re-enqueued
			 * this thread, setting t_waitrq in the process. We
			 * detect this race by noticing that, despite switching
			 * to ourself, our t_waitrq has been set and should be
			 * cleared.
			 */
			if (t->t_waitrq != 0)
				t->t_waitrq = 0;

			pg_ev_thread_remain(cp, t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))

/*
 * search_disp_queues()
 *	Search the given dispatch queues for thread tp.
 *	Return 1 if tp is found, otherwise return 0.
 */
static int
search_disp_queues(disp_t *dp, kthread_t *tp)
{
	dispq_t		*dq;
	dispq_t		*eq;

	disp_lock_enter_high(&dp->disp_lock);

	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
		kthread_t	*rp;

		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

		for (rp = dq->dq_first; rp; rp = rp->t_link)
			if (tp == rp) {
				disp_lock_exit_high(&dp->disp_lock);
				return (1);
			}
	}
	disp_lock_exit_high(&dp->disp_lock);

	return (0);
}

/*
 * thread_on_queue()
 *	Search all per-CPU dispatch queues and all partition-wide kpreempt
 *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t		*cp;
	struct cpupart	*part;

	ASSERT(getpil() >= DISP_LEVEL);

	/*
	 * Search the per-CPU dispatch queues for tp.
	 */
	cp = CPU;
	do {
		if (search_disp_queues(cp->cpu_disp, tp))
			return (1);
	} while ((cp = cp->cpu_next_onln) != CPU);

	/*
	 * Search the partition-wide kpreempt queues for tp.
	 */
	part = CPU->cpu_part;
	do {
		if (search_disp_queues(&part->cp_kp_queue, tp))
			return (1);
	} while ((part = part->cp_next) != CPU->cpu_part);

	return (0);
}

#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

/*
 * Like swtch(), but switch to the specified thread taken from another CPU.
 *	Called with spl high.
 */
void
swtch_to(kthread_t *next)
{
	cpu_t			*cp = CPU;
	hrtime_t		now;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	now = gethrtime_unscaled();
	pg_ev_thread_swtch(cp, now, curthread, next);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq.  Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = now;
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t	cpupri = cp->cpu_dispatch_pri;

	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}
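
/*
 * In summary (no new mechanism here): cpu_runrun requests an ordinary
 * preemption the next time the target thread heads back toward user
 * level, while cpu_kprunrun requests the more urgent kernel preemption;
 * poke_cpu() is only needed when the target is some other CPU.
 */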

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq
 * lengths must match.  When the per-thread TS_RUNQMATCH flag is set,
 * setbackdq() will try to keep runqs perfectly balanced regardless of
 * the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
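
/*
 * Worked example (illustrative only): a priority-60 thread is above
 * RUNQ_MATCH_PRI, so with RUNQ_MAX_DIFF == 2 its chosen CPU's queue
 * must hold at least three more queued threads at that priority than a
 * neighboring CPU's queue before setbackdq() will migrate it there.
 */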

/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is less than "rechoose_interval"
 * ticks, then we decide that the thread has enough cache warmth to warrant
 * some affinity for t->t_cpu.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	((thread == curthread) ||	\
	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;
	boolean_t	self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If the thread is "swapped" or on the swap queue, don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	self = (tp == curthread);

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}

		/*
		 * We'll generally let this thread continue to run where
		 * it last ran...but will consider migration if:
		 * - The thread probably doesn't have much cache warmth.
		 * - SMT exclusion would prefer us to run elsewhere
		 * - The CPU where it last ran is the target of an offline
		 *   request.
		 * - The thread last ran outside its home lgroup.
		 */
		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
		    !smt_should_run(tp, tp->t_cpu) ||
		    (tp->t_cpu == cpu_inmotion) ||
		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
		} else {
			cp = tp->t_cpu;
		}

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (smt_should_run(tp, newcp) &&
				    RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
			    tp->t_pri);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If the thread is "swapped" or on the swap queue, don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * We'll generally let this thread continue to run
			 * where it last ran, but will consider migration if:
			 * - The thread last ran outside its home lgroup.
			 * - The CPU where it last ran is the target of an
			 *   offline request (a thread_nomigrate() on the in
			 *   motion CPU relies on this when forcing a preempt).
			 * - The thread isn't the highest priority thread where
			 *   it last ran, and it is considered not likely to
			 *   have significant cache warmth.
			 */
			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
			    cp == cpu_inmotion ||
			    (tpri < cp->cpu_disp->disp_maxrunpri &&
			    !THREAD_HAS_CACHE_WARMTH(tp))) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp, tp->t_pri);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}
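
/*
 * Note: setkpdq() leaves the thread on the partition-wide kp queue; the
 * chosen CPU is merely nudged via cpu_resched(), and that CPU (or
 * whichever one gets there first) will pick the thread up in disp().
 */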

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found, but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock, and therefore the dispatcher
 * queue lock, so that the operation which changes the flag, the
 * operation that checks the status of the thread to determine if it's
 * on a disp queue, AND the call to this function are one atomic
 * operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded, or is loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
	}
}

/*
 *	Make a thread give up its processor.  Find the processor on
 *	which this thread is executing, and have that processor
 *	preempt.
 *
 *	We allow System Duty Cycle (SDC) threads to be preempted even if
 *	they are running at kernel priorities.  To implement this, we always
 *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
 *	calls cpu_surrender() very often, we only preempt if there is anyone
 *	competing with us.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	if (tp->t_cid == sysdccid) {
		uint_t t_pri = DISP_PRIO(tp);
		if (t_pri > max_pri)
			return;		/* we are not competing w/ anyone */
		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
	} else {
		cpup->cpu_runrun = 1;
		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
			cpup->cpu_kprunrun = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}

/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * We should have done better.  Put this one back
		 * and indicate that we should try again.
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}
1892
1893/*
1894 * See if there is any work on the dispatcher queue for other CPUs.
1895 * If there is, dequeue the best thread and return.
1896 */
1897static kthread_t *
1898disp_getwork(cpu_t *cp)
1899{
1900	cpu_t		*ocp;		/* other CPU */
1901	cpu_t		*ocp_start;
1902	cpu_t		*tcp;		/* target local CPU */
1903	kthread_t	*tp;
1904	kthread_t	*retval = NULL;
1905	pri_t		maxpri;
1906	disp_t		*kpq;		/* kp queue for this partition */
1907	lpl_t		*lpl, *lpl_leaf;
1908	int		leafidx, startidx;
1909	hrtime_t	stealtime;
1910	lgrp_id_t	local_id;
1911
1912	maxpri = -1;
1913	tcp = NULL;
1914
1915	kpq = &cp->cpu_part->cp_kp_queue;
1916	while (kpq->disp_maxrunpri >= 0) {
1917		/*
1918		 * Try to take a thread from the kp_queue.
1919		 */
1920		tp = (disp_getbest(kpq));
1921		if (tp)
1922			return (disp_ratify(tp, kpq));
1923	}
1924
1925	kpreempt_disable();		/* protect the cpu_active list */
1926
1927	/*
1928	 * Try to find something to do on another CPU's run queue.
1929	 * Loop through all other CPUs looking for the one with the highest
1930	 * priority unbound thread.
1931	 *
1932	 * On NUMA machines, the partition's CPUs are consulted in order of
1933	 * distance from the current CPU. This way, the first available
1934	 * work found is also the closest, and will suffer the least
1935	 * from being migrated.
1936	 */
1937	lpl = lpl_leaf = cp->cpu_lpl;
1938	local_id = lpl_leaf->lpl_lgrpid;
1939	leafidx = startidx = 0;
1940
1941	/*
1942	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1943	 * broader levels of locality
1944	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
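					/*
					 * disp_steal is an absolute
					 * deadline; once gethrtime()
					 * passes it, threads on that
					 * queue may be stolen again.
					 */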
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a
			 * reason; see lpl_topo_bootstrap() in lgrp.c for
			 * details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPU's local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}

/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
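	/*
	 * Illustrative example (assuming a 64-bit kernel, where BT_ULSHIFT
	 * is 6 and each map word covers 64 priorities): for pri == 70,
	 * wx == 1 and BT_BIW(70) == 0x40, so the mask BT_BIW(pri) - 1
	 * selects bits 0-5 of dqactmap[1], i.e. priorities 64-69.  If the
	 * masked word is 0x24 (priorities 66 and 69 active), highbit()
	 * returns 6 and the next candidate is (1 << 6) + 6 - 1 == 69.
	 */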
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing is found.
 *   Returns T_DONTSTEAL if the thread was not stealable, so that the caller
 *   will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than the per-chip
		 *   nosteal interval or the global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate the time spent sitting on the run queue.
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);
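		/*
		 * Note: t_waitrq and gethrtime_unscaled() are in unscaled
		 * time units, so the delta must go through scalehrtime()
		 * before it can be compared against the nanosecond-valued
		 * nosteal interval.
		 */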

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than the allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable.
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate the time when some thread becomes stealable.
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from the queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the
	 * smallest value across the queue is.
	 */
	dp->disp_steal = 0;

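	/*
	 * Keep the thread from being swapped out while we finish
	 * dispatching it.
	 */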
	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Set up the thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread. This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 * (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (e.g., if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 * partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}

/*
 * Return a score rating this CPU for running this thread: lower is better.
 *
 * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
 * curcpu (as that's our own priority).
 *
 * If a cpu is the target of an offline request, then try to avoid it.
 *
 * Otherwise we'll use double the effective dispatcher priority for the CPU.
 *
 * We do this so smt_adjust_cpu_score() can increment the score if needed,
 * without ending up overriding a dispatcher priority.
 */
static pri_t
cpu_score(cpu_t *cp, kthread_t *tp)
{
	pri_t score;

	if (tp == curthread && cp == curthread->t_cpu)
		score = 2 * CPU_IDLE_PRI;
	else if (cp == cpu_inmotion)
		score = SHRT_MAX;
	else
		score = 2 * cp->cpu_dispatch_pri;

	if (2 * cp->cpu_disp->disp_maxrunpri > score)
		score = 2 * cp->cpu_disp->disp_maxrunpri;
	if (2 * cp->cpu_chosen_level > score)
		score = 2 * cp->cpu_chosen_level;

	return (smt_adjust_cpu_score(tp, cp, score));
}
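
/*
 * Illustrative example (not from the original source): a CPU running a
 * priority-59 thread scores 2 * 59 == 118.  The doubling leaves room for
 * smt_adjust_cpu_score() to add a one-point SMT-contention penalty
 * (yielding 119) while still ranking below a CPU whose effective priority
 * is 60 (score 120), so the penalty can break ties between otherwise
 * equivalent CPUs but never outweighs a full dispatcher priority level.
 */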

/*
 * disp_lowpri_cpu - find a suitable CPU to run the given thread.
 *
 * We are looking for a CPU with an effective dispatch priority lower than the
 * thread's, so that the thread will run immediately rather than be enqueued.
 * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
 * If we don't find an available CPU there, we will expand our search to include
 * wider locality levels. (Note these groups are already divided by CPU
 * partition.)
 *
 * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
 * the best home CPU we found.
 *
 * The hint passed in is used as a starting point so we don't favor CPU 0 or any
 * other CPU.  The caller should pass in the most recently used CPU for the
 * thread; it's of course possible that this CPU isn't in the home lgroup.
 *
 * This function must be called at either high SPL, or with preemption disabled,
 * so that the "hint" CPU cannot be removed from the online CPU list while we
 * are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t   *cp, *cpstart;

	klgrpset_t	done;

	lpl_t		*lpl_iter, *lpl_leaf;

	ASSERT(hint != NULL);
	ASSERT(tp->t_lpl->lpl_ncpu > 0);

	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);

	lpl_iter = tp->t_lpl;

	do {
		pri_t best = SHRT_MAX;
		klgrpset_t cur_set;

		klgrpset_clear(cur_set);

		for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				pri_t score = cpu_score(cp, tp);

				if (score < best) {
					best = score;
					bestcpu = cp;

					/* An idle CPU: we're done. */
					if (score / 2 == CPU_IDLE_PRI)
						goto out;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu != NULL && tpri > (best / 2))
			goto out;

		if (besthomecpu == NULL)
			besthomecpu = bestcpu;

		/*
		 * Add the lgrps we just considered to the "done" set.
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	bestcpu = besthomecpu;

out:
	ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
	return (bestcpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

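/*
 * disp_choose_best_cpu - choose a CPU for the current thread to run on.
 *
 * Called with preemption disabled by a VCPU thread that is still onproc
 * (see the ASSERTs below).  Returns the current CPU if SMT policy allows
 * the thread to keep running there; otherwise falls back to
 * disp_lowpri_cpu() to find a better alternative.
 */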
cpu_t *
disp_choose_best_cpu(void)
{
	kthread_t *t = curthread;
	cpu_t *curcpu = CPU;

	ASSERT(t->t_preempt > 0);
	ASSERT(t->t_state == TS_ONPROC);
	ASSERT(t->t_schedflag & TS_VCPU);

	if (smt_should_run(t, curcpu))
		return (curcpu);

	return (disp_lowpri_cpu(curcpu, t, t->t_pri));
}
