/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
#include <sys/smt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4
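
/*
 * (Illustrative note, not taken verbatim from this file: the BOUND_* values
 * above form a bitmask describing which kinds of thread bindings a caller
 * cares about -- binding to a specific CPU, to a processor set/partition,
 * or being a bound interrupt thread.  The binding-check helpers elsewhere
 * in this file are the assumed consumers of this mask.)
 */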

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t *olddispq;
	dispq_t *newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is measured in clock ticks.
 */
#define	RECHOOSE_INTERVAL 3
int	rechoose_interval = RECHOOSE_INTERVAL;

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU,
 * to reduce migrations.
 *
 * nosteal_nsec should be set by the platform code's
 * cmp_set_nosteal_interval() to an appropriate value.  It is initialized
 * to NOSTEAL_UNINITIALIZED here to indicate that it has not yet been set.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 *
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
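
/*
 * Illustrative sketch (assumed from the comments above; the real check lives
 * in disp_getbest(), later in this file): when an idle CPU considers
 * stealing a queued thread tp, it roughly does
 *
 *	rqtime = gethrtime_unscaled() - tp->t_waitrq;
 *	scalehrtime(&rqtime);
 *	if (nosteal_nsec != 0 && rqtime < nosteal_nsec)
 *		return (T_DONTSTEAL);
 *
 * so a thread that has only just been enqueued is left where it is and the
 * caller (disp_getwork()/idle()) simply tries again.
 */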

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}
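
/*
 * Worked example of the sizing above (the numbers are illustrative, not
 * authoritative): if the highest global priority reported by the loaded
 * scheduling classes were 159 and LOCK_LEVEL were 10, disp_setup() would
 * provision 159 + 1 + 10 = 170 global priorities (0..169), with the extra
 * LOCK_LEVEL slots sitting above intr_pri for use by interrupt threads.
 */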

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, then
	 * the size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL, NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}
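
/*
 * Sizing note (illustrative): disp_qactmap is a bitmap with one bit per
 * priority level, allocated in ulong_t-sized chunks of BT_NBIPUL bits.
 * For example, with 170 global priorities and a 64-bit ulong_t this comes
 * to (170 / 64) + 1 = 3 words; the "+ 1" rounds up to cover the partial
 * trailing word.
 */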

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork(),
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}
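
/*
 * Usage sketch (hypothetical caller, for illustration only): platform idle
 * code that is about to halt a CPU can poll this cheaply before committing:
 *
 *	if (disp_anywork())
 *		return;		back to idle() instead of halting
 *	mach_cpu_halt();	hypothetical platform halt routine
 *
 * No dispatcher locks are taken here, so the answer is only a hint; the
 * caller must still be prepared to be poked/woken after halting.
 */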

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues. If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.   If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if it
			 * is remaining on the CPU.
			 * There is however a window between where the thread
			 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (eg. clock()
			 * doing tick processing) may have re-enqueued this
			 * thread, setting t_waitrq in the process. We detect
			 * this race by noticing that despite switching to
			 * ourself, our t_waitrq has been set, and should be
			 * cleared.
			 */
			if (t->t_waitrq != 0)
				t->t_waitrq = 0;

			pg_ev_thread_remain(cp, t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))

/*
 * search_disp_queues()
 *	Search the given dispatch queues for thread tp.
 *	Return 1 if tp is found, otherwise return 0.
 */
static int
search_disp_queues(disp_t *dp, kthread_t *tp)
{
	dispq_t		*dq;
	dispq_t		*eq;

	disp_lock_enter_high(&dp->disp_lock);

	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
		kthread_t	*rp;

		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

		for (rp = dq->dq_first; rp; rp = rp->t_link)
			if (tp == rp) {
				disp_lock_exit_high(&dp->disp_lock);
				return (1);
			}
	}
	disp_lock_exit_high(&dp->disp_lock);

	return (0);
}

/*
 * thread_on_queue()
 *	Search all per-CPU dispatch queues and all partition-wide kpreempt
 *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t		*cp;
	struct cpupart	*part;

	ASSERT(getpil() >= DISP_LEVEL);

	/*
	 * Search the per-CPU dispatch queues for tp.
	 */
	cp = CPU;
	do {
		if (search_disp_queues(cp->cpu_disp, tp))
			return (1);
	} while ((cp = cp->cpu_next_onln) != CPU);

	/*
	 * Search the partition-wide kpreempt queues for tp.
	 */
	part = CPU->cpu_part;
	do {
		if (search_disp_queues(&part->cp_kp_queue, tp))
			return (1);
	} while ((part = part->cp_next) != CPU->cpu_part);

	return (0);
}

#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high.
10747c478bdstevel@tonic-gate */
10757c478bdstevel@tonic-gatevoid
10767c478bdstevel@tonic-gateswtch_to(kthread_t *next)
10777c478bdstevel@tonic-gate{
10787c478bdstevel@tonic-gate	cpu_t			*cp = CPU;
10790e75152Eric Saxe	hrtime_t		now;
10807c478bdstevel@tonic-gate
10817c478bdstevel@tonic-gate	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
10827c478bdstevel@tonic-gate
10837c478bdstevel@tonic-gate	/*
10847c478bdstevel@tonic-gate	 * Update context switch statistics.
10857c478bdstevel@tonic-gate	 */
10867c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
10877c478bdstevel@tonic-gate
10887c478bdstevel@tonic-gate	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
10897c478bdstevel@tonic-gate
10900e75152Eric Saxe	now = gethrtime_unscaled();
10910e75152Eric Saxe	pg_ev_thread_swtch(cp, now, curthread, next);
10927c478bdstevel@tonic-gate
10937c478bdstevel@tonic-gate	/* OK to steal anything left on run queue */
10947c478bdstevel@tonic-gate	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
10957c478bdstevel@tonic-gate
10967c478bdstevel@tonic-gate	/* record last execution time */
1097d3d5073Rafael Vanoni	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
10987c478bdstevel@tonic-gate
1099f2bd462johansen	/*
1100f2bd462johansen	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1101f2bd462johansen	 * won't have set its t_waitrq.  Since we now finally know that we're
1102f2bd462johansen	 * switching away from this thread, set its t_waitrq if it is on a run
1103f2bd462johansen	 * queue.
1104f2bd462johansen	 */
1105f2bd462johansen	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
11060e75152Eric Saxe		curthread->t_waitrq = now;
1107f2bd462johansen	}
1108f2bd462johansen
1109f2bd462johansen	/* restore next thread to previously running microstate */
1110f2bd462johansen	restore_mstate(next);
1111f2bd462johansen
11127c478bdstevel@tonic-gate	if (dtrace_vtime_active)
11137c478bdstevel@tonic-gate		dtrace_vtime_switch(next);
11147c478bdstevel@tonic-gate
11157c478bdstevel@tonic-gate	resume(next);
11167c478bdstevel@tonic-gate	/*
11177c478bdstevel@tonic-gate	 * The TR_RESUME_END and TR_SWTCH_END trace points
11187c478bdstevel@tonic-gate	 * appear at the end of resume(), because we may not
11197c478bdstevel@tonic-gate	 * return here
11207c478bdstevel@tonic-gate	 */
11217c478bdstevel@tonic-gate}
11227c478bdstevel@tonic-gate
11237c478bdstevel@tonic-gatestatic void
11247c478bdstevel@tonic-gatecpu_resched(cpu_t *cp, pri_t tpri)
11257c478bdstevel@tonic-gate{
11267c478bdstevel@tonic-gate	int	call_poke_cpu = 0;
11277c478bdstevel@tonic-gate	pri_t   cpupri = cp->cpu_dispatch_pri;
11287c478bdstevel@tonic-gate
1129455e370John Levon	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
11307c478bdstevel@tonic-gate		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
11317c478bdstevel@tonic-gate		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
11327c478bdstevel@tonic-gate		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
11337c478bdstevel@tonic-gate			cp->cpu_runrun = 1;
11347c478bdstevel@tonic-gate			aston(cp->cpu_dispthread);
11357c478bdstevel@tonic-gate			if (tpri < kpreemptpri && cp != CPU)
11367c478bdstevel@tonic-gate				call_poke_cpu = 1;
11377c478bdstevel@tonic-gate		}
11387c478bdstevel@tonic-gate		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
11397c478bdstevel@tonic-gate			cp->cpu_kprunrun = 1;
11407c478bdstevel@tonic-gate			if (cp != CPU)
11417c478bdstevel@tonic-gate				call_poke_cpu = 1;
11427c478bdstevel@tonic-gate		}
11437c478bdstevel@tonic-gate	}
11447c478bdstevel@tonic-gate
11457c478bdstevel@tonic-gate	/*
11467c478bdstevel@tonic-gate	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
11477c478bdstevel@tonic-gate	 */
11487c478bdstevel@tonic-gate	membar_enter();
11497c478bdstevel@tonic-gate
11507c478bdstevel@tonic-gate	if (call_poke_cpu)
11517c478bdstevel@tonic-gate		poke_cpu(cp->cpu_id);
11527c478bdstevel@tonic-gate}
11537c478bdstevel@tonic-gate
11547c478bdstevel@tonic-gate/*
11557c478bdstevel@tonic-gate * setbackdq() keeps runqs balanced such that the difference in length
11567c478bdstevel@tonic-gate * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
11577c478bdstevel@tonic-gate * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
11587c478bdstevel@tonic-gate * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
11597c478bdstevel@tonic-gate * try to keep runqs perfectly balanced regardless of the thread priority.
11607c478bdstevel@tonic-gate */
11617c478bdstevel@tonic-gate#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
11627c478bdstevel@tonic-gate#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
11637c478bdstevel@tonic-gate#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
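/*
 * For example, with RUNQ_MAX_DIFF of 2, setbackdq() will only migrate a
 * thread at or above RUNQ_MATCH_PRI (and without TS_RUNQMATCH set) when the
 * candidate queue is more than two entries shorter at that priority; below
 * RUNQ_MATCH_PRI, any strictly shorter queue is preferred.
 */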
11647c478bdstevel@tonic-gate
11657c478bdstevel@tonic-gate/*
11666890d02Eric Saxe * Macro that evaluates to true if it is likely that the thread has cache
11676890d02Eric Saxe * warmth. This is based on the amount of time that has elapsed since the
11686890d02Eric Saxe * thread last ran. If that amount of time is less than "rechoose_interval"
11696890d02Eric Saxe * ticks, then we decide that the thread has enough cache warmth to warrant
11706890d02Eric Saxe * some affinity for t->t_cpu.
11716890d02Eric Saxe */
11726890d02Eric Saxe#define	THREAD_HAS_CACHE_WARMTH(thread)	\
11736890d02Eric Saxe	((thread == curthread) ||	\
1174d3d5073Rafael Vanoni	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
11756890d02Eric Saxe/*
11767c478bdstevel@tonic-gate * Put the specified thread on the back of the dispatcher
11777c478bdstevel@tonic-gate * queue corresponding to its current priority.
11787c478bdstevel@tonic-gate *
11797c478bdstevel@tonic-gate * Called with the thread in transition, onproc or stopped state
11807c478bdstevel@tonic-gate * and locked (transition implies locked) and at high spl.
11817c478bdstevel@tonic-gate * Returns with the thread in TS_RUN state and still locked.
11827c478bdstevel@tonic-gate */
11837c478bdstevel@tonic-gatevoid
11847c478bdstevel@tonic-gatesetbackdq(kthread_t *tp)
11857c478bdstevel@tonic-gate{
11867c478bdstevel@tonic-gate	dispq_t	*dq;
11877c478bdstevel@tonic-gate	disp_t		*dp;
11887c478bdstevel@tonic-gate	cpu_t		*cp;
11897c478bdstevel@tonic-gate	pri_t		tpri;
11907c478bdstevel@tonic-gate	int		bound;
11916890d02Eric Saxe	boolean_t	self;
11927c478bdstevel@tonic-gate
11937c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
11947c478bdstevel@tonic-gate	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
11957c478bdstevel@tonic-gate	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
11967c478bdstevel@tonic-gate
11977c478bdstevel@tonic-gate	/*
11987c478bdstevel@tonic-gate	 * If the thread is "swapped" or on the swap queue, don't
11997c478bdstevel@tonic-gate	 * queue it, but wake sched.
12007c478bdstevel@tonic-gate	 */
12017c478bdstevel@tonic-gate	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
12027c478bdstevel@tonic-gate		disp_swapped_setrun(tp);
12037c478bdstevel@tonic-gate		return;
12047c478bdstevel@tonic-gate	}
12057c478bdstevel@tonic-gate
12066890d02Eric Saxe	self = (tp == curthread);
12076890d02Eric Saxe
1208abd4158gd	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209abd4158gd		bound = 1;
1210abd4158gd	else
1211abd4158gd		bound = 0;
1212abd4158gd
12137c478bdstevel@tonic-gate	tpri = DISP_PRIO(tp);
12147c478bdstevel@tonic-gate	if (ncpus == 1)
12157c478bdstevel@tonic-gate		cp = tp->t_cpu;
1216abd4158gd	else if (!bound) {
12177c478bdstevel@tonic-gate		if (tpri >= kpqpri) {
12187c478bdstevel@tonic-gate			setkpdq(tp, SETKP_BACK);
12197c478bdstevel@tonic-gate			return;
12207c478bdstevel@tonic-gate		}
12216890d02Eric Saxe
12227c478bdstevel@tonic-gate		/*
12236890d02Eric Saxe		 * We'll generally let this thread continue to run where
12246890d02Eric Saxe		 * it last ran, but will consider migration if:
1225455e370John Levon		 * - The thread probably doesn't have much cache warmth.
1226c3377eeJohn Levon		 * - SMT exclusion would prefer us to run elsewhere.
12276890d02Eric Saxe		 * - The CPU where it last ran is the target of an offline
12286890d02Eric Saxe		 *   request.
1229455e370John Levon		 * - The thread last ran outside its home lgroup.
12307c478bdstevel@tonic-gate		 */
12316890d02Eric Saxe		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1232c3377eeJohn Levon		    !smt_should_run(tp, tp->t_cpu) ||
1233455e370John Levon		    (tp->t_cpu == cpu_inmotion) ||
1234455e370John Levon		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1235455e370John Levon			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
12366890d02Eric Saxe		} else {
12376890d02Eric Saxe			cp = tp->t_cpu;
12386890d02Eric Saxe		}
12397c478bdstevel@tonic-gate
12407c478bdstevel@tonic-gate		if (tp->t_cpupart == cp->cpu_part) {
12417c478bdstevel@tonic-gate			int	qlen;
12427c478bdstevel@tonic-gate
12437c478bdstevel@tonic-gate			/*
1244fb2f18fesaxe			 * Perform any CMT load balancing
12457c478bdstevel@tonic-gate			 */
1246fb2f18fesaxe			cp = cmt_balance(tp, cp);
12477c478bdstevel@tonic-gate
12487c478bdstevel@tonic-gate			/*
12497c478bdstevel@tonic-gate			 * Balance across the run queues
12507c478bdstevel@tonic-gate			 */
12517c478bdstevel@tonic-gate			qlen = RUNQ_LEN(cp, tpri);
12527c478bdstevel@tonic-gate			if (tpri >= RUNQ_MATCH_PRI &&
12537c478bdstevel@tonic-gate			    !(tp->t_schedflag & TS_RUNQMATCH))
12547c478bdstevel@tonic-gate				qlen -= RUNQ_MAX_DIFF;
12557c478bdstevel@tonic-gate			if (qlen > 0) {
1256685679fakolb				cpu_t *newcp;
12577c478bdstevel@tonic-gate
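				/*
				 * Pick a balancing candidate: for a thread
				 * homed to the root lgroup, the next CPU in
				 * the partition; otherwise the next CPU in
				 * this leaf lpl, falling back to the next
				 * partition CPU when cp is alone in the lpl.
				 */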
1258685679fakolb				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259685679fakolb					newcp = cp->cpu_next_part;
1260685679fakolb				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1261685679fakolb					newcp = cp->cpu_next_part;
1262685679fakolb				}
1263685679fakolb
1264c3377eeJohn Levon				if (smt_should_run(tp, newcp) &&
1265455e370John Levon				    RUNQ_LEN(newcp, tpri) < qlen) {
1266685679fakolb					DTRACE_PROBE3(runq__balance,
1267685679fakolb					    kthread_t *, tp,
1268685679fakolb					    cpu_t *, cp, cpu_t *, newcp);
1269685679fakolb					cp = newcp;
12707c478bdstevel@tonic-gate				}
12717c478bdstevel@tonic-gate			}
12727c478bdstevel@tonic-gate		} else {
12737c478bdstevel@tonic-gate			/*
12747c478bdstevel@tonic-gate			 * Migrate to a cpu in the new partition.
12757c478bdstevel@tonic-gate			 */
1276455e370John Levon			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1277455e370John Levon			    tp->t_pri);
12787c478bdstevel@tonic-gate		}
12797c478bdstevel@tonic-gate		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
12807c478bdstevel@tonic-gate	} else {
12817c478bdstevel@tonic-gate		/*
12827c478bdstevel@tonic-gate		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
12837c478bdstevel@tonic-gate		 * a short time until weak binding that existed when the
12847c478bdstevel@tonic-gate		 * strong binding was established has dropped) so we must
12857c478bdstevel@tonic-gate		 * favour weak binding over strong.
12867c478bdstevel@tonic-gate		 */
12877c478bdstevel@tonic-gate		cp = tp->t_weakbound_cpu ?
12887c478bdstevel@tonic-gate		    tp->t_weakbound_cpu : tp->t_bound_cpu;
12897c478bdstevel@tonic-gate	}
1290f2bd462johansen	/*
1291f2bd462johansen	 * A thread that is ONPROC may be temporarily placed on the run queue
1292f2bd462johansen	 * but then chosen to run again by disp.  If the thread we're placing on
1293f2bd462johansen	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1294f2bd462johansen	 * replacement process is actually scheduled in swtch().  In this
1295f2bd462johansen	 * situation, curthread is the only thread that could be in the ONPROC
1296f2bd462johansen	 * state.
1297f2bd462johansen	 */
12986890d02Eric Saxe	if ((!self) && (tp->t_waitrq == 0)) {
1299f2bd462johansen		hrtime_t curtime;
1300f2bd462johansen
1301f2bd462johansen		curtime = gethrtime_unscaled();
1302f2bd462johansen		(void) cpu_update_pct(tp, curtime);
1303f2bd462johansen		tp->t_waitrq = curtime;
1304f2bd462johansen	} else {
1305f2bd462johansen		(void) cpu_update_pct(tp, gethrtime_unscaled());
1306f2bd462johansen	}
1307f2bd462johansen
13087c478bdstevel@tonic-gate	dp = cp->cpu_disp;
13097c478bdstevel@tonic-gate	disp_lock_enter_high(&dp->disp_lock);
13107c478bdstevel@tonic-gate
13117c478bdstevel@tonic-gate	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
13127c478bdstevel@tonic-gate	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1313d129bdeesaxe	    tpri, cp, tp);
13147c478bdstevel@tonic-gate
13157c478bdstevel@tonic-gate#ifndef NPROBE
13167c478bdstevel@tonic-gate	/* Kernel probe */
13177c478bdstevel@tonic-gate	if (tnf_tracing_active)
13187c478bdstevel@tonic-gate		tnf_thread_queue(tp, cp, tpri);
13197c478bdstevel@tonic-gate#endif /* NPROBE */
13207c478bdstevel@tonic-gate
13217c478bdstevel@tonic-gate	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
13227c478bdstevel@tonic-gate
13237c478bdstevel@tonic-gate	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
13247c478bdstevel@tonic-gate	tp->t_disp_queue = dp;
13257c478bdstevel@tonic-gate	tp->t_link = NULL;
13267c478bdstevel@tonic-gate
13277c478bdstevel@tonic-gate	dq = &dp->disp_q[tpri];
13287c478bdstevel@tonic-gate	dp->disp_nrunnable++;
1329685679fakolb	if (!bound)
1330685679fakolb		dp->disp_steal = 0;
13317c478bdstevel@tonic-gate	membar_enter();
13327c478bdstevel@tonic-gate
13337c478bdstevel@tonic-gate	if (dq->dq_sruncnt++ != 0) {
13347c478bdstevel@tonic-gate		ASSERT(dq->dq_first != NULL);
13357c478bdstevel@tonic-gate		dq->dq_last->t_link = tp;
13367c478bdstevel@tonic-gate		dq->dq_last = tp;
13377c478bdstevel@tonic-gate	} else {
13387c478bdstevel@tonic-gate		ASSERT(dq->dq_first == NULL);
13397c478bdstevel@tonic-gate		ASSERT(dq->dq_last == NULL);
13407c478bdstevel@tonic-gate		dq->dq_first = dq->dq_last = tp;
13417c478bdstevel@tonic-gate		BT_SET(dp->disp_qactmap, tpri);
13427c478bdstevel@tonic-gate		if (tpri > dp->disp_maxrunpri) {
13437c478bdstevel@tonic-gate			dp->disp_maxrunpri = tpri;
13447c478bdstevel@tonic-gate			membar_enter();
13457c478bdstevel@tonic-gate			cpu_resched(cp, tpri);
13467c478bdstevel@tonic-gate		}
13477c478bdstevel@tonic-gate	}
13487c478bdstevel@tonic-gate
13497c478bdstevel@tonic-gate	if (!bound && tpri > dp->disp_max_unbound_pri) {
13506890d02Eric Saxe		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
13517c478bdstevel@tonic-gate			/*
13527c478bdstevel@tonic-gate			 * If there are no other unbound threads on the
13537c478bdstevel@tonic-gate			 * run queue, don't allow other CPUs to steal
13547c478bdstevel@tonic-gate			 * this thread while we are in the middle of a
13557c478bdstevel@tonic-gate			 * context switch. We may just switch to it
13567c478bdstevel@tonic-gate			 * again right away. CPU_DISP_DONTSTEAL is cleared
13577c478bdstevel@tonic-gate			 * in swtch and swtch_to.
13587c478bdstevel@tonic-gate			 */
13597c478bdstevel@tonic-gate			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
13607c478bdstevel@tonic-gate		}
13617c478bdstevel@tonic-gate		dp->disp_max_unbound_pri = tpri;
13627c478bdstevel@tonic-gate	}
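	/*
	 * Let the pluggable enqueue hook react to the newly queued thread
	 * (for example, by waking cp if it is halted in the idle loop).
	 */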
13637c478bdstevel@tonic-gate	(*disp_enq_thread)(cp, bound);
13647c478bdstevel@tonic-gate}
13657c478bdstevel@tonic-gate
13667c478bdstevel@tonic-gate/*
13677c478bdstevel@tonic-gate * Put the specified thread on the front of the dispatcher
13687c478bdstevel@tonic-gate * queue corresponding to its current priority.
13697c478bdstevel@tonic-gate *
13707c478bdstevel@tonic-gate * Called with the thread in transition, onproc or stopped state
13717c478bdstevel@tonic-gate * and locked (transition implies locked) and at high spl.
13727c478bdstevel@tonic-gate * Returns with the thread in TS_RUN state and still locked.
13737c478bdstevel@tonic-gate */
13747c478bdstevel@tonic-gatevoid
13757c478bdstevel@tonic-gatesetfrontdq(kthread_t *tp)
13767c478bdstevel@tonic-gate{
13777c478bdstevel@tonic-gate	disp_t		*dp;
13787c478bdstevel@tonic-gate	dispq_t		*dq;
13797c478bdstevel@tonic-gate	cpu_t		*cp;
13807c478bdstevel@tonic-gate	pri_t		tpri;
13817c478bdstevel@tonic-gate	int		bound;
13827c478bdstevel@tonic-gate
13837c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
13847c478bdstevel@tonic-gate	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
13857c478bdstevel@tonic-gate	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
13867c478bdstevel@tonic-gate
13877c478bdstevel@tonic-gate	/*
13887c478bdstevel@tonic-gate	 * If the thread is "swapped" or on the swap queue, don't
13897c478bdstevel@tonic-gate	 * queue it, but wake sched.
13907c478bdstevel@tonic-gate	 */
13917c478bdstevel@tonic-gate	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
13927c478bdstevel@tonic-gate		disp_swapped_setrun(tp);
13937c478bdstevel@tonic-gate		return;
13947c478bdstevel@tonic-gate	}
13957c478bdstevel@tonic-gate
1396abd4158gd	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1397abd4158gd		bound = 1;
1398abd4158gd	else
1399abd4158gd		bound = 0;
1400abd4158gd
14017c478bdstevel@tonic-gate	tpri = DISP_PRIO(tp);
14027c478bdstevel@tonic-gate	if (ncpus == 1)
14037c478bdstevel@tonic-gate		cp = tp->t_cpu;
1404abd4158gd	else if (!bound) {
14057c478bdstevel@tonic-gate		if (tpri >= kpqpri) {
14067c478bdstevel@tonic-gate			setkpdq(tp, SETKP_FRONT);
14077c478bdstevel@tonic-gate			return;
14087c478bdstevel@tonic-gate		}
14097c478bdstevel@tonic-gate		cp = tp->t_cpu;
14107c478bdstevel@tonic-gate		if (tp->t_cpupart == cp->cpu_part) {
14117c478bdstevel@tonic-gate			/*
14126890d02Eric Saxe			 * We'll generally let this thread continue to run
14136890d02Eric Saxe			 * where it last ran, but will consider migration if:
1414455e370John Levon			 * - The thread last ran outside its home lgroup.
14156890d02Eric Saxe			 * - The CPU where it last ran is the target of an
14166890d02Eric Saxe			 *   offline request (a thread_nomigrate() on the in
14176890d02Eric Saxe			 *   motion CPU relies on this when forcing a preempt).
14186890d02Eric Saxe			 * - The thread isn't the highest priority thread where
14196890d02Eric Saxe			 *   it last ran, and it is considered not likely to
14206890d02Eric Saxe			 *   have significant cache warmth.
14217c478bdstevel@tonic-gate			 */
1422455e370John Levon			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1423455e370John Levon			    cp == cpu_inmotion ||
1424455e370John Levon			    (tpri < cp->cpu_disp->disp_maxrunpri &&
1425455e370John Levon			    !THREAD_HAS_CACHE_WARMTH(tp))) {
1426455e370John Levon				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
14276890d02Eric Saxe			}
14287c478bdstevel@tonic-gate		} else {
14297c478bdstevel@tonic-gate			/*
14307c478bdstevel@tonic-gate			 * Migrate to a cpu in the new partition.
14317c478bdstevel@tonic-gate			 */
14327c478bdstevel@tonic-gate			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1433455e370John Levon			    tp, tp->t_pri);
14347c478bdstevel@tonic-gate		}
14357c478bdstevel@tonic-gate		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
14367c478bdstevel@tonic-gate	} else {
14377c478bdstevel@tonic-gate		/*
14387c478bdstevel@tonic-gate		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
14397c478bdstevel@tonic-gate		 * a short time until weak binding that existed when the
14407c478bdstevel@tonic-gate		 * strong binding was established has dropped) so we must
14417c478bdstevel@tonic-gate		 * favour weak binding over strong.
14427c478bdstevel@tonic-gate		 */
14437c478bdstevel@tonic-gate		cp = tp->t_weakbound_cpu ?
14447c478bdstevel@tonic-gate		    tp->t_weakbound_cpu : tp->t_bound_cpu;
14457c478bdstevel@tonic-gate	}
1446f2bd462johansen
1447f2bd462johansen	/*
1448f2bd462johansen	 * A thread that is ONPROC may be temporarily placed on the run queue
1449f2bd462johansen	 * but then chosen to run again by disp.  If the thread we're placing on
1450f2bd462johansen	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1451f2bd462johansen	 * replacement process is actually scheduled in swtch().  In this
1452f2bd462johansen	 * situation, curthread is the only thread that could be in the ONPROC
1453f2bd462johansen	 * state.
1454f2bd462johansen	 */
1455f2bd462johansen	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1456f2bd462johansen		hrtime_t curtime;
1457f2bd462johansen
1458f2bd462johansen		curtime = gethrtime_unscaled();
1459f2bd462johansen		(void) cpu_update_pct(tp, curtime);
1460f2bd462johansen		tp->t_waitrq = curtime;
1461f2bd462johansen	} else {
1462f2bd462johansen		(void) cpu_update_pct(tp, gethrtime_unscaled());
1463f2bd462johansen	}
1464f2bd462johansen
14657c478bdstevel@tonic-gate	dp = cp->cpu_disp;
14667c478bdstevel@tonic-gate	disp_lock_enter_high(&dp->disp_lock);
14677c478bdstevel@tonic-gate
14687c478bdstevel@tonic-gate	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
14697c478bdstevel@tonic-gate	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
14707c478bdstevel@tonic-gate
14717c478bdstevel@tonic-gate#ifndef NPROBE
14727c478bdstevel@tonic-gate	/* Kernel probe */
14737c478bdstevel@tonic-gate	if (tnf_tracing_active)
14747c478bdstevel@tonic-gate		tnf_thread_queue(tp, cp, tpri);
14757c478bdstevel@tonic-gate#endif /* NPROBE */
14767c478bdstevel@tonic-gate
14777c478bdstevel@tonic-gate	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
14787c478bdstevel@tonic-gate
14797c478bdstevel@tonic-gate	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
14807c478bdstevel@tonic-gate	tp->t_disp_queue = dp;
14817c478bdstevel@tonic-gate
14827c478bdstevel@tonic-gate	dq = &dp->disp_q[tpri];
14837c478bdstevel@tonic-gate	dp->disp_nrunnable++;
1484685679fakolb	if (!bound)
1485685679fakolb		dp->disp_steal = 0;
14867c478bdstevel@tonic-gate	membar_enter();
14877c478bdstevel@tonic-gate
14887c478bdstevel@tonic-gate	if (dq->dq_sruncnt++ != 0) {
14897c478bdstevel@tonic-gate		ASSERT(dq->dq_last != NULL);
14907c478bdstevel@tonic-gate		tp->t_link = dq->dq_first;
14917c478bdstevel@tonic-gate		dq->dq_first = tp;
14927c478bdstevel@tonic-gate	} else {
14937c478bdstevel@tonic-gate		ASSERT(dq->dq_last == NULL);
14947c478bdstevel@tonic-gate		ASSERT(dq->dq_first == NULL);
14957c478bdstevel@tonic-gate		tp->t_link = NULL;
14967c478bdstevel@tonic-gate		dq->dq_first = dq->dq_last = tp;
14977c478bdstevel@tonic-gate		BT_SET(dp->disp_qactmap, tpri);
14987c478bdstevel@tonic-gate		if (tpri > dp->disp_maxrunpri) {
14997c478bdstevel@tonic-gate			dp->disp_maxrunpri = tpri;
15007c478bdstevel@tonic-gate			membar_enter();
15017c478bdstevel@tonic-gate			cpu_resched(cp, tpri);
15027c478bdstevel@tonic-gate		}
15037c478bdstevel@tonic-gate	}
15047c478bdstevel@tonic-gate
15057c478bdstevel@tonic-gate	if (!bound && tpri > dp->disp_max_unbound_pri) {
15067c478bdstevel@tonic-gate		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
15077c478bdstevel@tonic-gate		    cp == CPU) {
15087c478bdstevel@tonic-gate			/*
15097c478bdstevel@tonic-gate			 * If there are no other unbound threads on the
15107c478bdstevel@tonic-gate			 * run queue, don't allow other CPUs to steal
15117c478bdstevel@tonic-gate			 * this thread while we are in the middle of a
15127c478bdstevel@tonic-gate			 * context switch. We may just switch to it
15137c478bdstevel@tonic-gate			 * again right away. CPU_DISP_DONTSTEAL is cleared
15147c478bdstevel@tonic-gate			 * in swtch and swtch_to.
15157c478bdstevel@tonic-gate			 */
15167c478bdstevel@tonic-gate			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
15177c478bdstevel@tonic-gate		}
15187c478bdstevel@tonic-gate		dp->disp_max_unbound_pri = tpri;
15197c478bdstevel@tonic-gate	}
15207c478bdstevel@tonic-gate	(*disp_enq_thread)(cp, bound);
15217c478bdstevel@tonic-gate}
15227c478bdstevel@tonic-gate
15237c478bdstevel@tonic-gate/*
15247c478bdstevel@tonic-gate * Put a high-priority unbound thread on the kp queue
15257c478bdstevel@tonic-gate */
15267c478bdstevel@tonic-gatestatic void
15277c478bdstevel@tonic-gatesetkpdq(kthread_t *tp, int borf)
15287c478bdstevel@tonic-gate{
15297c478bdstevel@tonic-gate	dispq_t	*dq;
15307c478bdstevel@tonic-gate	disp_t	*dp;
15317c478bdstevel@tonic-gate	cpu_t	*cp;
15327c478bdstevel@tonic-gate	pri_t	tpri;
15337c478bdstevel@tonic-gate
15347c478bdstevel@tonic-gate	tpri = DISP_PRIO(tp);
15357c478bdstevel@tonic-gate
15367c478bdstevel@tonic-gate	dp = &tp->t_cpupart->cp_kp_queue;
15377c478bdstevel@tonic-gate	disp_lock_enter_high(&dp->disp_lock);
15387c478bdstevel@tonic-gate
15397c478bdstevel@tonic-gate	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
15407c478bdstevel@tonic-gate
15417c478bdstevel@tonic-gate	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
15427c478bdstevel@tonic-gate	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
15437c478bdstevel@tonic-gate	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
15447c478bdstevel@tonic-gate	tp->t_disp_queue = dp;
15457c478bdstevel@tonic-gate	dp->disp_nrunnable++;
15467c478bdstevel@tonic-gate	dq = &dp->disp_q[tpri];
15477c478bdstevel@tonic-gate
15487c478bdstevel@tonic-gate	if (dq->dq_sruncnt++ != 0) {
15497c478bdstevel@tonic-gate		if (borf == SETKP_BACK) {
15507c478bdstevel@tonic-gate			ASSERT(dq->dq_first != NULL);
15517c478bdstevel@tonic-gate			tp->t_link = NULL;
15527c478bdstevel@tonic-gate			dq->dq_last->t_link = tp;
15537c478bdstevel@tonic-gate			dq->dq_last = tp;
15547c478bdstevel@tonic-gate		} else {
15557c478bdstevel@tonic-gate			ASSERT(dq->dq_last != NULL);
15567c478bdstevel@tonic-gate			tp->t_link = dq->dq_first;
15577c478bdstevel@tonic-gate			dq->dq_first = tp;
15587c478bdstevel@tonic-gate		}
15597c478bdstevel@tonic-gate	} else {
15607c478bdstevel@tonic-gate		if (borf == SETKP_BACK) {
15617c478bdstevel@tonic-gate			ASSERT(dq->dq_first == NULL);
15627c478bdstevel@tonic-gate			ASSERT(dq->dq_last == NULL);
15637c478bdstevel@tonic-gate			dq->dq_first = dq->dq_last = tp;
15647c478bdstevel@tonic-gate		} else {
15657c478bdstevel@tonic-gate			ASSERT(dq->dq_last == NULL);
15667c478bdstevel@tonic-gate			ASSERT(dq->dq_first == NULL);
15677c478bdstevel@tonic-gate			tp->t_link = NULL;
15687c478bdstevel@tonic-gate			dq->dq_first = dq->dq_last = tp;
15697c478bdstevel@tonic-gate		}
15707c478bdstevel@tonic-gate		BT_SET(dp->disp_qactmap, tpri);
15717c478bdstevel@tonic-gate		if (tpri > dp->disp_max_unbound_pri)
15727c478bdstevel@tonic-gate			dp->disp_max_unbound_pri = tpri;
15737c478bdstevel@tonic-gate		if (tpri > dp->disp_maxrunpri) {
15747c478bdstevel@tonic-gate			dp->disp_maxrunpri = tpri;
15757c478bdstevel@tonic-gate			membar_enter();
15767c478bdstevel@tonic-gate		}
15777c478bdstevel@tonic-gate	}
15787c478bdstevel@tonic-gate
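	/*
	 * The thread now sits on the partition-wide kp queue; find a
	 * low-priority CPU in the thread's partition and prod it to come
	 * service the queue.
	 */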
15797c478bdstevel@tonic-gate	cp = tp->t_cpu;
15807c478bdstevel@tonic-gate	if (tp->t_cpupart != cp->cpu_part) {
15817c478bdstevel@tonic-gate		/* migrate to a cpu in the new partition */
15827c478bdstevel@tonic-gate		cp = tp->t_cpupart->cp_cpulist;
15837c478bdstevel@tonic-gate	}
1584455e370John Levon	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
15857c478bdstevel@tonic-gate	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
15867c478bdstevel@tonic-gate	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
15877c478bdstevel@tonic-gate
15887c478bdstevel@tonic-gate#ifndef NPROBE
15897c478bdstevel@tonic-gate	/* Kernel probe */
15907c478bdstevel@tonic-gate	if (tnf_tracing_active)
15917c478bdstevel@tonic-gate		tnf_thread_queue(tp, cp, tpri);
15927c478bdstevel@tonic-gate#endif /* NPROBE */
15937c478bdstevel@tonic-gate
15947c478bdstevel@tonic-gate	if (cp->cpu_chosen_level < tpri)
15957c478bdstevel@tonic-gate		cp->cpu_chosen_level = tpri;
15967c478bdstevel@tonic-gate	cpu_resched(cp, tpri);
15977c478bdstevel@tonic-gate	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
15987c478bdstevel@tonic-gate	(*disp_enq_thread)(cp, 0);
15997c478bdstevel@tonic-gate}
16007c478bdstevel@tonic-gate
16017c478bdstevel@tonic-gate/*
16027c478bdstevel@tonic-gate * Remove a thread from the dispatcher queue if it is on it.
16037c478bdstevel@tonic-gate * It is not an error if it is not found, but we return whether
16047c478bdstevel@tonic-gate * or not it was found in case the caller wants to check.
16057c478bdstevel@tonic-gate */
16067c478bdstevel@tonic-gateint
16077c478bdstevel@tonic-gatedispdeq(kthread_t *tp)
16087c478bdstevel@tonic-gate{
16097c478bdstevel@tonic-gate	disp_t		*dp;
16107c478bdstevel@tonic-gate	dispq_t		*dq;
16117c478bdstevel@tonic-gate	kthread_t	*rp;
16127c478bdstevel@tonic-gate	kthread_t	*trp;
16137c478bdstevel@tonic-gate	kthread_t	**ptp;
16147c478bdstevel@tonic-gate	int		tpri;
16157c478bdstevel@tonic-gate
16167c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
16177c478bdstevel@tonic-gate
16187c478bdstevel@tonic-gate	if (tp->t_state != TS_RUN)
16197c478bdstevel@tonic-gate		return (0);
16207c478bdstevel@tonic-gate
16217c478bdstevel@tonic-gate	/*
16227c478bdstevel@tonic-gate	 * The thread is "swapped" or is on the swap queue and
16237c478bdstevel@tonic-gate	 * hence no longer on the run queue, so return true.
16247c478bdstevel@tonic-gate	 */
16257c478bdstevel@tonic-gate	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
16267c478bdstevel@tonic-gate		return (1);
16277c478bdstevel@tonic-gate
16287c478bdstevel@tonic-gate	tpri = DISP_PRIO(tp);
16297c478bdstevel@tonic-gate	dp = tp->t_disp_queue;
16307c478bdstevel@tonic-gate	ASSERT(tpri < dp->disp_npri);
16317c478bdstevel@tonic-gate	dq = &dp->disp_q[tpri];
16327c478bdstevel@tonic-gate	ptp = &dq->dq_first;
16337c478bdstevel@tonic-gate	rp = *ptp;
16347c478bdstevel@tonic-gate	trp = NULL;
16357c478bdstevel@tonic-gate
16367c478bdstevel@tonic-gate	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
16377c478bdstevel@tonic-gate
16387c478bdstevel@tonic-gate	/*
16397c478bdstevel@tonic-gate	 * Search for thread in queue.
16407c478bdstevel@tonic-gate	 * Double links would simplify this at the expense of disp/setrun.
16417c478bdstevel@tonic-gate	 */
16427c478bdstevel@tonic-gate	while (rp != tp && rp != NULL) {
16437c478bdstevel@tonic-gate		trp = rp;
16447c478bdstevel@tonic-gate		ptp = &trp->t_link;
16457c478bdstevel@tonic-gate		rp = trp->t_link;
16467c478bdstevel@tonic-gate	}
16477c478bdstevel@tonic-gate
16487c478bdstevel@tonic-gate	if (rp == NULL) {
16497c478bdstevel@tonic-gate		panic("dispdeq: thread not on queue");
16507c478bdstevel@tonic-gate	}
16517c478bdstevel@tonic-gate
16527c478bdstevel@tonic-gate	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
16537c478bdstevel@tonic-gate
16547c478bdstevel@tonic-gate	/*
16557c478bdstevel@tonic-gate	 * Found it so remove it from queue.
16567c478bdstevel@tonic-gate	 */
16577c478bdstevel@tonic-gate	if ((*ptp = rp->t_link) == NULL)
16587c478bdstevel@tonic-gate		dq->dq_last = trp;
16597c478bdstevel@tonic-gate
16607c478bdstevel@tonic-gate	dp->disp_nrunnable--;
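	/*
	 * If that was the last loaded, runnable thread at tpri, clear its bit
	 * in the active-queue bitmap and, if needed, recompute the queue's
	 * disp_maxrunpri and disp_max_unbound_pri from the bitmap.
	 */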
16617c478bdstevel@tonic-gate	if (--dq->dq_sruncnt == 0) {
16627c478bdstevel@tonic-gate		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
16637c478bdstevel@tonic-gate		if (dp->disp_nrunnable == 0) {
16647c478bdstevel@tonic-gate			dp->disp_max_unbound_pri = -1;
16657c478bdstevel@tonic-gate			dp->disp_maxrunpri = -1;
16667c478bdstevel@tonic-gate		} else if (tpri == dp->disp_maxrunpri) {
16677c478bdstevel@tonic-gate			int ipri;
16687c478bdstevel@tonic-gate
16697c478bdstevel@tonic-gate			ipri = bt_gethighbit(dp->disp_qactmap,
16707c478bdstevel@tonic-gate			    dp->disp_maxrunpri >> BT_ULSHIFT);
16717c478bdstevel@tonic-gate			if (ipri < dp->disp_max_unbound_pri)
16727c478bdstevel@tonic-gate				dp->disp_max_unbound_pri = ipri;
16737c478bdstevel@tonic-gate			dp->disp_maxrunpri = ipri;
16747c478bdstevel@tonic-gate		}
16757c478bdstevel@tonic-gate	}
16767c478bdstevel@tonic-gate	tp->t_link = NULL;
16777c478bdstevel@tonic-gate	THREAD_TRANSITION(tp);		/* put in intermediate state */
16787c478bdstevel@tonic-gate	return (1);
16797c478bdstevel@tonic-gate}
16807c478bdstevel@tonic-gate
16817c478bdstevel@tonic-gate
16827c478bdstevel@tonic-gate/*
16837c478bdstevel@tonic-gate * dq_sruninc and dq_srundec are public functions for
16847c478bdstevel@tonic-gate * incrementing/decrementing the sruncnts when a thread on
16857c478bdstevel@tonic-gate * a dispatcher queue is made schedulable/unschedulable by
16867c478bdstevel@tonic-gate * resetting the TS_LOAD flag.
16877c478bdstevel@tonic-gate *
16887c478bdstevel@tonic-gate * The caller MUST have the thread lock and therefore the dispatcher
16897c478bdstevel@tonic-gate * queue lock so that the operation which changes
16907c478bdstevel@tonic-gate * the flag, the operation that checks the status of the thread to
16917c478bdstevel@tonic-gate * determine if it's on a disp queue AND the call to this function
16927c478bdstevel@tonic-gate * are one atomic operation with respect to interrupts.
16937c478bdstevel@tonic-gate */
16947c478bdstevel@tonic-gate
16957c478bdstevel@tonic-gate/*
16967c478bdstevel@tonic-gate * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
16977c478bdstevel@tonic-gate */
16987c478bdstevel@tonic-gatevoid
16997c478bdstevel@tonic-gatedq_sruninc(kthread_t *t)
17007c478bdstevel@tonic-gate{
17017c478bdstevel@tonic-gate	ASSERT(t->t_state == TS_RUN);
17027c478bdstevel@tonic-gate	ASSERT(t->t_schedflag & TS_LOAD);
17037c478bdstevel@tonic-gate
17047c478bdstevel@tonic-gate	THREAD_TRANSITION(t);
17057c478bdstevel@tonic-gate	setfrontdq(t);
17067c478bdstevel@tonic-gate}
17077c478bdstevel@tonic-gate
17087c478bdstevel@tonic-gate/*
17097c478bdstevel@tonic-gate * See comment on calling conventions above.
17107c478bdstevel@tonic-gate * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
17117c478bdstevel@tonic-gate */
17127c478bdstevel@tonic-gatevoid
17137c478bdstevel@tonic-gatedq_srundec(kthread_t *t)
17147c478bdstevel@tonic-gate{
17157c478bdstevel@tonic-gate	ASSERT(t->t_schedflag & TS_LOAD);
17167c478bdstevel@tonic-gate
17177c478bdstevel@tonic-gate	(void) dispdeq(t);
17187c478bdstevel@tonic-gate	disp_swapped_enq(t);
17197c478bdstevel@tonic-gate}
17207c478bdstevel@tonic-gate
17217c478bdstevel@tonic-gate/*
17227c478bdstevel@tonic-gate * Change the dispatcher lock of the thread to the "swapped_lock"
17237c478bdstevel@tonic-gate * and return with thread lock still held.
17247c478bdstevel@tonic-gate *
17257c478bdstevel@tonic-gate * Called with thread_lock held, in transition state, and at high spl.
17267c478bdstevel@tonic-gate */
17277c478bdstevel@tonic-gatevoid
17287c478bdstevel@tonic-gatedisp_swapped_enq(kthread_t *tp)
17297c478bdstevel@tonic-gate{
17307c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
17317c478bdstevel@tonic-gate	ASSERT(tp->t_schedflag & TS_LOAD);
17327c478bdstevel@tonic-gate
17337c478bdstevel@tonic-gate	switch (tp->t_state) {
17347c478bdstevel@tonic-gate	case TS_RUN:
17357c478bdstevel@tonic-gate		disp_lock_enter_high(&swapped_lock);
17367c478bdstevel@tonic-gate		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
17377c478bdstevel@tonic-gate		break;
17387c478bdstevel@tonic-gate	case TS_ONPROC:
17397c478bdstevel@tonic-gate		disp_lock_enter_high(&swapped_lock);
17407c478bdstevel@tonic-gate		THREAD_TRANSITION(tp);
17417c478bdstevel@tonic-gate		wake_sched_sec = 1;		/* tell clock to wake sched */
17427c478bdstevel@tonic-gate		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
17437c478bdstevel@tonic-gate		break;
17447c478bdstevel@tonic-gate	default:
17457c478bdstevel@tonic-gate		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
17467c478bdstevel@tonic-gate	}
17477c478bdstevel@tonic-gate}
17487c478bdstevel@tonic-gate
17497c478bdstevel@tonic-gate/*
17507c478bdstevel@tonic-gate * This routine is called by setbackdq/setfrontdq if the thread is
17517c478bdstevel@tonic-gate * either not loaded, or loaded but sitting on the swap queue.
17527c478bdstevel@tonic-gate *
17537c478bdstevel@tonic-gate * Thread state TS_SLEEP implies that a swapped thread
17547c478bdstevel@tonic-gate * has been woken up and needs to be swapped in by the swapper.
17557c478bdstevel@tonic-gate *
17567c478bdstevel@tonic-gate * Thread state TS_RUN implies that the priority of a swapped
17577c478bdstevel@tonic-gate * thread is being increased by its scheduling class (e.g. ts_update).
17587c478bdstevel@tonic-gate */
17597c478bdstevel@tonic-gatestatic void
17607c478bdstevel@tonic-gatedisp_swapped_setrun(kthread_t *tp)
17617c478bdstevel@tonic-gate{
17627c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
17637c478bdstevel@tonic-gate	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
17647c478bdstevel@tonic-gate
17657c478bdstevel@tonic-gate	switch (tp->t_state) {
17667c478bdstevel@tonic-gate	case TS_SLEEP:
17677c478bdstevel@tonic-gate		disp_lock_enter_high(&swapped_lock);
17687c478bdstevel@tonic-gate		/*
17697c478bdstevel@tonic-gate		 * Wakeup sched immediately (i.e., next tick) if the
17707c478bdstevel@tonic-gate		 * thread priority is above maxclsyspri.
17717c478bdstevel@tonic-gate		 */
17727c478bdstevel@tonic-gate		if (DISP_PRIO(tp) > maxclsyspri)
17737c478bdstevel@tonic-gate			wake_sched = 1;
17747c478bdstevel@tonic-gate		else
17757c478bdstevel@tonic-gate			wake_sched_sec = 1;
17767c478bdstevel@tonic-gate		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
17777c478bdstevel@tonic-gate		break;
17787c478bdstevel@tonic-gate	case TS_RUN:				/* called from ts_update */
17797c478bdstevel@tonic-gate		break;
17807c478bdstevel@tonic-gate	default:
17818793b36Nick Todd		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
17827c478bdstevel@tonic-gate	}
17837c478bdstevel@tonic-gate}
17847c478bdstevel@tonic-gate
17857c478bdstevel@tonic-gate/*
17867c478bdstevel@tonic-gate *	Make a thread give up its processor.  Find the processor on
17877c478bdstevel@tonic-gate *	which this thread is executing, and have that processor
17887c478bdstevel@tonic-gate *	preempt.
178935a5a35Jonathan Adams *
179035a5a35Jonathan Adams *	We allow System Duty Cycle (SDC) threads to be preempted even if
179135a5a35Jonathan Adams *	they are running at kernel priorities.  To implement this, we always
179235a5a35Jonathan Adams *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
179335a5a35Jonathan Adams *	calls cpu_surrender() very often, we only preempt if there is anyone
179435a5a35Jonathan Adams *	competing with us.
17957c478bdstevel@tonic-gate */
17967c478bdstevel@tonic-gatevoid
17977c478bdstevel@tonic-gatecpu_surrender(kthread_t *tp)
17987c478bdstevel@tonic-gate{
17997c478bdstevel@tonic-gate	cpu_t	*cpup;
18007c478bdstevel@tonic-gate	int	max_pri;
18017c478bdstevel@tonic-gate	int	max_run_pri;
18027c478bdstevel@tonic-gate	klwp_t	*lwp;
18037c478bdstevel@tonic-gate
18047c478bdstevel@tonic-gate	ASSERT(THREAD_LOCK_HELD(tp));
18057c478bdstevel@tonic-gate
18067c478bdstevel@tonic-gate	if (tp->t_state != TS_ONPROC)
18077c478bdstevel@tonic-gate		return;
18087c478bdstevel@tonic-gate	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
18097c478bdstevel@tonic-gate	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
18107c478bdstevel@tonic-gate	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
18117c478bdstevel@tonic-gate	if (max_pri < max_run_pri)
18127c478bdstevel@tonic-gate		max_pri = max_run_pri;
18137c478bdstevel@tonic-gate
181435a5a35Jonathan Adams	if (tp->t_cid == sysdccid) {
181535a5a35Jonathan Adams		uint_t t_pri = DISP_PRIO(tp);
181635a5a35Jonathan Adams		if (t_pri > max_pri)
181735a5a35Jonathan Adams			return;		/* we are not competing w/ anyone */
181835a5a35Jonathan Adams		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
181935a5a35Jonathan Adams	} else {
182035a5a35Jonathan Adams		cpup->cpu_runrun = 1;
182135a5a35Jonathan Adams		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
182235a5a35Jonathan Adams			cpup->cpu_kprunrun = 1;
182335a5a35Jonathan Adams		}
18247c478bdstevel@tonic-gate	}
18257c478bdstevel@tonic-gate
18267c478bdstevel@tonic-gate	/*
18277c478bdstevel@tonic-gate	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
18287c478bdstevel@tonic-gate	 */
18297c478bdstevel@tonic-gate	membar_enter();
18307c478bdstevel@tonic-gate
18317c478bdstevel@tonic-gate	DTRACE_SCHED1(surrender, kthread_t *, tp);
18327c478bdstevel@tonic-gate
18337c478bdstevel@tonic-gate	/*
18347c478bdstevel@tonic-gate	 * Make the target thread take an excursion through trap()
18357c478bdstevel@tonic-gate	 * to do preempt() (unless we're already in trap or post_syscall,
18367c478bdstevel@tonic-gate	 * calling cpu_surrender via CL_TRAPRET).
18377c478bdstevel@tonic-gate	 */
18387c478bdstevel@tonic-gate	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
18397c478bdstevel@tonic-gate	    lwp->lwp_state != LWP_USER) {
18407c478bdstevel@tonic-gate		aston(tp);
18417c478bdstevel@tonic-gate		if (cpup != CPU)
18427c478bdstevel@tonic-gate			poke_cpu(cpup->cpu_id);
18437c478bdstevel@tonic-gate	}
18447c478bdstevel@tonic-gate	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
18457c478bdstevel@tonic-gate	    "cpu_surrender:tid %p cpu %p", tp, cpup);
18467c478bdstevel@tonic-gate}
18477c478bdstevel@tonic-gate
18487c478bdstevel@tonic-gate/*
18497c478bdstevel@tonic-gate * Commit to and ratify a scheduling decision
18507c478bdstevel@tonic-gate */
18517c478bdstevel@tonic-gate/*ARGSUSED*/
18527c478bdstevel@tonic-gatestatic kthread_t *
18537c478bdstevel@tonic-gatedisp_ratify(kthread_t *tp, disp_t *kpq)
18547c478bdstevel@tonic-gate{
18557c478bdstevel@tonic-gate	pri_t	tpri, maxpri;
18567c478bdstevel@tonic-gate	pri_t	maxkpri;
18577c478bdstevel@tonic-gate	cpu_t	*cpup;
18587c478bdstevel@tonic-gate
18597c478bdstevel@tonic-gate	ASSERT(tp != NULL);
18607c478bdstevel@tonic-gate	/*
18617c478bdstevel@tonic-gate	 * Commit to, then ratify scheduling decision
18627c478bdstevel@tonic-gate	 */
18637c478bdstevel@tonic-gate	cpup = CPU;
18647c478bdstevel@tonic-gate	if (cpup->cpu_runrun != 0)
18657c478bdstevel@tonic-gate		cpup->cpu_runrun = 0;
18667c478bdstevel@tonic-gate	if (cpup->cpu_kprunrun != 0)
18677c478bdstevel@tonic-gate		cpup->cpu_kprunrun = 0;
18687c478bdstevel@tonic-gate	if (cpup->cpu_chosen_level != -1)
18697c478bdstevel@tonic-gate		cpup->cpu_chosen_level = -1;
18707c478bdstevel@tonic-gate	membar_enter();
18717c478bdstevel@tonic-gate	tpri = DISP_PRIO(tp);
18727c478bdstevel@tonic-gate	maxpri = cpup->cpu_disp->disp_maxrunpri;
18737c478bdstevel@tonic-gate	maxkpri = kpq->disp_maxrunpri;
18747c478bdstevel@tonic-gate	if (maxpri < maxkpri)
18757c478bdstevel@tonic-gate		maxpri = maxkpri;
18767c478bdstevel@tonic-gate	if (tpri < maxpri) {
18777c478bdstevel@tonic-gate		/*
18787c478bdstevel@tonic-gate		 * We should have done better; put this one back and
18797c478bdstevel@tonic-gate		 * indicate that the caller should try again.
18807c478bdstevel@tonic-gate		 */
18817c478bdstevel@tonic-gate		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
18827c478bdstevel@tonic-gate		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
18837c478bdstevel@tonic-gate		thread_lock_high(tp);
18847c478bdstevel@tonic-gate		THREAD_TRANSITION(tp);
18857c478bdstevel@tonic-gate		setfrontdq(tp);
18867c478bdstevel@tonic-gate		thread_unlock_nopreempt(tp);
18877c478bdstevel@tonic-gate
18887c478bdstevel@tonic-gate		tp = NULL;
18897c478bdstevel@tonic-gate	}
18907c478bdstevel@tonic-gate	return (tp);
18917c478bdstevel@tonic-gate}
18927c478bdstevel@tonic-gate
18937c478bdstevel@tonic-gate/*
18947c478bdstevel@tonic-gate * See if there is any work on the dispatcher queue for other CPUs.
18957c478bdstevel@tonic-gate * If there is, dequeue the best thread and return.
18967c478bdstevel@tonic-gate */
18977c478bdstevel@tonic-gatestatic kthread_t *
18987c478bdstevel@tonic-gatedisp_getwork(cpu_t *cp)
18997c478bdstevel@tonic-gate{
19007c478bdstevel@tonic-gate	cpu_t		*ocp;		/* other CPU */
19017c478bdstevel@tonic-gate	cpu_t		*ocp_start;
19027c478bdstevel@tonic-gate	cpu_t		*tcp;		/* target local CPU */
19037c478bdstevel@tonic-gate	kthread_t	*tp;
1904685679fakolb	kthread_t	*retval = NULL;
19057c478bdstevel@tonic-gate	pri_t		maxpri;
19067c478bdstevel@tonic-gate	disp_t		*kpq;		/* kp queue for this partition */
19077c478bdstevel@tonic-gate	lpl_t		*lpl, *lpl_leaf;
19086890d02Eric Saxe	int		leafidx, startidx;
1909685679fakolb	hrtime_t	stealtime;
19106890d02Eric Saxe	lgrp_id_t	local_id;
19117c478bdstevel@tonic-gate
19127c478bdstevel@tonic-gate	maxpri = -1;
19137c478bdstevel@tonic-gate	tcp = NULL;
19147c478bdstevel@tonic-gate
19157c478bdstevel@tonic-gate	kpq = &cp->cpu_part->cp_kp_queue;
19167c478bdstevel@tonic-gate	while (kpq->disp_maxrunpri >= 0) {
19177c478bdstevel@tonic-gate		/*
19187c478bdstevel@tonic-gate		 * Try to take a thread from the kp_queue.
19197c478bdstevel@tonic-gate		 */
19207c478bdstevel@tonic-gate		tp = disp_getbest(kpq);
19217c478bdstevel@tonic-gate		if (tp)
19227c478bdstevel@tonic-gate			return (disp_ratify(tp, kpq));
19237c478bdstevel@tonic-gate	}
19247c478bdstevel@tonic-gate
1925ab76139esaxe	kpreempt_disable();		/* protect the cpu_active list */
19267c478bdstevel@tonic-gate
19277c478bdstevel@tonic-gate	/*
19287c478bdstevel@tonic-gate	 * Try to find something to do on another CPU's run queue.
19297c478bdstevel@tonic-gate	 * Loop through all other CPUs looking for the one with the highest
19307c478bdstevel@tonic-gate	 * priority unbound thread.
19317c478bdstevel@tonic-gate	 *
19327c478bdstevel@tonic-gate	 * On NUMA machines, the partition's CPUs are consulted in order of
19337c478bdstevel@tonic-gate	 * distance from the current CPU. This way, the first available
19347c478bdstevel@tonic-gate	 * work found is also the closest, and will suffer the least
19357c478bdstevel@tonic-gate	 * from being migrated.
19367c478bdstevel@tonic-gate	 */
19377c478bdstevel@tonic-gate	lpl = lpl_leaf = cp->cpu_lpl;
19386890d02Eric Saxe	local_id = lpl_leaf->lpl_lgrpid;
19396890d02Eric Saxe	leafidx = startidx = 0;
19407c478bdstevel@tonic-gate
19417c478bdstevel@tonic-gate	/*
19427c478bdstevel@tonic-gate	 * This loop traverses the lpl hierarchy. Higher level lpls represent
19437c478bdstevel@tonic-gate	 * broader levels of locality
19447c478bdstevel@tonic-gate	 */
19457c478bdstevel@tonic-gate	do {
19467c478bdstevel@tonic-gate		/* This loop iterates over the lpl's leaves */
19477c478bdstevel@tonic-gate		do {
19487c478bdstevel@tonic-gate			if (lpl_leaf != cp->cpu_lpl)
19497c478bdstevel@tonic-gate				ocp = lpl_leaf->lpl_cpus;
19507c478bdstevel@tonic-gate			else
19517c478bdstevel@tonic-gate				ocp = cp->cpu_next_lpl;
19527c478bdstevel@tonic-gate
19537c478bdstevel@tonic-gate			/* This loop iterates over the CPUs in the leaf */
19547c478bdstevel@tonic-gate			ocp_start = ocp;
19557c478bdstevel@tonic-gate			do {
19567c478bdstevel@tonic-gate				pri_t pri;
19577c478bdstevel@tonic-gate
19587c478bdstevel@tonic-gate				ASSERT(CPU_ACTIVE(ocp));
19597c478bdstevel@tonic-gate
19607c478bdstevel@tonic-gate				/*
196139bac37esaxe				 * End our stroll around this lpl if:
19627c478bdstevel@tonic-gate				 *
19637c478bdstevel@tonic-gate				 * - Something became runnable on the local
196439bac37esaxe				 *   queue...which also ends our stroll around
196539bac37esaxe				 *   the partition.
19667c478bdstevel@tonic-gate				 *
196739bac37esaxe				 * - We happen across another idle CPU.
196839bac37esaxe				 *   Since it is patrolling the next portion
196939bac37esaxe				 *   of the lpl's list (assuming it's not
19706890d02Eric Saxe				 *   halted, or busy servicing an interrupt),
19716890d02Eric Saxe				 *   move to the next higher level of locality.
19727c478bdstevel@tonic-gate				 */
197339bac37esaxe				if (cp->cpu_disp->disp_nrunnable != 0) {
197439bac37esaxe					kpreempt_enable();
197539bac37esaxe					return (NULL);
197639bac37esaxe				}
19777c478bdstevel@tonic-gate				if (ocp->cpu_dispatch_pri == -1) {
19787c478bdstevel@tonic-gate					if (ocp->cpu_disp_flags &
19796890d02Eric Saxe					    CPU_DISP_HALTED ||
19806890d02Eric Saxe					    ocp->cpu_intr_actv != 0)
19817c478bdstevel@tonic-gate						continue;
198239bac37esaxe					else
19836890d02Eric Saxe						goto next_level;
19847c478bdstevel@tonic-gate				}
19857c478bdstevel@tonic-gate
19867c478bdstevel@tonic-gate				/*
19877c478bdstevel@tonic-gate				 * If there's only one thread and the CPU
19887c478bdstevel@tonic-gate				 * is in the middle of a context switch,
19897c478bdstevel@tonic-gate				 * or it's currently running the idle thread,
19907c478bdstevel@tonic-gate				 * don't steal it.
19917c478bdstevel@tonic-gate				 */
19927c478bdstevel@tonic-gate				if ((ocp->cpu_disp_flags &
1993d129bdeesaxe				    CPU_DISP_DONTSTEAL) &&
19947c478bdstevel@tonic-gate				    ocp->cpu_disp->disp_nrunnable == 1)
19957c478bdstevel@tonic-gate					continue;
19967c478bdstevel@tonic-gate
19977c478bdstevel@tonic-gate				pri = ocp->cpu_disp->disp_max_unbound_pri;
19987c478bdstevel@tonic-gate				if (pri > maxpri) {
1999685679fakolb					/*
2000685679fakolb					 * Don't steal threads that we attempted
2001fb2f18fesaxe					 * to steal recently until they're ready
2002fb2f18fesaxe					 * to be stolen again.
2003685679fakolb					 */
2004685679fakolb					stealtime = ocp->cpu_disp->disp_steal;
2005685679fakolb					if (stealtime == 0 ||
2006685679fakolb					    stealtime - gethrtime() <= 0) {
2007685679fakolb						maxpri = pri;
2008685679fakolb						tcp = ocp;
2009685679fakolb					} else {
2010685679fakolb						/*
2011685679fakolb						 * Don't update tcp, just set
2012685679fakolb						 * the retval to T_DONTSTEAL, so
2013685679fakolb						 * that if no acceptable CPUs
2014685679fakolb						 * are found the return value
2015685679fakolb						 * will be T_DONTSTEAL rather
2016685679fakolb						 * than NULL.
2017685679fakolb						 */
2018685679fakolb						retval = T_DONTSTEAL;
2019685679fakolb					}
20207c478bdstevel@tonic-gate				}
20217c478bdstevel@tonic-gate			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
20227c478bdstevel@tonic-gate
20236890d02Eric Saxe			/*
20246890d02Eric Saxe			 * Iterate to the next leaf lpl in the resource set
20256890d02Eric Saxe			 * at this level of locality. If we hit the end of
20266890d02Eric Saxe			 * the set, wrap back around to the beginning.
20276890d02Eric Saxe			 *
20286890d02Eric Saxe			 * Note: This iteration is NULL-terminated for a reason;
20296890d02Eric Saxe			 * see lpl_topo_bootstrap() in lgrp.c for details.
20306890d02Eric Saxe			 */
20317c478bdstevel@tonic-gate			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
20327c478bdstevel@tonic-gate				leafidx = 0;
20337c478bdstevel@tonic-gate				lpl_leaf = lpl->lpl_rset[leafidx];
20347c478bdstevel@tonic-gate			}
20356890d02Eric Saxe		} while (leafidx != startidx);
20367c478bdstevel@tonic-gate
20376890d02Eric Saxenext_level:
20386890d02Eric Saxe		/*
20396890d02Eric Saxe		 * Expand the search to include farther away CPUs (next
20406890d02Eric Saxe		 * locality level). The closer CPUs that have already been
20416890d02Eric Saxe		 * checked will be checked again. In doing so, idle CPUs
20426890d02Eric Saxe		 * will tend to be more aggressive about stealing from CPUs
20436890d02Eric Saxe		 * that are closer (since the closer CPUs will be considered
20446890d02Eric Saxe		 * more often).
20456890d02Eric Saxe		 * Begin at this level with the CPU's local leaf lpl.
20466890d02Eric Saxe		 */
20476890d02Eric Saxe		if ((lpl = lpl->lpl_parent) != NULL) {
20486890d02Eric Saxe			leafidx = startidx = lpl->lpl_id2rset[local_id];
20496890d02Eric Saxe			lpl_leaf = lpl->lpl_rset[leafidx];
20506890d02Eric Saxe		}
20517c478bdstevel@tonic-gate	} while (!tcp && lpl);
20527c478bdstevel@tonic-gate
2053ab76139esaxe	kpreempt_enable();
20547c478bdstevel@tonic-gate
20557c478bdstevel@tonic-gate	/*
20567c478bdstevel@tonic-gate	 * If another queue looks good, and there is still nothing on
20577c478bdstevel@tonic-gate	 * the local queue, try to transfer one or more threads
20587c478bdstevel@tonic-gate	 * from it to our queue.
20597c478bdstevel@tonic-gate	 */
20607c478bdstevel@tonic-gate	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2061685679fakolb		tp = disp_getbest(tcp->cpu_disp);
2062685679fakolb		if (tp == NULL || tp == T_DONTSTEAL)
2063685679fakolb			return (tp);
2064685679fakolb		return (disp_ratify(tp, kpq));
20657c478bdstevel@tonic-gate	}
2066685679fakolb	return (retval);
20677c478bdstevel@tonic-gate}
20687c478bdstevel@tonic-gate
20697c478bdstevel@tonic-gate
20707c478bdstevel@tonic-gate/*
20717c478bdstevel@tonic-gate * disp_fix_unbound_pri()
20727c478bdstevel@tonic-gate *	Determines the maximum priority of unbound threads on the queue.
20737c478bdstevel@tonic-gate *	The priority is kept for the queue, but is only increased, never
20747c478bdstevel@tonic-gate *	reduced unless some CPU is looking for something on that queue.
20757c478bdstevel@tonic-gate *
20767c478bdstevel@tonic-gate *	The priority argument is the known upper limit.
20777c478bdstevel@tonic-gate *
20787c478bdstevel@tonic-gate *	Perhaps this should be kept accurately, but that probably means
20797c478bdstevel@tonic-gate *	separate bitmaps for bound and unbound threads.  Since only idled
20807c478bdstevel@tonic-gate *	CPUs will have to do this recalculation, it seems better this way.
20817c478bdstevel@tonic-gate */
20827c478bdstevel@tonic-gatestatic void
20837c478bdstevel@tonic-gatedisp_fix_unbound_pri(disp_t *dp, pri_t pri)
20847c478bdstevel@tonic-gate{
20857c478bdstevel@tonic-gate	kthread_t	*tp;
20867c478bdstevel@tonic-gate	dispq_t		*dq;
20877c478bdstevel@tonic-gate	ulong_t		*dqactmap = dp->disp_qactmap;
20887c478bdstevel@tonic-gate	ulong_t		mapword;
20897c478bdstevel@tonic-gate	int		wx;
20907c478bdstevel@tonic-gate
20917c478bdstevel@tonic-gate	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
20927c478bdstevel@tonic-gate
20937c478bdstevel@tonic-gate	ASSERT(pri >= 0);			/* checked by caller */
20947c478bdstevel@tonic-gate
20957c478bdstevel@tonic-gate	/*
20967c478bdstevel@tonic-gate	 * Start the search at the next lowest priority below the supplied
20977c478bdstevel@tonic-gate	 * priority.  This depends on the bitmap implementation.
20987c478bdstevel@tonic-gate	 */
20997c478bdstevel@tonic-gate	do {
21007c478bdstevel@tonic-gate		wx = pri >> BT_ULSHIFT;		/* index of word in map */
21017c478bdstevel@tonic-gate
21027c478bdstevel@tonic-gate		/*
21037c478bdstevel@tonic-gate		 * Form mask for all lower priorities in the word.
21047c478bdstevel@tonic-gate		 */
21057c478bdstevel@tonic-gate		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
21067c478bdstevel@tonic-gate
21077c478bdstevel@tonic-gate		/*
21087c478bdstevel@tonic-gate		 * Get next lower active priority.
21097c478bdstevel@tonic-gate		 */
21107c478bdstevel@tonic-gate		if (mapword != 0) {
21117c478bdstevel@tonic-gate			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
21127c478bdstevel@tonic-gate		} else if (wx > 0) {
21137c478bdstevel@tonic-gate			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
21147c478bdstevel@tonic-gate			if (pri < 0)
21157c478bdstevel@tonic-gate				break;
21167c478bdstevel@tonic-gate		} else {
21177c478bdstevel@tonic-gate			pri = -1;
21187c478bdstevel@tonic-gate			break;
21197c478bdstevel@tonic-gate		}
21207c478bdstevel@tonic-gate
21217c478bdstevel@tonic-gate		/*
21227c478bdstevel@tonic-gate		 * Search the queue for unbound, runnable threads.
21237c478bdstevel@tonic-gate		 */
21247c478bdstevel@tonic-gate		dq = &dp->disp_q[pri];
21257c478bdstevel@tonic-gate		tp = dq->dq_first;
21267c478bdstevel@tonic-gate
21277c478bdstevel@tonic-gate		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
21287c478bdstevel@tonic-gate			tp = tp->t_link;
21297c478bdstevel@tonic-gate		}
21307c478bdstevel@tonic-gate
21317c478bdstevel@tonic-gate		/*
21327c478bdstevel@tonic-gate		 * If a thread was found, set the priority and return.
21337c478bdstevel@tonic-gate		 */
21347c478bdstevel@tonic-gate	} while (tp == NULL);