1*0e751525SEric Saxe /*
2*0e751525SEric Saxe  * CDDL HEADER START
3*0e751525SEric Saxe  *
4*0e751525SEric Saxe  * The contents of this file are subject to the terms of the
5*0e751525SEric Saxe  * Common Development and Distribution License (the "License").
6*0e751525SEric Saxe  * You may not use this file except in compliance with the License.
7*0e751525SEric Saxe  *
8*0e751525SEric Saxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*0e751525SEric Saxe  * or http://www.opensolaris.org/os/licensing.
10*0e751525SEric Saxe  * See the License for the specific language governing permissions
11*0e751525SEric Saxe  * and limitations under the License.
12*0e751525SEric Saxe  *
13*0e751525SEric Saxe  * When distributing Covered Code, include this CDDL HEADER in each
14*0e751525SEric Saxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*0e751525SEric Saxe  * If applicable, add the following below this CDDL HEADER, with the
16*0e751525SEric Saxe  * fields enclosed by brackets "[]" replaced with your own identifying
17*0e751525SEric Saxe  * information: Portions Copyright [yyyy] [name of copyright owner]
18*0e751525SEric Saxe  *
19*0e751525SEric Saxe  * CDDL HEADER END
20*0e751525SEric Saxe  */
21*0e751525SEric Saxe /*
22*0e751525SEric Saxe  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23*0e751525SEric Saxe  * Use is subject to license terms.
24*0e751525SEric Saxe  */
25*0e751525SEric Saxe 
26*0e751525SEric Saxe #include <sys/systm.h>
27*0e751525SEric Saxe #include <sys/types.h>
28*0e751525SEric Saxe #include <sys/param.h>
29*0e751525SEric Saxe #include <sys/thread.h>
30*0e751525SEric Saxe #include <sys/cpuvar.h>
31*0e751525SEric Saxe #include <sys/cpupart.h>
32*0e751525SEric Saxe #include <sys/cmn_err.h>
33*0e751525SEric Saxe #include <sys/disp.h>
34*0e751525SEric Saxe #include <sys/group.h>
35*0e751525SEric Saxe #include <sys/bitset.h>
36*0e751525SEric Saxe #include <sys/lgrp.h>
37*0e751525SEric Saxe #include <sys/cmt.h>
38*0e751525SEric Saxe 
39*0e751525SEric Saxe /*
40*0e751525SEric Saxe  * CMT dispatcher policies
41*0e751525SEric Saxe  *
42*0e751525SEric Saxe  * This file implements CMT dispatching policies using Processor Groups.
43*0e751525SEric Saxe  *
44*0e751525SEric Saxe  * The scheduler/dispatcher leverages knowledge of the performance
45*0e751525SEric Saxe  * relevant CMT sharing relationships existing between CPUs to implement
46*0e751525SEric Saxe  * load balancing, and coalescence thread placement policies.
47*0e751525SEric Saxe  *
48*0e751525SEric Saxe  * Load balancing policy seeks to improve performance by minimizing
49*0e751525SEric Saxe  * contention over shared processor resources / facilities. Coalescence
50*0e751525SEric Saxe  * policies improve resource utilization and ultimately power efficiency.
51*0e751525SEric Saxe  *
52*0e751525SEric Saxe  * On NUMA systems, the dispatcher will generally perform load balancing and
53*0e751525SEric Saxe  * coalescence within (and not across) lgroups. This is because there isn't
54*0e751525SEric Saxe  * much sense in trying to correct an imbalance by sending a thread outside
55*0e751525SEric Saxe  * of its home, if it would attempt to return home a short while later.
56*0e751525SEric Saxe  * The dispatcher will implement CMT policy across lgroups however, if
57*0e751525SEric Saxe  * it can do so with a thread homed to the root lgroup, since root homed
58*0e751525SEric Saxe  * threads have no lgroup affinity.
59*0e751525SEric Saxe  */
60*0e751525SEric Saxe 
61*0e751525SEric Saxe /*
62*0e751525SEric Saxe  * Return non-zero if, given the policy, we should migrate from running
63*0e751525SEric Saxe  * somewhere "here" to somewhere "there".
64*0e751525SEric Saxe  */
65*0e751525SEric Saxe static int
cmt_should_migrate(pg_cmt_t * here,pg_cmt_t * there,pg_cmt_policy_t policy,int self)66*0e751525SEric Saxe cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
67*0e751525SEric Saxe     int self)
68*0e751525SEric Saxe {
69*0e751525SEric Saxe 	uint32_t here_util, there_util;
70*0e751525SEric Saxe 
71*0e751525SEric Saxe 	here_util = here->cmt_utilization;
72*0e751525SEric Saxe 	there_util = there->cmt_utilization;
73*0e751525SEric Saxe 
74*0e751525SEric Saxe 	/*
75*0e751525SEric Saxe 	 * This assumes that curthread's utilization is "1"
76*0e751525SEric Saxe 	 */
77*0e751525SEric Saxe 	if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
78*0e751525SEric Saxe 		here_util--;	/* Ignore curthread's effect */
79*0e751525SEric Saxe 
80*0e751525SEric Saxe 	/*
81*0e751525SEric Saxe 	 * Load balancing and coalescence are conflicting policies
82*0e751525SEric Saxe 	 */
83*0e751525SEric Saxe 	ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
84*0e751525SEric Saxe 	    (CMT_BALANCE|CMT_COALESCE));
85*0e751525SEric Saxe 
86*0e751525SEric Saxe 	if (policy & CMT_BALANCE) {
87*0e751525SEric Saxe 		/*
88*0e751525SEric Saxe 		 * Balance utilization
89*0e751525SEric Saxe 		 *
90*0e751525SEric Saxe 		 * If the target is comparatively underutilized
91*0e751525SEric Saxe 		 * (either in an absolute sense, or scaled by capacity),
92*0e751525SEric Saxe 		 * then choose to balance.
93*0e751525SEric Saxe 		 */
94*0e751525SEric Saxe 		if ((here_util > there_util) ||
95*0e751525SEric Saxe 		    (here_util == there_util &&
96*0e751525SEric Saxe 		    (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
97*0e751525SEric Saxe 			return (1);
98*0e751525SEric Saxe 		}
99*0e751525SEric Saxe 	} else if (policy & CMT_COALESCE) {
100*0e751525SEric Saxe 		/*
101*0e751525SEric Saxe 		 * Attempt to drive group utilization up to capacity
102*0e751525SEric Saxe 		 */
103*0e751525SEric Saxe 		if (there_util > here_util &&
104*0e751525SEric Saxe 		    there_util < CMT_CAPACITY(there))
105*0e751525SEric Saxe 			return (1);
106*0e751525SEric Saxe 	}
107*0e751525SEric Saxe 	return (0);
108*0e751525SEric Saxe }
109*0e751525SEric Saxe 
110*0e751525SEric Saxe /*
111*0e751525SEric Saxe  * Perform multi-level CMT load balancing of running threads.
112*0e751525SEric Saxe  *
113*0e751525SEric Saxe  * tp is the thread being enqueued.
114*0e751525SEric Saxe  * cp is a hint CPU, against which CMT load balancing will be performed.
115*0e751525SEric Saxe  *
116*0e751525SEric Saxe  * Returns cp, or a CPU better than cp with respect to balancing
117*0e751525SEric Saxe  * running thread load.
118*0e751525SEric Saxe  */
119*0e751525SEric Saxe cpu_t *
cmt_balance(kthread_t * tp,cpu_t * cp)120*0e751525SEric Saxe cmt_balance(kthread_t *tp, cpu_t *cp)
121*0e751525SEric Saxe {
122*0e751525SEric Saxe 	int		hint, i, cpu, nsiblings;
123*0e751525SEric Saxe 	int		self = 0;
124*0e751525SEric Saxe 	group_t		*cmt_pgs, *siblings;
125*0e751525SEric Saxe 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
126*0e751525SEric Saxe 	int		level = 0;
127*0e751525SEric Saxe 	cpu_t		*newcp;
128*0e751525SEric Saxe 	extern cmt_lgrp_t *cmt_root;
129*0e751525SEric Saxe 
130*0e751525SEric Saxe 	ASSERT(THREAD_LOCK_HELD(tp));
131*0e751525SEric Saxe 
132*0e751525SEric Saxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
133*0e751525SEric Saxe 
134*0e751525SEric Saxe 	if (GROUP_SIZE(cmt_pgs) == 0)
135*0e751525SEric Saxe 		return (cp);	/* nothing to do */
136*0e751525SEric Saxe 
137*0e751525SEric Saxe 	if (tp == curthread)
138*0e751525SEric Saxe 		self = 1;
139*0e751525SEric Saxe 
140*0e751525SEric Saxe 	/*
141*0e751525SEric Saxe 	 * Balance across siblings in the CPUs CMT lineage
142*0e751525SEric Saxe 	 * If the thread is homed to the root lgroup, perform
143*0e751525SEric Saxe 	 * top level balancing against other top level PGs
144*0e751525SEric Saxe 	 * in the system. Otherwise, start with the default
145*0e751525SEric Saxe 	 * top level siblings group, which is within the leaf lgroup
146*0e751525SEric Saxe 	 */
147*0e751525SEric Saxe 	pg = GROUP_ACCESS(cmt_pgs, level);
148*0e751525SEric Saxe 	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
149*0e751525SEric Saxe 		siblings = &cmt_root->cl_pgs;
150*0e751525SEric Saxe 	else
151*0e751525SEric Saxe 		siblings = pg->cmt_siblings;
152*0e751525SEric Saxe 
153*0e751525SEric Saxe 	/*
154*0e751525SEric Saxe 	 * Traverse down the lineage until we find a level that needs
155*0e751525SEric Saxe 	 * balancing, or we get to the end.
156*0e751525SEric Saxe 	 */
157*0e751525SEric Saxe 	for (;;) {
158*0e751525SEric Saxe 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
159*0e751525SEric Saxe 		if (nsiblings == 1)
160*0e751525SEric Saxe 			goto next_level;
161*0e751525SEric Saxe 
162*0e751525SEric Saxe 		hint = CPU_PSEUDO_RANDOM() % nsiblings;
163*0e751525SEric Saxe 
164*0e751525SEric Saxe 		/*
165*0e751525SEric Saxe 		 * Find a balancing candidate from among our siblings
166*0e751525SEric Saxe 		 * "hint" is a hint for where to start looking
167*0e751525SEric Saxe 		 */
168*0e751525SEric Saxe 		i = hint;
169*0e751525SEric Saxe 		do {
170*0e751525SEric Saxe 			ASSERT(i < nsiblings);
171*0e751525SEric Saxe 			pg_tmp = GROUP_ACCESS(siblings, i);
172*0e751525SEric Saxe 
173*0e751525SEric Saxe 			/*
174*0e751525SEric Saxe 			 * The candidate must not be us, and must
175*0e751525SEric Saxe 			 * have some CPU resources in the thread's
176*0e751525SEric Saxe 			 * partition
177*0e751525SEric Saxe 			 */
178*0e751525SEric Saxe 			if (pg_tmp != pg &&
179*0e751525SEric Saxe 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
180*0e751525SEric Saxe 			    ((pg_t *)pg_tmp)->pg_id)) {
181*0e751525SEric Saxe 				tpg = pg_tmp;
182*0e751525SEric Saxe 				break;
183*0e751525SEric Saxe 			}
184*0e751525SEric Saxe 
185*0e751525SEric Saxe 			if (++i >= nsiblings)
186*0e751525SEric Saxe 				i = 0;
187*0e751525SEric Saxe 		} while (i != hint);
188*0e751525SEric Saxe 
189*0e751525SEric Saxe 		if (!tpg)
190*0e751525SEric Saxe 			goto next_level; /* no candidates at this level */
191*0e751525SEric Saxe 
192*0e751525SEric Saxe 		/*
193*0e751525SEric Saxe 		 * Decide if we should migrate from the current PG to a
194*0e751525SEric Saxe 		 * target PG given a policy
195*0e751525SEric Saxe 		 */
196*0e751525SEric Saxe 		if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
197*0e751525SEric Saxe 			break;
198*0e751525SEric Saxe 		tpg = NULL;
199*0e751525SEric Saxe 
200*0e751525SEric Saxe next_level:
201*0e751525SEric Saxe 		if (++level == GROUP_SIZE(cmt_pgs))
202*0e751525SEric Saxe 			break;
203*0e751525SEric Saxe 
204*0e751525SEric Saxe 		pg = GROUP_ACCESS(cmt_pgs, level);
205*0e751525SEric Saxe 		siblings = pg->cmt_siblings;
206*0e751525SEric Saxe 	}
207*0e751525SEric Saxe 
208*0e751525SEric Saxe 	if (tpg) {
209*0e751525SEric Saxe 		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
210*0e751525SEric Saxe 
211*0e751525SEric Saxe 		/*
212*0e751525SEric Saxe 		 * Select an idle CPU from the target
213*0e751525SEric Saxe 		 */
214*0e751525SEric Saxe 		hint = CPU_PSEUDO_RANDOM() % tgt_size;
215*0e751525SEric Saxe 		cpu = hint;
216*0e751525SEric Saxe 		do {
217*0e751525SEric Saxe 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
218*0e751525SEric Saxe 			if (newcp->cpu_part == tp->t_cpupart &&
219*0e751525SEric Saxe 			    newcp->cpu_dispatch_pri == -1) {
220*0e751525SEric Saxe 				cp = newcp;
221*0e751525SEric Saxe 				break;
222*0e751525SEric Saxe 			}
223*0e751525SEric Saxe 			if (++cpu == tgt_size)
224*0e751525SEric Saxe 				cpu = 0;
225*0e751525SEric Saxe 		} while (cpu != hint);
226*0e751525SEric Saxe 	}
227*0e751525SEric Saxe 
228*0e751525SEric Saxe 	return (cp);
229*0e751525SEric Saxe }
230