1*0e751525SEric Saxe /*
2*0e751525SEric Saxe * CDDL HEADER START
3*0e751525SEric Saxe *
4*0e751525SEric Saxe * The contents of this file are subject to the terms of the
5*0e751525SEric Saxe * Common Development and Distribution License (the "License").
6*0e751525SEric Saxe * You may not use this file except in compliance with the License.
7*0e751525SEric Saxe *
8*0e751525SEric Saxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*0e751525SEric Saxe * or http://www.opensolaris.org/os/licensing.
10*0e751525SEric Saxe * See the License for the specific language governing permissions
11*0e751525SEric Saxe * and limitations under the License.
12*0e751525SEric Saxe *
13*0e751525SEric Saxe * When distributing Covered Code, include this CDDL HEADER in each
14*0e751525SEric Saxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*0e751525SEric Saxe * If applicable, add the following below this CDDL HEADER, with the
16*0e751525SEric Saxe * fields enclosed by brackets "[]" replaced with your own identifying
17*0e751525SEric Saxe * information: Portions Copyright [yyyy] [name of copyright owner]
18*0e751525SEric Saxe *
19*0e751525SEric Saxe * CDDL HEADER END
20*0e751525SEric Saxe */
21*0e751525SEric Saxe /*
22*0e751525SEric Saxe * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23*0e751525SEric Saxe * Use is subject to license terms.
24*0e751525SEric Saxe */
25*0e751525SEric Saxe
26*0e751525SEric Saxe #include <sys/systm.h>
27*0e751525SEric Saxe #include <sys/types.h>
28*0e751525SEric Saxe #include <sys/param.h>
29*0e751525SEric Saxe #include <sys/thread.h>
30*0e751525SEric Saxe #include <sys/cpuvar.h>
31*0e751525SEric Saxe #include <sys/cpupart.h>
32*0e751525SEric Saxe #include <sys/cmn_err.h>
33*0e751525SEric Saxe #include <sys/disp.h>
34*0e751525SEric Saxe #include <sys/group.h>
35*0e751525SEric Saxe #include <sys/bitset.h>
36*0e751525SEric Saxe #include <sys/lgrp.h>
37*0e751525SEric Saxe #include <sys/cmt.h>
38*0e751525SEric Saxe
39*0e751525SEric Saxe /*
40*0e751525SEric Saxe * CMT dispatcher policies
41*0e751525SEric Saxe *
42*0e751525SEric Saxe * This file implements CMT dispatching policies using Processor Groups.
43*0e751525SEric Saxe *
44*0e751525SEric Saxe * The scheduler/dispatcher leverages knowledge of the performance
45*0e751525SEric Saxe * relevant CMT sharing relationships existing between CPUs to implement
46*0e751525SEric Saxe * load balancing, and coalescence thread placement policies.
47*0e751525SEric Saxe *
48*0e751525SEric Saxe * Load balancing policy seeks to improve performance by minimizing
49*0e751525SEric Saxe * contention over shared processor resources / facilities. Coalescence
50*0e751525SEric Saxe * policies improve resource utilization and ultimately power efficiency.
51*0e751525SEric Saxe *
52*0e751525SEric Saxe * On NUMA systems, the dispatcher will generally perform load balancing and
53*0e751525SEric Saxe * coalescence within (and not across) lgroups. This is because there isn't
54*0e751525SEric Saxe * much sense in trying to correct an imbalance by sending a thread outside
55*0e751525SEric Saxe * of its home, if it would attempt to return home a short while later.
56*0e751525SEric Saxe * The dispatcher will implement CMT policy across lgroups however, if
57*0e751525SEric Saxe * it can do so with a thread homed to the root lgroup, since root homed
58*0e751525SEric Saxe * threads have no lgroup affinity.
59*0e751525SEric Saxe */
60*0e751525SEric Saxe
61*0e751525SEric Saxe /*
62*0e751525SEric Saxe * Return non-zero if, given the policy, we should migrate from running
63*0e751525SEric Saxe * somewhere "here" to somewhere "there".
64*0e751525SEric Saxe */
65*0e751525SEric Saxe static int
cmt_should_migrate(pg_cmt_t * here,pg_cmt_t * there,pg_cmt_policy_t policy,int self)66*0e751525SEric Saxe cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
67*0e751525SEric Saxe int self)
68*0e751525SEric Saxe {
69*0e751525SEric Saxe uint32_t here_util, there_util;
70*0e751525SEric Saxe
71*0e751525SEric Saxe here_util = here->cmt_utilization;
72*0e751525SEric Saxe there_util = there->cmt_utilization;
73*0e751525SEric Saxe
74*0e751525SEric Saxe /*
75*0e751525SEric Saxe * This assumes that curthread's utilization is "1"
76*0e751525SEric Saxe */
77*0e751525SEric Saxe if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
78*0e751525SEric Saxe here_util--; /* Ignore curthread's effect */
79*0e751525SEric Saxe
80*0e751525SEric Saxe /*
81*0e751525SEric Saxe * Load balancing and coalescence are conflicting policies
82*0e751525SEric Saxe */
83*0e751525SEric Saxe ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
84*0e751525SEric Saxe (CMT_BALANCE|CMT_COALESCE));
85*0e751525SEric Saxe
86*0e751525SEric Saxe if (policy & CMT_BALANCE) {
87*0e751525SEric Saxe /*
88*0e751525SEric Saxe * Balance utilization
89*0e751525SEric Saxe *
90*0e751525SEric Saxe * If the target is comparatively underutilized
91*0e751525SEric Saxe * (either in an absolute sense, or scaled by capacity),
92*0e751525SEric Saxe * then choose to balance.
93*0e751525SEric Saxe */
94*0e751525SEric Saxe if ((here_util > there_util) ||
95*0e751525SEric Saxe (here_util == there_util &&
96*0e751525SEric Saxe (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
97*0e751525SEric Saxe return (1);
98*0e751525SEric Saxe }
99*0e751525SEric Saxe } else if (policy & CMT_COALESCE) {
100*0e751525SEric Saxe /*
101*0e751525SEric Saxe * Attempt to drive group utilization up to capacity
102*0e751525SEric Saxe */
103*0e751525SEric Saxe if (there_util > here_util &&
104*0e751525SEric Saxe there_util < CMT_CAPACITY(there))
105*0e751525SEric Saxe return (1);
106*0e751525SEric Saxe }
107*0e751525SEric Saxe return (0);
108*0e751525SEric Saxe }
109*0e751525SEric Saxe
/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued.
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int hint, i, cpu, nsiblings;
	int self = 0;
	group_t *cmt_pgs, *siblings;
	pg_cmt_t *pg, *pg_tmp, *tpg = NULL;
	int level = 0;
	cpu_t *newcp;
	extern cmt_lgrp_t *cmt_root;

	ASSERT(THREAD_LOCK_HELD(tp));

	/* The hint CPU's CMT PG lineage (index 0 is the top level) */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	/*
	 * Note whether we're balancing on behalf of the running thread
	 * itself, so cmt_should_migrate() can discount its utilization.
	 */
	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPUs CMT lineage
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			goto next_level;	/* nobody to balance against */

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings
		 * "hint" is a hint for where to start looking
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			/* Wrap the search around the sibling group */
			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			goto next_level; /* no candidates at this level */

		/*
		 * Decide if we should migrate from the current PG to a
		 * target PG given a policy
		 */
		if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
			break;
		tpg = NULL;	/* candidate rejected; search deeper */

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;	/* end of the lineage; no target found */

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}

	if (tpg) {
		uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);

		/*
		 * Select an idle CPU from the target
		 */
		hint = CPU_PSEUDO_RANDOM() % tgt_size;
		cpu = hint;
		do {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
			/* Wrap the search around the active CPU group */
			if (++cpu == tgt_size)
				cpu = 0;
		} while (cpu != hint);
		/* No idle CPU found in tpg: fall through with cp unchanged */
	}

	return (cp);
}
230