1455e370cSJohn Levon /*
2455e370cSJohn Levon * This file and its contents are supplied under the terms of the
3455e370cSJohn Levon * Common Development and Distribution License ("CDDL"), version 1.0.
4455e370cSJohn Levon * You may only use this file in accordance with the terms of version
5455e370cSJohn Levon * 1.0 of the CDDL.
6455e370cSJohn Levon *
7455e370cSJohn Levon * A full copy of the text of the CDDL should have accompanied this
8455e370cSJohn Levon * source. A copy of the CDDL is also available via the Internet at
9455e370cSJohn Levon * http://www.illumos.org/license/CDDL.
10455e370cSJohn Levon */
11455e370cSJohn Levon
12455e370cSJohn Levon /*
13a9cc46cfSRobert Mustacchi * Copyright 2019 Joyent, Inc.
14455e370cSJohn Levon */
15455e370cSJohn Levon
16455e370cSJohn Levon /*
17c3377ee9SJohn Levon * SMT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
18455e370cSJohn Levon * non-root guest mode, when certain threads are running on the other sibling.
19455e370cSJohn Levon * This avoids speculation-based information leaks such as L1TF being available
20455e370cSJohn Levon * to the untrusted guest. The stance we take is that threads from the same
21455e370cSJohn Levon * zone as the guest VCPU thread are considered safe to run alongside, but all
22455e370cSJohn Levon * other threads (except the idle thread), and all interrupts, are unsafe. Note
23455e370cSJohn Levon * that due to the implementation here, there are significant sections of e.g.
24455e370cSJohn Levon * the dispatcher code that can run concurrently with a guest, until the thread
25c3377ee9SJohn Levon * reaches smt_mark(). This code assumes there are only two SMT threads per
26c3377ee9SJohn Levon * core.
27455e370cSJohn Levon *
28455e370cSJohn Levon * The entry points are as follows:
29455e370cSJohn Levon *
30c3377ee9SJohn Levon * smt_mark_as_vcpu()
31455e370cSJohn Levon *
32455e370cSJohn Levon * All threads that enter guest mode (i.e. VCPU threads) need to call this at
33455e370cSJohn Levon * least once, which sets TS_VCPU in ->t_schedflag.
34455e370cSJohn Levon *
35c3377ee9SJohn Levon * smt_mark()
36455e370cSJohn Levon *
37455e370cSJohn Levon * A new ->cpu_thread is now curthread (although interrupt threads have their
38455e370cSJohn Levon * own separate handling). After preventing any interrupts, we will take our
39c3377ee9SJohn Levon * own CPU's spinlock and update our own state in mcpu_smt.
40455e370cSJohn Levon *
41455e370cSJohn Levon * If our sibling is poisoned (i.e. in guest mode or the little bit of code
42455e370cSJohn Levon * around it), and we're not compatible (that is, same zone ID, or the idle
43c3377ee9SJohn Levon * thread), then we need to smt_kick() that sibling. smt_kick() itself waits
44c3377ee9SJohn Levon * for the sibling to call smt_release(), and it will not re-enter guest mode
45c3377ee9SJohn Levon * until allowed.
46455e370cSJohn Levon *
47455e370cSJohn Levon * Note that we ignore the fact a process can change its zone ID: poisoning
48455e370cSJohn Levon * threads never do so, and we can ignore the other cases.
49455e370cSJohn Levon *
50c3377ee9SJohn Levon * smt_acquire()
51455e370cSJohn Levon *
52455e370cSJohn Levon * We are a VCPU thread about to start guest execution. Interrupts are
53c3377ee9SJohn Levon * disabled. We must have already run smt_mark() to be in this code, so there's
54455e370cSJohn Levon * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
55455e370cSJohn Levon * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
56c3377ee9SJohn Levon * sibling cpu_smt_t. This is so smt_mark() will only ever need to look at its
57c3377ee9SJohn Levon * local mcpu_smt.
58455e370cSJohn Levon *
59c3377ee9SJohn Levon * We'll loop here for up to smt_acquire_wait_time microseconds; this is mainly
60455e370cSJohn Levon * to wait out any sibling interrupt: many of them will complete quicker than
61455e370cSJohn Levon * this.
62455e370cSJohn Levon *
63455e370cSJohn Levon * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
64455e370cSJohn Levon * mitigation against L1TF: no incompatible thread will now be able to populate
65c3377ee9SJohn Levon * the L1 cache until *we* smt_release().
66455e370cSJohn Levon *
67c3377ee9SJohn Levon * smt_release()
68455e370cSJohn Levon *
69c3377ee9SJohn Levon * Simply unpoison ourselves similarly to smt_acquire(); smt_kick() will wait
70c3377ee9SJohn Levon * for this to happen if needed.
71455e370cSJohn Levon *
72c3377ee9SJohn Levon * smt_begin_intr()
73455e370cSJohn Levon *
74455e370cSJohn Levon * In an interrupt prolog. We're either a hilevel interrupt, or a pinning
75455e370cSJohn Levon * interrupt. In both cases, we mark our interrupt depth, and potentially
76c3377ee9SJohn Levon * smt_kick(). This enforces exclusion, but doesn't otherwise modify
77c3377ee9SJohn Levon * ->cs_state: we want the dispatcher code to essentially ignore interrupts.
78455e370cSJohn Levon *
79c3377ee9SJohn Levon * smt_end_intr()
80455e370cSJohn Levon *
81455e370cSJohn Levon * In an interrupt epilogue *or* thread_unpin(). In the first case, we never
82455e370cSJohn Levon * slept, and we can simply decrement our counter. In the second case, we're an
83455e370cSJohn Levon * interrupt thread about to sleep: we'll still just decrement our counter, and
84455e370cSJohn Levon * henceforth treat the thread as a normal thread when it next gets scheduled,
85455e370cSJohn Levon * until it finally gets to its epilogue.
86455e370cSJohn Levon *
87c3377ee9SJohn Levon * smt_begin_unsafe() / smt_end_unsafe()
88455e370cSJohn Levon *
89455e370cSJohn Levon * Mark the current thread as temporarily unsafe (guests should not be executing
90455e370cSJohn Levon * while a sibling is marked unsafe). This can be used for a thread that's
91455e370cSJohn Levon * otherwise considered safe, if it needs to handle potentially sensitive data.
92455e370cSJohn Levon * Right now, this means certain I/O handling operations that reach down into
93455e370cSJohn Levon * the networking and ZFS sub-systems.
94455e370cSJohn Levon *
95c3377ee9SJohn Levon * smt_should_run(thread, cpu)
96455e370cSJohn Levon *
97455e370cSJohn Levon * This is used by the dispatcher when making scheduling decisions: if the
98455e370cSJohn Levon * sibling is compatible with the given thread, we return B_TRUE. This is
99c3377ee9SJohn Levon * essentially trying to guess if any subsequent smt_acquire() will fail, by
100455e370cSJohn Levon * peeking at the sibling CPU's state. The peek is racy, but if we get things
101c3377ee9SJohn Levon * wrong, the "only" consequence is that smt_acquire() may lose.
102455e370cSJohn Levon *
103c3377ee9SJohn Levon * smt_adjust_cpu_score()
104455e370cSJohn Levon *
105455e370cSJohn Levon * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here,
106455e370cSJohn Levon * we'll add a small penalty to the score. This also makes sure a VCPU thread
107455e370cSJohn Levon * migration behaves properly.
108c3377ee9SJohn Levon *
109c3377ee9SJohn Levon * smt_init() / smt_late_init()
110c3377ee9SJohn Levon *
111c3377ee9SJohn Levon * Set up SMT handling. If smt_boot_disable is set, smt_late_init(), which runs
112c3377ee9SJohn Levon * late enough to be able to do so, will offline and mark CPU_DISABLED all the
113c3377ee9SJohn Levon * siblings. smt_disable() can also be called after boot via psradm -Ha.
114455e370cSJohn Levon */
115455e370cSJohn Levon
116455e370cSJohn Levon #include <sys/archsystm.h>
117455e370cSJohn Levon #include <sys/disp.h>
118455e370cSJohn Levon #include <sys/cmt.h>
119455e370cSJohn Levon #include <sys/systm.h>
120455e370cSJohn Levon #include <sys/cpu.h>
121455e370cSJohn Levon #include <sys/var.h>
122455e370cSJohn Levon #include <sys/xc_levels.h>
123455e370cSJohn Levon #include <sys/cmn_err.h>
124455e370cSJohn Levon #include <sys/sysmacros.h>
125455e370cSJohn Levon #include <sys/x86_archext.h>
126c3377ee9SJohn Levon #include <sys/esunddi.h>
127c3377ee9SJohn Levon #include <sys/promif.h>
128c3377ee9SJohn Levon #include <sys/policy.h>
129c3377ee9SJohn Levon #include <sys/smt.h>
130455e370cSJohn Levon
/*
 * Encoding of the ->cs_state / ->cs_sibstate words: the low CS_SHIFT bits
 * carry the mark (what the CPU is running), and the zone ID is stored in the
 * bits above them.
 *
 *	CS_MARK(s)	extract the mark from a state word
 *	CS_ZONE(s)	extract the zone ID from a state word
 *	CS_MK(s, z)	build a state word from mark 's' and zone ID 'z'
 *
 * Every macro argument is parenthesized so that an arbitrary expression can
 * be passed safely; in particular CS_MK(s, a | b) must shift the whole of
 * (a | b), not just 'b'.
 */
#define	CS_SHIFT	(8)
#define	CS_MASK		((1 << CS_SHIFT) - 1)
#define	CS_MARK(s)	((s) & CS_MASK)
#define	CS_ZONE(s)	((s) >> CS_SHIFT)
#define	CS_MK(s, z)	((s) | ((z) << CS_SHIFT))
136455e370cSJohn Levon
/*
 * The "mark" portion of a CPU's SMT state word: what kind of thing the CPU is
 * currently running.  The numeric order is significant, since code in this
 * file compares marks with '<' (e.g. "< CM_VCPU"), so the values are spelled
 * out explicitly.
 */
typedef enum cs_mark {
	CM_IDLE = 0,		/* running CPU idle thread */
	CM_THREAD = 1,		/* running general non-VCPU thread */
	CM_UNSAFE = 2,		/* running ->t_unsafe thread */
	CM_VCPU = 3,		/* running VCPU thread */
	CM_POISONED = 4		/* running in guest */
} cs_mark_t;
144455e370cSJohn Levon
145455e370cSJohn Levon /* Double-check our false-sharing padding. */
146c3377ee9SJohn Levon CTASSERT(offsetof(cpu_smt_t, cs_sib) == 64);
147455e370cSJohn Levon CTASSERT(CM_IDLE == 0);
148455e370cSJohn Levon CTASSERT(CM_POISONED < (1 << CS_SHIFT));
149455e370cSJohn Levon CTASSERT(CM_POISONED > CM_VCPU);
150455e370cSJohn Levon CTASSERT(CM_VCPU > CM_UNSAFE);
151455e370cSJohn Levon
152455e370cSJohn Levon static uint_t empty_pil = XC_CPUPOKE_PIL;
153455e370cSJohn Levon
154455e370cSJohn Levon /*
155c3377ee9SJohn Levon * If disabled, no SMT exclusion is performed, and system is potentially
156455e370cSJohn Levon * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not
157455e370cSJohn Levon * vulnerable" CPUID bit.
158455e370cSJohn Levon */
159c3377ee9SJohn Levon int smt_exclusion = 1;
160455e370cSJohn Levon
161455e370cSJohn Levon /*
162c3377ee9SJohn Levon * How long smt_acquire() will spin trying to acquire the core, in
163c3377ee9SJohn Levon * micro-seconds. This is enough time to wait out a significant proportion of
164c3377ee9SJohn Levon * interrupts.
165455e370cSJohn Levon */
166c3377ee9SJohn Levon clock_t smt_acquire_wait_time = 64;
167455e370cSJohn Levon
168455e370cSJohn Levon /*
169c3377ee9SJohn Levon * Did we request a disable of SMT at boot time?
170455e370cSJohn Levon */
171c3377ee9SJohn Levon int smt_boot_disable;
172455e370cSJohn Levon
173c3377ee9SJohn Levon /*
174c3377ee9SJohn Levon * Whether SMT is enabled.
175c3377ee9SJohn Levon */
176c3377ee9SJohn Levon int smt_enabled = 1;
177455e370cSJohn Levon
178455e370cSJohn Levon /*
179455e370cSJohn Levon * We're adding an interrupt handler of some kind at the given PIL. If this
180455e370cSJohn Levon * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our
181455e370cSJohn Levon * pil_needs_kick() optimization, as there is now potentially an unsafe
182455e370cSJohn Levon * interrupt handler at that PIL. This typically won't occur, so we're not that
183455e370cSJohn Levon * careful about what's actually getting added, which CPU it's on, or if it gets
184455e370cSJohn Levon * removed. This also presumes that softints can't cover our empty_pil.
185455e370cSJohn Levon */
186455e370cSJohn Levon void
smt_intr_alloc_pil(uint_t pil)187c3377ee9SJohn Levon smt_intr_alloc_pil(uint_t pil)
188455e370cSJohn Levon {
189455e370cSJohn Levon ASSERT(pil <= PIL_MAX);
190455e370cSJohn Levon
191455e370cSJohn Levon if (empty_pil == pil)
192455e370cSJohn Levon empty_pil = PIL_MAX + 1;
193455e370cSJohn Levon }
194455e370cSJohn Levon
195455e370cSJohn Levon /*
196455e370cSJohn Levon * If our sibling is also a VCPU thread from a different zone, we need one of
197455e370cSJohn Levon * them to give up, otherwise they will just battle each other for exclusion
198455e370cSJohn Levon * until they exhaust their quantum.
199455e370cSJohn Levon *
200455e370cSJohn Levon * We arbitrate between them by dispatch priority: clearly, a higher-priority
201455e370cSJohn Levon * thread deserves to win the acquisition. However, under CPU load, it'll be
202455e370cSJohn Levon * very common to see both threads with ->t_pri == 1. If so, we'll break the
203455e370cSJohn Levon * tie by cpu_id (which is hopefully arbitrary enough).
204455e370cSJohn Levon *
205455e370cSJohn Levon * If we lose, the VMM code will take this as a hint to call
206455e370cSJohn Levon * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
207455e370cSJohn Levon * somewhere else.
208455e370cSJohn Levon *
209455e370cSJohn Levon * Note that all of this state examination is racy, as we don't own any locks
210455e370cSJohn Levon * here.
211455e370cSJohn Levon */
212455e370cSJohn Levon static boolean_t
yield_to_vcpu(cpu_t * sib,zoneid_t zoneid)213455e370cSJohn Levon yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
214455e370cSJohn Levon {
215c3377ee9SJohn Levon cpu_smt_t *sibsmt = &sib->cpu_m.mcpu_smt;
216c3377ee9SJohn Levon uint64_t sibstate = sibsmt->cs_state;
217455e370cSJohn Levon
218455e370cSJohn Levon /*
219455e370cSJohn Levon * If we're likely just waiting for an interrupt, don't yield.
220455e370cSJohn Levon */
221c3377ee9SJohn Levon if (sibsmt->cs_intr_depth != 0)
222455e370cSJohn Levon return (B_FALSE);
223455e370cSJohn Levon
224455e370cSJohn Levon /*
225455e370cSJohn Levon * We're only interested in VCPUs from a different zone.
226455e370cSJohn Levon */
227455e370cSJohn Levon if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
228455e370cSJohn Levon return (B_FALSE);
229455e370cSJohn Levon
230455e370cSJohn Levon if (curthread->t_pri < sib->cpu_dispatch_pri)
231455e370cSJohn Levon return (B_TRUE);
232455e370cSJohn Levon
233455e370cSJohn Levon if (curthread->t_pri == sib->cpu_dispatch_pri &&
234455e370cSJohn Levon CPU->cpu_id < sib->cpu_id)
235455e370cSJohn Levon return (B_TRUE);
236455e370cSJohn Levon
237455e370cSJohn Levon return (B_FALSE);
238455e370cSJohn Levon }
239455e370cSJohn Levon
240455e370cSJohn Levon static inline boolean_t
sibling_compatible(cpu_smt_t * sibsmt,zoneid_t zoneid)241c3377ee9SJohn Levon sibling_compatible(cpu_smt_t *sibsmt, zoneid_t zoneid)
242455e370cSJohn Levon {
243c3377ee9SJohn Levon uint64_t sibstate = sibsmt->cs_state;
244455e370cSJohn Levon
245c3377ee9SJohn Levon if (sibsmt->cs_intr_depth != 0)
246455e370cSJohn Levon return (B_FALSE);
247455e370cSJohn Levon
248455e370cSJohn Levon if (CS_MARK(sibstate) == CM_UNSAFE)
249455e370cSJohn Levon return (B_FALSE);
250455e370cSJohn Levon
251455e370cSJohn Levon if (CS_MARK(sibstate) == CM_IDLE)
252455e370cSJohn Levon return (B_TRUE);
253455e370cSJohn Levon
254455e370cSJohn Levon return (CS_ZONE(sibstate) == zoneid);
255455e370cSJohn Levon }
256455e370cSJohn Levon
257455e370cSJohn Levon int
smt_acquire(void)258c3377ee9SJohn Levon smt_acquire(void)
259455e370cSJohn Levon {
260c3377ee9SJohn Levon clock_t wait = smt_acquire_wait_time;
261c3377ee9SJohn Levon cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt;
262455e370cSJohn Levon zoneid_t zoneid = getzoneid();
263c3377ee9SJohn Levon cpu_smt_t *sibsmt;
264455e370cSJohn Levon int ret = 0;
265455e370cSJohn Levon
266455e370cSJohn Levon ASSERT(!interrupts_enabled());
267455e370cSJohn Levon
268c3377ee9SJohn Levon if (smt->cs_sib == NULL) {
269455e370cSJohn Levon /* For the "sequential" L1TF case. */
270a9cc46cfSRobert Mustacchi spec_uarch_flush();
271455e370cSJohn Levon return (1);
272455e370cSJohn Levon }
273455e370cSJohn Levon
274c3377ee9SJohn Levon sibsmt = &smt->cs_sib->cpu_m.mcpu_smt;
275455e370cSJohn Levon
276455e370cSJohn Levon /* A VCPU thread should never change zone. */
277c3377ee9SJohn Levon ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid);
278c3377ee9SJohn Levon ASSERT3U(CS_MARK(smt->cs_state), ==, CM_VCPU);
279455e370cSJohn Levon ASSERT3U(curthread->t_preempt, >=, 1);
280455e370cSJohn Levon ASSERT(curthread->t_schedflag & TS_VCPU);
281455e370cSJohn Levon
282455e370cSJohn Levon while (ret == 0 && wait > 0) {
283455e370cSJohn Levon
284c3377ee9SJohn Levon if (yield_to_vcpu(smt->cs_sib, zoneid)) {
285455e370cSJohn Levon ret = -1;
286455e370cSJohn Levon break;
287455e370cSJohn Levon }
288455e370cSJohn Levon
289c3377ee9SJohn Levon if (sibling_compatible(sibsmt, zoneid)) {
290c3377ee9SJohn Levon lock_set(&sibsmt->cs_lock);
291455e370cSJohn Levon
292c3377ee9SJohn Levon if (sibling_compatible(sibsmt, zoneid)) {
293c3377ee9SJohn Levon smt->cs_state = CS_MK(CM_POISONED, zoneid);
294c3377ee9SJohn Levon sibsmt->cs_sibstate = CS_MK(CM_POISONED,
295c3377ee9SJohn Levon zoneid);
296455e370cSJohn Levon membar_enter();
297455e370cSJohn Levon ret = 1;
298455e370cSJohn Levon }
299455e370cSJohn Levon
300c3377ee9SJohn Levon lock_clear(&sibsmt->cs_lock);
301455e370cSJohn Levon } else {
302455e370cSJohn Levon drv_usecwait(10);
303455e370cSJohn Levon wait -= 10;
304455e370cSJohn Levon }
305455e370cSJohn Levon }
306455e370cSJohn Levon
307c3377ee9SJohn Levon DTRACE_PROBE4(smt__acquire, int, ret, uint64_t, sibsmt->cs_state,
308c3377ee9SJohn Levon uint64_t, sibsmt->cs_intr_depth, clock_t, wait);
309455e370cSJohn Levon
310455e370cSJohn Levon if (ret == 1)
311a9cc46cfSRobert Mustacchi spec_uarch_flush();
312455e370cSJohn Levon
313455e370cSJohn Levon return (ret);
314455e370cSJohn Levon }
315455e370cSJohn Levon
316455e370cSJohn Levon void
smt_release(void)317c3377ee9SJohn Levon smt_release(void)
318455e370cSJohn Levon {
319c3377ee9SJohn Levon cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt;
320455e370cSJohn Levon zoneid_t zoneid = getzoneid();
321c3377ee9SJohn Levon cpu_smt_t *sibsmt;
322455e370cSJohn Levon
323455e370cSJohn Levon ASSERT(!interrupts_enabled());
324455e370cSJohn Levon
325c3377ee9SJohn Levon if (smt->cs_sib == NULL)
326455e370cSJohn Levon return;
327455e370cSJohn Levon
328c3377ee9SJohn Levon ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid);
329c3377ee9SJohn Levon ASSERT3U(CS_MARK(smt->cs_state), ==, CM_POISONED);
330455e370cSJohn Levon ASSERT3U(curthread->t_preempt, >=, 1);
331455e370cSJohn Levon
332c3377ee9SJohn Levon sibsmt = &smt->cs_sib->cpu_m.mcpu_smt;
333455e370cSJohn Levon
334c3377ee9SJohn Levon lock_set(&sibsmt->cs_lock);
335455e370cSJohn Levon
336c3377ee9SJohn Levon smt->cs_state = CS_MK(CM_VCPU, zoneid);
337c3377ee9SJohn Levon sibsmt->cs_sibstate = CS_MK(CM_VCPU, zoneid);
338455e370cSJohn Levon membar_producer();
339455e370cSJohn Levon
340c3377ee9SJohn Levon lock_clear(&sibsmt->cs_lock);
341455e370cSJohn Levon }
342455e370cSJohn Levon
343455e370cSJohn Levon static void
smt_kick(cpu_smt_t * smt,zoneid_t zoneid)344c3377ee9SJohn Levon smt_kick(cpu_smt_t *smt, zoneid_t zoneid)
345455e370cSJohn Levon {
346455e370cSJohn Levon uint64_t sibstate;
347455e370cSJohn Levon
348c3377ee9SJohn Levon ASSERT(LOCK_HELD(&smt->cs_lock));
349455e370cSJohn Levon ASSERT(!interrupts_enabled());
350455e370cSJohn Levon
351c3377ee9SJohn Levon poke_cpu(smt->cs_sib->cpu_id);
352455e370cSJohn Levon
353455e370cSJohn Levon membar_consumer();
354c3377ee9SJohn Levon sibstate = smt->cs_sibstate;
355455e370cSJohn Levon
356455e370cSJohn Levon if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid)
357455e370cSJohn Levon return;
358455e370cSJohn Levon
359c3377ee9SJohn Levon lock_clear(&smt->cs_lock);
360455e370cSJohn Levon
361455e370cSJohn Levon /*
362455e370cSJohn Levon * Spin until we can see the sibling has been kicked out or is otherwise
363455e370cSJohn Levon * OK.
364455e370cSJohn Levon */
365455e370cSJohn Levon for (;;) {
366455e370cSJohn Levon membar_consumer();
367c3377ee9SJohn Levon sibstate = smt->cs_sibstate;
368455e370cSJohn Levon
369455e370cSJohn Levon if (CS_MARK(sibstate) != CM_POISONED ||
370455e370cSJohn Levon CS_ZONE(sibstate) == zoneid)
371455e370cSJohn Levon break;
372455e370cSJohn Levon
373455e370cSJohn Levon SMT_PAUSE();
374455e370cSJohn Levon }
375455e370cSJohn Levon
376c3377ee9SJohn Levon lock_set(&smt->cs_lock);
377455e370cSJohn Levon }
378455e370cSJohn Levon
379455e370cSJohn Levon static boolean_t
pil_needs_kick(uint_t pil)380455e370cSJohn Levon pil_needs_kick(uint_t pil)
381455e370cSJohn Levon {
382455e370cSJohn Levon return (pil != empty_pil);
383455e370cSJohn Levon }
384455e370cSJohn Levon
385455e370cSJohn Levon void
smt_begin_intr(uint_t pil)386c3377ee9SJohn Levon smt_begin_intr(uint_t pil)
387455e370cSJohn Levon {
388455e370cSJohn Levon ulong_t flags;
389c3377ee9SJohn Levon cpu_smt_t *smt;
390455e370cSJohn Levon
391455e370cSJohn Levon ASSERT(pil <= PIL_MAX);
392455e370cSJohn Levon
393455e370cSJohn Levon flags = intr_clear();
394c3377ee9SJohn Levon smt = &CPU->cpu_m.mcpu_smt;
395455e370cSJohn Levon
396c3377ee9SJohn Levon if (smt->cs_sib == NULL) {
397455e370cSJohn Levon intr_restore(flags);
398455e370cSJohn Levon return;
399455e370cSJohn Levon }
400455e370cSJohn Levon
401c3377ee9SJohn Levon if (atomic_inc_64_nv(&smt->cs_intr_depth) == 1 && pil_needs_kick(pil)) {
402c3377ee9SJohn Levon lock_set(&smt->cs_lock);
403455e370cSJohn Levon
404455e370cSJohn Levon membar_consumer();
405455e370cSJohn Levon
406c3377ee9SJohn Levon if (CS_MARK(smt->cs_sibstate) == CM_POISONED)
407c3377ee9SJohn Levon smt_kick(smt, GLOBAL_ZONEID);
408455e370cSJohn Levon
409c3377ee9SJohn Levon lock_clear(&smt->cs_lock);
410455e370cSJohn Levon }
411455e370cSJohn Levon
412455e370cSJohn Levon intr_restore(flags);
413455e370cSJohn Levon }
414455e370cSJohn Levon
415455e370cSJohn Levon void
smt_end_intr(void)416c3377ee9SJohn Levon smt_end_intr(void)
417455e370cSJohn Levon {
418455e370cSJohn Levon ulong_t flags;
419c3377ee9SJohn Levon cpu_smt_t *smt;
420455e370cSJohn Levon
421455e370cSJohn Levon flags = intr_clear();
422c3377ee9SJohn Levon smt = &CPU->cpu_m.mcpu_smt;
423455e370cSJohn Levon
424c3377ee9SJohn Levon if (smt->cs_sib == NULL) {
425455e370cSJohn Levon intr_restore(flags);
426455e370cSJohn Levon return;
427455e370cSJohn Levon }
428455e370cSJohn Levon
429c3377ee9SJohn Levon ASSERT3U(smt->cs_intr_depth, >, 0);
430c3377ee9SJohn Levon atomic_dec_64(&smt->cs_intr_depth);
431455e370cSJohn Levon
432455e370cSJohn Levon intr_restore(flags);
433455e370cSJohn Levon }
434455e370cSJohn Levon
435455e370cSJohn Levon static inline boolean_t
smt_need_kick(cpu_smt_t * smt,zoneid_t zoneid)436c3377ee9SJohn Levon smt_need_kick(cpu_smt_t *smt, zoneid_t zoneid)
437455e370cSJohn Levon {
438455e370cSJohn Levon membar_consumer();
439455e370cSJohn Levon
440c3377ee9SJohn Levon if (CS_MARK(smt->cs_sibstate) != CM_POISONED)
441455e370cSJohn Levon return (B_FALSE);
442455e370cSJohn Levon
443c3377ee9SJohn Levon if (CS_MARK(smt->cs_state) == CM_UNSAFE)
444455e370cSJohn Levon return (B_TRUE);
445455e370cSJohn Levon
446c3377ee9SJohn Levon return (CS_ZONE(smt->cs_sibstate) != zoneid);
447455e370cSJohn Levon }
448455e370cSJohn Levon
449455e370cSJohn Levon void
smt_mark(void)450c3377ee9SJohn Levon smt_mark(void)
451455e370cSJohn Levon {
452455e370cSJohn Levon zoneid_t zoneid = getzoneid();
453455e370cSJohn Levon kthread_t *t = curthread;
454455e370cSJohn Levon ulong_t flags;
455c3377ee9SJohn Levon cpu_smt_t *smt;
456455e370cSJohn Levon cpu_t *cp;
457455e370cSJohn Levon
458455e370cSJohn Levon flags = intr_clear();
459455e370cSJohn Levon
460455e370cSJohn Levon cp = CPU;
461c3377ee9SJohn Levon smt = &cp->cpu_m.mcpu_smt;
462455e370cSJohn Levon
463c3377ee9SJohn Levon if (smt->cs_sib == NULL) {
464455e370cSJohn Levon intr_restore(flags);
465455e370cSJohn Levon return;
466455e370cSJohn Levon }
467455e370cSJohn Levon
468c3377ee9SJohn Levon lock_set(&smt->cs_lock);
469455e370cSJohn Levon
470455e370cSJohn Levon /*
471455e370cSJohn Levon * If we were a nested interrupt and went through the resume_from_intr()
472455e370cSJohn Levon * path, we can now be resuming to a pinning interrupt thread; in which
473455e370cSJohn Levon * case, skip marking, until we later resume to a "real" thread.
474455e370cSJohn Levon */
475c3377ee9SJohn Levon if (smt->cs_intr_depth > 0) {
476455e370cSJohn Levon ASSERT3P(t->t_intr, !=, NULL);
477455e370cSJohn Levon
478c3377ee9SJohn Levon if (smt_need_kick(smt, zoneid))
479c3377ee9SJohn Levon smt_kick(smt, zoneid);
480455e370cSJohn Levon goto out;
481455e370cSJohn Levon }
482455e370cSJohn Levon
483455e370cSJohn Levon if (t == t->t_cpu->cpu_idle_thread) {
484455e370cSJohn Levon ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
485c3377ee9SJohn Levon smt->cs_state = CS_MK(CM_IDLE, zoneid);
486455e370cSJohn Levon } else {
487455e370cSJohn Levon uint64_t state = CM_THREAD;
488455e370cSJohn Levon
489455e370cSJohn Levon if (t->t_unsafe)
490455e370cSJohn Levon state = CM_UNSAFE;
491455e370cSJohn Levon else if (t->t_schedflag & TS_VCPU)
492455e370cSJohn Levon state = CM_VCPU;
493455e370cSJohn Levon
494c3377ee9SJohn Levon smt->cs_state = CS_MK(state, zoneid);
495455e370cSJohn Levon
496c3377ee9SJohn Levon if (smt_need_kick(smt, zoneid))
497c3377ee9SJohn Levon smt_kick(smt, zoneid);
498455e370cSJohn Levon }
499455e370cSJohn Levon
500455e370cSJohn Levon out:
501455e370cSJohn Levon membar_producer();
502c3377ee9SJohn Levon lock_clear(&smt->cs_lock);
503455e370cSJohn Levon intr_restore(flags);
504455e370cSJohn Levon }
505455e370cSJohn Levon
506455e370cSJohn Levon void
smt_begin_unsafe(void)507c3377ee9SJohn Levon smt_begin_unsafe(void)
508455e370cSJohn Levon {
509455e370cSJohn Levon curthread->t_unsafe++;
510c3377ee9SJohn Levon smt_mark();
511455e370cSJohn Levon }
512455e370cSJohn Levon
513455e370cSJohn Levon void
smt_end_unsafe(void)514c3377ee9SJohn Levon smt_end_unsafe(void)
515455e370cSJohn Levon {
516455e370cSJohn Levon ASSERT3U(curthread->t_unsafe, >, 0);
517455e370cSJohn Levon curthread->t_unsafe--;
518c3377ee9SJohn Levon smt_mark();
519455e370cSJohn Levon }
520455e370cSJohn Levon
521455e370cSJohn Levon void
smt_mark_as_vcpu(void)522c3377ee9SJohn Levon smt_mark_as_vcpu(void)
523455e370cSJohn Levon {
524455e370cSJohn Levon thread_lock(curthread);
525455e370cSJohn Levon curthread->t_schedflag |= TS_VCPU;
526c3377ee9SJohn Levon smt_mark();
527455e370cSJohn Levon thread_unlock(curthread);
528455e370cSJohn Levon }
529455e370cSJohn Levon
530455e370cSJohn Levon boolean_t
smt_should_run(kthread_t * t,cpu_t * cp)531c3377ee9SJohn Levon smt_should_run(kthread_t *t, cpu_t *cp)
532455e370cSJohn Levon {
533455e370cSJohn Levon uint64_t sibstate;
534455e370cSJohn Levon cpu_t *sib;
535455e370cSJohn Levon
536455e370cSJohn Levon if (t == t->t_cpu->cpu_idle_thread)
537455e370cSJohn Levon return (B_TRUE);
538455e370cSJohn Levon
539c3377ee9SJohn Levon if ((sib = cp->cpu_m.mcpu_smt.cs_sib) == NULL)
540455e370cSJohn Levon return (B_TRUE);
541455e370cSJohn Levon
542c3377ee9SJohn Levon sibstate = sib->cpu_m.mcpu_smt.cs_state;
543455e370cSJohn Levon
544455e370cSJohn Levon if ((t->t_schedflag & TS_VCPU)) {
545455e370cSJohn Levon if (CS_MARK(sibstate) == CM_IDLE)
546455e370cSJohn Levon return (B_TRUE);
547455e370cSJohn Levon if (CS_MARK(sibstate) == CM_UNSAFE)
548455e370cSJohn Levon return (B_FALSE);
549455e370cSJohn Levon return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
550455e370cSJohn Levon }
551455e370cSJohn Levon
552455e370cSJohn Levon if (CS_MARK(sibstate) < CM_VCPU)
553455e370cSJohn Levon return (B_TRUE);
554455e370cSJohn Levon
555455e370cSJohn Levon return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
556455e370cSJohn Levon }
557455e370cSJohn Levon
558455e370cSJohn Levon pri_t
smt_adjust_cpu_score(kthread_t * t,struct cpu * cp,pri_t score)559c3377ee9SJohn Levon smt_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
560455e370cSJohn Levon {
561c3377ee9SJohn Levon if (smt_should_run(t, cp))
562455e370cSJohn Levon return (score);
563455e370cSJohn Levon
564455e370cSJohn Levon /*
565455e370cSJohn Levon * If we're a VCPU thread scoring our current CPU, we are most likely
566c3377ee9SJohn Levon * asking to be rescheduled elsewhere after losing smt_acquire(). In
567455e370cSJohn Levon * this case, the current CPU is not a good choice, most likely, and we
568455e370cSJohn Levon * should go elsewhere.
569455e370cSJohn Levon */
570455e370cSJohn Levon if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
571455e370cSJohn Levon return ((v.v_maxsyspri + 1) * 2);
572455e370cSJohn Levon
573455e370cSJohn Levon return (score + 1);
574455e370cSJohn Levon }
575c3377ee9SJohn Levon
576c3377ee9SJohn Levon static void
set_smt_prop(void)577c3377ee9SJohn Levon set_smt_prop(void)
578c3377ee9SJohn Levon {
579c3377ee9SJohn Levon (void) e_ddi_prop_update_string(DDI_DEV_T_NONE, ddi_root_node(),
580c3377ee9SJohn Levon "smt_enabled", smt_enabled ? "true" : "false");
581c3377ee9SJohn Levon }
582c3377ee9SJohn Levon
583c3377ee9SJohn Levon static cpu_t *
smt_find_sibling(cpu_t * cp)584c3377ee9SJohn Levon smt_find_sibling(cpu_t *cp)
585c3377ee9SJohn Levon {
586c3377ee9SJohn Levon for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
587c3377ee9SJohn Levon pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
588c3377ee9SJohn Levon group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;
589c3377ee9SJohn Levon
590c3377ee9SJohn Levon if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
591c3377ee9SJohn Levon continue;
592c3377ee9SJohn Levon
593c3377ee9SJohn Levon if (GROUP_SIZE(cg) == 1)
594c3377ee9SJohn Levon break;
595c3377ee9SJohn Levon
596c3377ee9SJohn Levon if (GROUP_SIZE(cg) != 2) {
597c3377ee9SJohn Levon panic("%u SMT threads unsupported", GROUP_SIZE(cg));
598c3377ee9SJohn Levon }
599c3377ee9SJohn Levon
600c3377ee9SJohn Levon if (GROUP_ACCESS(cg, 0) != cp)
601c3377ee9SJohn Levon return (GROUP_ACCESS(cg, 0));
602c3377ee9SJohn Levon
603c3377ee9SJohn Levon VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);
604c3377ee9SJohn Levon
605c3377ee9SJohn Levon return (GROUP_ACCESS(cg, 1));
606c3377ee9SJohn Levon }
607c3377ee9SJohn Levon
608c3377ee9SJohn Levon return (NULL);
609c3377ee9SJohn Levon }
610c3377ee9SJohn Levon
611c3377ee9SJohn Levon /*
612c3377ee9SJohn Levon * Offline all siblings and mark as CPU_DISABLED. Note that any siblings that
613c3377ee9SJohn Levon * can't be offlined (if it would leave an empty partition, or it's a spare, or
614c3377ee9SJohn Levon * whatever) will fail the whole operation.
615c3377ee9SJohn Levon */
616c3377ee9SJohn Levon int
smt_disable(void)617c3377ee9SJohn Levon smt_disable(void)
618c3377ee9SJohn Levon {
619c3377ee9SJohn Levon int error = 0;
620c3377ee9SJohn Levon
621c3377ee9SJohn Levon ASSERT(MUTEX_HELD(&cpu_lock));
622c3377ee9SJohn Levon
623c3377ee9SJohn Levon if (secpolicy_ponline(CRED()) != 0)
624c3377ee9SJohn Levon return (EPERM);
625c3377ee9SJohn Levon
626c3377ee9SJohn Levon if (!smt_enabled)
627c3377ee9SJohn Levon return (0);
628c3377ee9SJohn Levon
629c3377ee9SJohn Levon for (size_t i = 0; i < NCPU; i++) {
630c3377ee9SJohn Levon cpu_t *sib;
631c3377ee9SJohn Levon cpu_t *cp;
632c3377ee9SJohn Levon
633c3377ee9SJohn Levon if ((cp = cpu_get(i)) == NULL)
634c3377ee9SJohn Levon continue;
635c3377ee9SJohn Levon
636c3377ee9SJohn Levon /* NB: we don't necessarily have .mcpu_smt to use here. */
637c3377ee9SJohn Levon if ((sib = smt_find_sibling(cp)) == NULL)
638c3377ee9SJohn Levon continue;
639c3377ee9SJohn Levon
640c3377ee9SJohn Levon if (cp->cpu_id < sib->cpu_id)
641c3377ee9SJohn Levon continue;
642c3377ee9SJohn Levon
643c3377ee9SJohn Levon if (cp->cpu_flags & CPU_DISABLED) {
644c3377ee9SJohn Levon VERIFY(cp->cpu_flags & CPU_OFFLINE);
645c3377ee9SJohn Levon continue;
646c3377ee9SJohn Levon }
647c3377ee9SJohn Levon
648c3377ee9SJohn Levon if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) {
649c3377ee9SJohn Levon error = EINVAL;
650c3377ee9SJohn Levon break;
651c3377ee9SJohn Levon }
652c3377ee9SJohn Levon
653c3377ee9SJohn Levon if ((cp->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY) {
654c3377ee9SJohn Levon cp->cpu_flags |= CPU_DISABLED;
655c3377ee9SJohn Levon continue;
656c3377ee9SJohn Levon }
657c3377ee9SJohn Levon
658c3377ee9SJohn Levon if ((error = cpu_offline(cp, CPU_FORCED)) != 0)
659c3377ee9SJohn Levon break;
660c3377ee9SJohn Levon
661c3377ee9SJohn Levon cp->cpu_flags |= CPU_DISABLED;
662c3377ee9SJohn Levon cpu_set_state(cp);
663c3377ee9SJohn Levon }
664c3377ee9SJohn Levon
665c3377ee9SJohn Levon if (error != 0)
666c3377ee9SJohn Levon return (error);
667c3377ee9SJohn Levon
668c3377ee9SJohn Levon smt_enabled = 0;
669c3377ee9SJohn Levon set_smt_prop();
670c3377ee9SJohn Levon cmn_err(CE_NOTE, "!SMT / hyper-threading explicitly disabled.");
671c3377ee9SJohn Levon return (0);
672c3377ee9SJohn Levon }
673c3377ee9SJohn Levon
674c3377ee9SJohn Levon boolean_t
smt_can_enable(cpu_t * cp,int flags)675c3377ee9SJohn Levon smt_can_enable(cpu_t *cp, int flags)
676c3377ee9SJohn Levon {
677c3377ee9SJohn Levon VERIFY(cp->cpu_flags & CPU_DISABLED);
678c3377ee9SJohn Levon
679c3377ee9SJohn Levon return (!smt_boot_disable && (flags & CPU_FORCED));
680c3377ee9SJohn Levon }
681c3377ee9SJohn Levon
682c3377ee9SJohn Levon /*
683c3377ee9SJohn Levon * If we force-onlined a CPU_DISABLED CPU, then we can no longer consider the
684c3377ee9SJohn Levon * system to be SMT-disabled in toto.
685c3377ee9SJohn Levon */
686c3377ee9SJohn Levon void
smt_force_enabled(void)687c3377ee9SJohn Levon smt_force_enabled(void)
688c3377ee9SJohn Levon {
689c3377ee9SJohn Levon VERIFY(!smt_boot_disable);
690c3377ee9SJohn Levon
691c3377ee9SJohn Levon if (!smt_enabled)
692c3377ee9SJohn Levon cmn_err(CE_NOTE, "!Disabled SMT sibling forced on-line.");
693c3377ee9SJohn Levon
694c3377ee9SJohn Levon smt_enabled = 1;
695c3377ee9SJohn Levon set_smt_prop();
696c3377ee9SJohn Levon }
697c3377ee9SJohn Levon
698c3377ee9SJohn Levon /*
699c3377ee9SJohn Levon * Initialize SMT links. We have to be careful here not to race with
700c3377ee9SJohn Levon * smt_begin/end_intr(), which also complicates trying to do this initialization
701c3377ee9SJohn Levon * from a cross-call; hence the slightly odd approach below.
702c3377ee9SJohn Levon *
703c3377ee9SJohn Levon * If we're going to disable SMT via smt_late_init(), we will avoid paying the
704c3377ee9SJohn Levon * price here at all (we can't do it here since we're still too early in
705c3377ee9SJohn Levon * main()).
706c3377ee9SJohn Levon */
707c3377ee9SJohn Levon void
smt_init(void)708c3377ee9SJohn Levon smt_init(void)
709c3377ee9SJohn Levon {
710c3377ee9SJohn Levon boolean_t found_sibling = B_FALSE;
711c3377ee9SJohn Levon cpu_t *scp = CPU;
712c3377ee9SJohn Levon cpu_t *cp = scp;
713c3377ee9SJohn Levon ulong_t flags;
714c3377ee9SJohn Levon
715c3377ee9SJohn Levon if (!smt_exclusion || smt_boot_disable)
716c3377ee9SJohn Levon return;
717c3377ee9SJohn Levon
718c3377ee9SJohn Levon mutex_enter(&cpu_lock);
719c3377ee9SJohn Levon
720c3377ee9SJohn Levon do {
721c3377ee9SJohn Levon thread_affinity_set(curthread, cp->cpu_id);
722c3377ee9SJohn Levon flags = intr_clear();
723c3377ee9SJohn Levon
724c3377ee9SJohn Levon cp->cpu_m.mcpu_smt.cs_intr_depth = 0;
725c3377ee9SJohn Levon cp->cpu_m.mcpu_smt.cs_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
726c3377ee9SJohn Levon cp->cpu_m.mcpu_smt.cs_sibstate = CS_MK(CM_THREAD,
727c3377ee9SJohn Levon GLOBAL_ZONEID);
728c3377ee9SJohn Levon ASSERT3P(cp->cpu_m.mcpu_smt.cs_sib, ==, NULL);
729c3377ee9SJohn Levon cp->cpu_m.mcpu_smt.cs_sib = smt_find_sibling(cp);
730c3377ee9SJohn Levon
731c3377ee9SJohn Levon if (cp->cpu_m.mcpu_smt.cs_sib != NULL)
732c3377ee9SJohn Levon found_sibling = B_TRUE;
733c3377ee9SJohn Levon
734c3377ee9SJohn Levon intr_restore(flags);
735c3377ee9SJohn Levon thread_affinity_clear(curthread);
736c3377ee9SJohn Levon } while ((cp = cp->cpu_next_onln) != scp);
737c3377ee9SJohn Levon
738c3377ee9SJohn Levon mutex_exit(&cpu_lock);
739c3377ee9SJohn Levon
740c3377ee9SJohn Levon if (!found_sibling)
741c3377ee9SJohn Levon smt_enabled = 0;
742c3377ee9SJohn Levon }
743c3377ee9SJohn Levon
744c3377ee9SJohn Levon void
smt_late_init(void)745c3377ee9SJohn Levon smt_late_init(void)
746c3377ee9SJohn Levon {
747c3377ee9SJohn Levon if (smt_boot_disable) {
748c3377ee9SJohn Levon int err;
749c3377ee9SJohn Levon
750c3377ee9SJohn Levon mutex_enter(&cpu_lock);
751c3377ee9SJohn Levon
752c3377ee9SJohn Levon err = smt_disable();
753c3377ee9SJohn Levon
754c3377ee9SJohn Levon /*
755c3377ee9SJohn Levon * We're early enough in boot that nothing should have stopped
756c3377ee9SJohn Levon * us from offlining the siblings. As we didn't prepare our
757c3377ee9SJohn Levon * L1TF mitigation in this case, we need to panic.
758c3377ee9SJohn Levon */
759c3377ee9SJohn Levon if (err) {
760c3377ee9SJohn Levon cmn_err(CE_PANIC, "smt_disable() failed with %d", err);
761c3377ee9SJohn Levon }
762c3377ee9SJohn Levon
763c3377ee9SJohn Levon mutex_exit(&cpu_lock);
764c3377ee9SJohn Levon }
765c3377ee9SJohn Levon
766c3377ee9SJohn Levon if (smt_enabled)
767c3377ee9SJohn Levon cmn_err(CE_NOTE, "!SMT enabled\n");
768c3377ee9SJohn Levon
769c3377ee9SJohn Levon set_smt_prop();
770c3377ee9SJohn Levon }
771