xref: /illumos-gate/usr/src/uts/intel/os/smt.c (revision 7c8c0b82)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * SMT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
18  * non-root guest mode, when certain threads are running on the other sibling.
19  * This avoids speculation-based information leaks such as L1TF being available
20  * to the untrusted guest.  The stance we take is that threads from the same
21  * zone as the guest VCPU thread are considered safe to run alongside, but all
22  * other threads (except the idle thread), and all interrupts, are unsafe.  Note
23  * that due to the implementation here, there are significant sections of e.g.
24  * the dispatcher code that can run concurrently with a guest, until the thread
25  * reaches smt_mark().  This code assumes there are only two SMT threads per
26  * core.
27  *
28  * The entry points are as follows:
29  *
30  * smt_mark_as_vcpu()
31  *
32  * All threads that enter guest mode (i.e. VCPU threads) need to call this at
33  * least once, which sets TS_VCPU in ->t_schedflag.
34  *
35  * smt_mark()
36  *
37  * A new ->cpu_thread is now curthread (although interrupt threads have their
38  * own separate handling).  After preventing any interrupts, we will take our
39  * own CPU's spinlock and update our own state in mcpu_smt.
40  *
41  * If our sibling is poisoned (i.e. in guest mode or the little bit of code
42  * around it), and we're not compatible with it (compatible meaning the same
43  * zone ID, or the idle thread), then we need to smt_kick() that sibling.
44  * smt_kick() itself waits for the sibling to call smt_release(), and the
45  * sibling will not re-enter guest mode until allowed.
46  *
47  * Note that we ignore the fact a process can change its zone ID: poisoning
48  * threads never do so, and we can ignore the other cases.
49  *
50  * smt_acquire()
51  *
52  * We are a VCPU thread about to start guest execution.  Interrupts are
53  * disabled.  We must have already run smt_mark() to be in this code, so there's
54  * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
55  * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
56  * sibling cpu_smt_t.  This is so smt_mark() will only ever need to look at its
57  * local mcpu_smt.
58  *
59  * We'll loop here for up to smt_acquire_wait_time microseconds; this is mainly
60  * to wait out any sibling interrupt: many of them will complete quicker than
61  * this.
62  *
63  * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
64  * mitigation against L1TF: no incompatible thread will now be able to populate
65  * the L1 cache until *we* smt_release().
66  *
67  * smt_release()
68  *
69  * Simply unpoison ourselves similarly to smt_acquire(); smt_kick() will wait
70  * for this to happen if needed.
71  *
72  * smt_begin_intr()
73  *
74  * In an interrupt prolog.  We're either a hilevel interrupt, or a pinning
75  * interrupt.  In both cases, we mark our interrupt depth, and potentially
76  * smt_kick().  This enforces exclusion, but doesn't otherwise modify
77  * ->cs_state: we want the dispatcher code to essentially ignore interrupts.
78  *
79  * smt_end_intr()
80  *
81  * In an interrupt epilogue *or* thread_unpin().  In the first case, we never
82  * slept, and we can simply decrement our counter.  In the second case, we're an
83  * interrupt thread about to sleep: we'll still just decrement our counter, and
84  * henceforth treat the thread as a normal thread when it next gets scheduled,
85  * until it finally gets to its epilogue.
86  *
87  * smt_begin_unsafe() / smt_end_unsafe()
88  *
89  * Mark the current thread as temporarily unsafe (guests should not be executing
90  * while a sibling is marked unsafe).  This can be used for a thread that's
91  * otherwise considered safe, if it needs to handle potentially sensitive data.
92  * Right now, this means certain I/O handling operations that reach down into
93  * the networking and ZFS sub-systems.
94  *
95  * smt_should_run(thread, cpu)
96  *
97  * This is used by the dispatcher when making scheduling decisions: if the
98  * sibling is compatible with the given thread, we return B_TRUE. This is
99  * essentially trying to guess if any subsequent smt_acquire() will fail, by
100  * peeking at the sibling CPU's state.  The peek is racy, but if we get things
101  * wrong, the "only" consequence is that smt_acquire() may lose.
102  *
103  * smt_adjust_cpu_score()
104  *
105  * Used when scoring other CPUs in disp_lowpri_cpu().  If we shouldn't run here,
106  * we'll add a small penalty to the score.  This also makes sure a VCPU thread
107  * migration behaves properly.
108  *
109  * smt_init() / smt_late_init()
110  *
111  * Set up SMT handling. If smt_boot_disable is set, smt_late_init(), which runs
112  * late enough to be able to do so, will offline and mark CPU_DISABLED all the
113  * siblings. smt_disable() can also be called after boot via psradm -Ha.
114  */
115 
116 #include <sys/archsystm.h>
117 #include <sys/disp.h>
118 #include <sys/cmt.h>
119 #include <sys/systm.h>
120 #include <sys/cpu.h>
121 #include <sys/var.h>
122 #include <sys/xc_levels.h>
123 #include <sys/cmn_err.h>
124 #include <sys/sysmacros.h>
125 #include <sys/x86_archext.h>
126 #include <sys/esunddi.h>
127 #include <sys/promif.h>
128 #include <sys/policy.h>
129 #include <sys/smt.h>
130 
/*
 * Encoding of the cs_state / cs_sibstate words: the low CS_SHIFT bits hold the
 * mark (a cs_mark_t value), and the remaining upper bits hold the zone ID.
 *
 * CS_MARK(s)	extract the mark from an encoded state word
 * CS_ZONE(s)	extract the zone ID from an encoded state word
 * CS_MK(s, z)	build an encoded state word from mark 's' and zone ID 'z'
 *
 * Every macro argument is parenthesized so that an arbitrary expression can be
 * passed without being re-associated by operator precedence.
 */
#define	CS_SHIFT (8)
#define	CS_MASK ((1 << CS_SHIFT) - 1)
#define	CS_MARK(s) ((s) & CS_MASK)
#define	CS_ZONE(s) ((s) >> CS_SHIFT)
#define	CS_MK(s, z) ((s) | ((z) << CS_SHIFT))
136 
/*
 * What a CPU is currently running, as recorded in the mark field of its state
 * word.  The numeric ordering is relied upon by range comparisons elsewhere in
 * this file (e.g. "CS_MARK(state) < CM_VCPU").
 */
typedef enum cs_mark {
	CM_IDLE		= 0,	/* the CPU's idle thread */
	CM_THREAD	= 1,	/* an ordinary, non-VCPU thread */
	CM_UNSAFE	= 2,	/* a thread with ->t_unsafe set */
	CM_VCPU		= 3,	/* a VCPU thread, not currently in guest */
	CM_POISONED	= 4	/* a VCPU thread running in guest */
} cs_mark_t;
144 
145 /* Double-check our false-sharing padding. */
146 CTASSERT(offsetof(cpu_smt_t, cs_sib) == 64);
147 CTASSERT(CM_IDLE == 0);
148 CTASSERT(CM_POISONED < (1 << CS_SHIFT));
149 CTASSERT(CM_POISONED > CM_VCPU);
150 CTASSERT(CM_VCPU > CM_UNSAFE);
151 
152 static uint_t empty_pil = XC_CPUPOKE_PIL;
153 
154 /*
155  * If disabled, no SMT exclusion is performed, and system is potentially
156  * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not
157  * vulnerable" CPUID bit.
158  */
159 int smt_exclusion = 1;
160 
161 /*
162  * How long smt_acquire() will spin trying to acquire the core, in
163  * micro-seconds.  This is enough time to wait out a significant proportion of
164  * interrupts.
165  */
166 clock_t smt_acquire_wait_time = 64;
167 
168 /*
169  * Did we request a disable of SMT at boot time?
170  */
171 int smt_boot_disable;
172 
173 /*
174  * Whether SMT is enabled.
175  */
176 int smt_enabled = 1;
177 
178 /*
179  * We're adding an interrupt handler of some kind at the given PIL.  If this
180  * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our
181  * pil_needs_kick() optimization, as there is now potentially an unsafe
182  * interrupt handler at that PIL.  This typically won't occur, so we're not that
183  * careful about what's actually getting added, which CPU it's on, or if it gets
184  * removed.  This also presumes that softints can't cover our empty_pil.
185  */
186 void
smt_intr_alloc_pil(uint_t pil)187 smt_intr_alloc_pil(uint_t pil)
188 {
189 	ASSERT(pil <= PIL_MAX);
190 
191 	if (empty_pil == pil)
192 		empty_pil = PIL_MAX + 1;
193 }
194 
195 /*
196  * If our sibling is also a VCPU thread from a different zone, we need one of
197  * them to give up, otherwise they will just battle each other for exclusion
198  * until they exhaust their quantum.
199  *
200  * We arbitrate between them by dispatch priority: clearly, a higher-priority
201  * thread deserves to win the acquisition.  However, under CPU load, it'll be
202  * very common to see both threads with ->t_pri == 1.  If so, we'll break the
203  * tie by cpu_id (which is hopefully arbitrary enough).
204  *
205  * If we lose, the VMM code will take this as a hint to call
206  * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
207  * somewhere else.
208  *
209  * Note that all of this state examination is racy, as we don't own any locks
210  * here.
211  */
212 static boolean_t
yield_to_vcpu(cpu_t * sib,zoneid_t zoneid)213 yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
214 {
215 	cpu_smt_t *sibsmt = &sib->cpu_m.mcpu_smt;
216 	uint64_t sibstate = sibsmt->cs_state;
217 
218 	/*
219 	 * If we're likely just waiting for an interrupt, don't yield.
220 	 */
221 	if (sibsmt->cs_intr_depth != 0)
222 		return (B_FALSE);
223 
224 	/*
225 	 * We're only interested in VCPUs from a different zone.
226 	 */
227 	if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
228 		return (B_FALSE);
229 
230 	if (curthread->t_pri < sib->cpu_dispatch_pri)
231 		return (B_TRUE);
232 
233 	if (curthread->t_pri == sib->cpu_dispatch_pri &&
234 	    CPU->cpu_id < sib->cpu_id)
235 		return (B_TRUE);
236 
237 	return (B_FALSE);
238 }
239 
240 static inline boolean_t
sibling_compatible(cpu_smt_t * sibsmt,zoneid_t zoneid)241 sibling_compatible(cpu_smt_t *sibsmt, zoneid_t zoneid)
242 {
243 	uint64_t sibstate = sibsmt->cs_state;
244 
245 	if (sibsmt->cs_intr_depth != 0)
246 		return (B_FALSE);
247 
248 	if (CS_MARK(sibstate) == CM_UNSAFE)
249 		return (B_FALSE);
250 
251 	if (CS_MARK(sibstate) == CM_IDLE)
252 		return (B_TRUE);
253 
254 	return (CS_ZONE(sibstate) == zoneid);
255 }
256 
257 int
smt_acquire(void)258 smt_acquire(void)
259 {
260 	clock_t wait = smt_acquire_wait_time;
261 	cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt;
262 	zoneid_t zoneid = getzoneid();
263 	cpu_smt_t *sibsmt;
264 	int ret = 0;
265 
266 	ASSERT(!interrupts_enabled());
267 
268 	if (smt->cs_sib == NULL) {
269 		/* For the "sequential" L1TF case. */
270 		spec_uarch_flush();
271 		return (1);
272 	}
273 
274 	sibsmt = &smt->cs_sib->cpu_m.mcpu_smt;
275 
276 	/* A VCPU thread should never change zone. */
277 	ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid);
278 	ASSERT3U(CS_MARK(smt->cs_state), ==, CM_VCPU);
279 	ASSERT3U(curthread->t_preempt, >=, 1);
280 	ASSERT(curthread->t_schedflag & TS_VCPU);
281 
282 	while (ret == 0 && wait > 0) {
283 
284 		if (yield_to_vcpu(smt->cs_sib, zoneid)) {
285 			ret = -1;
286 			break;
287 		}
288 
289 		if (sibling_compatible(sibsmt, zoneid)) {
290 			lock_set(&sibsmt->cs_lock);
291 
292 			if (sibling_compatible(sibsmt, zoneid)) {
293 				smt->cs_state = CS_MK(CM_POISONED, zoneid);
294 				sibsmt->cs_sibstate = CS_MK(CM_POISONED,
295 				    zoneid);
296 				membar_enter();
297 				ret = 1;
298 			}
299 
300 			lock_clear(&sibsmt->cs_lock);
301 		} else {
302 			drv_usecwait(10);
303 			wait -= 10;
304 		}
305 	}
306 
307 	DTRACE_PROBE4(smt__acquire, int, ret, uint64_t, sibsmt->cs_state,
308 	    uint64_t, sibsmt->cs_intr_depth, clock_t, wait);
309 
310 	if (ret == 1)
311 		spec_uarch_flush();
312 
313 	return (ret);
314 }
315 
316 void
smt_release(void)317 smt_release(void)
318 {
319 	cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt;
320 	zoneid_t zoneid = getzoneid();
321 	cpu_smt_t *sibsmt;
322 
323 	ASSERT(!interrupts_enabled());
324 
325 	if (smt->cs_sib == NULL)
326 		return;
327 
328 	ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid);
329 	ASSERT3U(CS_MARK(smt->cs_state), ==, CM_POISONED);
330 	ASSERT3U(curthread->t_preempt, >=, 1);
331 
332 	sibsmt = &smt->cs_sib->cpu_m.mcpu_smt;
333 
334 	lock_set(&sibsmt->cs_lock);
335 
336 	smt->cs_state = CS_MK(CM_VCPU, zoneid);
337 	sibsmt->cs_sibstate = CS_MK(CM_VCPU, zoneid);
338 	membar_producer();
339 
340 	lock_clear(&sibsmt->cs_lock);
341 }
342 
343 static void
smt_kick(cpu_smt_t * smt,zoneid_t zoneid)344 smt_kick(cpu_smt_t *smt, zoneid_t zoneid)
345 {
346 	uint64_t sibstate;
347 
348 	ASSERT(LOCK_HELD(&smt->cs_lock));
349 	ASSERT(!interrupts_enabled());
350 
351 	poke_cpu(smt->cs_sib->cpu_id);
352 
353 	membar_consumer();
354 	sibstate = smt->cs_sibstate;
355 
356 	if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid)
357 		return;
358 
359 	lock_clear(&smt->cs_lock);
360 
361 	/*
362 	 * Spin until we can see the sibling has been kicked out or is otherwise
363 	 * OK.
364 	 */
365 	for (;;) {
366 		membar_consumer();
367 		sibstate = smt->cs_sibstate;
368 
369 		if (CS_MARK(sibstate) != CM_POISONED ||
370 		    CS_ZONE(sibstate) == zoneid)
371 			break;
372 
373 		SMT_PAUSE();
374 	}
375 
376 	lock_set(&smt->cs_lock);
377 }
378 
379 static boolean_t
pil_needs_kick(uint_t pil)380 pil_needs_kick(uint_t pil)
381 {
382 	return (pil != empty_pil);
383 }
384 
385 void
smt_begin_intr(uint_t pil)386 smt_begin_intr(uint_t pil)
387 {
388 	ulong_t flags;
389 	cpu_smt_t *smt;
390 
391 	ASSERT(pil <= PIL_MAX);
392 
393 	flags = intr_clear();
394 	smt = &CPU->cpu_m.mcpu_smt;
395 
396 	if (smt->cs_sib == NULL) {
397 		intr_restore(flags);
398 		return;
399 	}
400 
401 	if (atomic_inc_64_nv(&smt->cs_intr_depth) == 1 && pil_needs_kick(pil)) {
402 		lock_set(&smt->cs_lock);
403 
404 		membar_consumer();
405 
406 		if (CS_MARK(smt->cs_sibstate) == CM_POISONED)
407 			smt_kick(smt, GLOBAL_ZONEID);
408 
409 		lock_clear(&smt->cs_lock);
410 	}
411 
412 	intr_restore(flags);
413 }
414 
415 void
smt_end_intr(void)416 smt_end_intr(void)
417 {
418 	ulong_t flags;
419 	cpu_smt_t *smt;
420 
421 	flags = intr_clear();
422 	smt = &CPU->cpu_m.mcpu_smt;
423 
424 	if (smt->cs_sib == NULL) {
425 		intr_restore(flags);
426 		return;
427 	}
428 
429 	ASSERT3U(smt->cs_intr_depth, >, 0);
430 	atomic_dec_64(&smt->cs_intr_depth);
431 
432 	intr_restore(flags);
433 }
434 
435 static inline boolean_t
smt_need_kick(cpu_smt_t * smt,zoneid_t zoneid)436 smt_need_kick(cpu_smt_t *smt, zoneid_t zoneid)
437 {
438 	membar_consumer();
439 
440 	if (CS_MARK(smt->cs_sibstate) != CM_POISONED)
441 		return (B_FALSE);
442 
443 	if (CS_MARK(smt->cs_state) == CM_UNSAFE)
444 		return (B_TRUE);
445 
446 	return (CS_ZONE(smt->cs_sibstate) != zoneid);
447 }
448 
449 void
smt_mark(void)450 smt_mark(void)
451 {
452 	zoneid_t zoneid = getzoneid();
453 	kthread_t *t = curthread;
454 	ulong_t flags;
455 	cpu_smt_t *smt;
456 	cpu_t *cp;
457 
458 	flags = intr_clear();
459 
460 	cp = CPU;
461 	smt = &cp->cpu_m.mcpu_smt;
462 
463 	if (smt->cs_sib == NULL) {
464 		intr_restore(flags);
465 		return;
466 	}
467 
468 	lock_set(&smt->cs_lock);
469 
470 	/*
471 	 * If we were a nested interrupt and went through the resume_from_intr()
472 	 * path, we can now be resuming to a pinning interrupt thread; in which
473 	 * case, skip marking, until we later resume to a "real" thread.
474 	 */
475 	if (smt->cs_intr_depth > 0) {
476 		ASSERT3P(t->t_intr, !=, NULL);
477 
478 		if (smt_need_kick(smt, zoneid))
479 			smt_kick(smt, zoneid);
480 		goto out;
481 	}
482 
483 	if (t == t->t_cpu->cpu_idle_thread) {
484 		ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
485 		smt->cs_state = CS_MK(CM_IDLE, zoneid);
486 	} else {
487 		uint64_t state = CM_THREAD;
488 
489 		if (t->t_unsafe)
490 			state = CM_UNSAFE;
491 		else if (t->t_schedflag & TS_VCPU)
492 			state = CM_VCPU;
493 
494 		smt->cs_state = CS_MK(state, zoneid);
495 
496 		if (smt_need_kick(smt, zoneid))
497 			smt_kick(smt, zoneid);
498 	}
499 
500 out:
501 	membar_producer();
502 	lock_clear(&smt->cs_lock);
503 	intr_restore(flags);
504 }
505 
506 void
smt_begin_unsafe(void)507 smt_begin_unsafe(void)
508 {
509 	curthread->t_unsafe++;
510 	smt_mark();
511 }
512 
513 void
smt_end_unsafe(void)514 smt_end_unsafe(void)
515 {
516 	ASSERT3U(curthread->t_unsafe, >, 0);
517 	curthread->t_unsafe--;
518 	smt_mark();
519 }
520 
521 void
smt_mark_as_vcpu(void)522 smt_mark_as_vcpu(void)
523 {
524 	thread_lock(curthread);
525 	curthread->t_schedflag |= TS_VCPU;
526 	smt_mark();
527 	thread_unlock(curthread);
528 }
529 
530 boolean_t
smt_should_run(kthread_t * t,cpu_t * cp)531 smt_should_run(kthread_t *t, cpu_t *cp)
532 {
533 	uint64_t sibstate;
534 	cpu_t *sib;
535 
536 	if (t == t->t_cpu->cpu_idle_thread)
537 		return (B_TRUE);
538 
539 	if ((sib = cp->cpu_m.mcpu_smt.cs_sib) == NULL)
540 		return (B_TRUE);
541 
542 	sibstate = sib->cpu_m.mcpu_smt.cs_state;
543 
544 	if ((t->t_schedflag & TS_VCPU)) {
545 		if (CS_MARK(sibstate) == CM_IDLE)
546 			return (B_TRUE);
547 		if (CS_MARK(sibstate) == CM_UNSAFE)
548 			return (B_FALSE);
549 		return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
550 	}
551 
552 	if (CS_MARK(sibstate) < CM_VCPU)
553 		return (B_TRUE);
554 
555 	return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
556 }
557 
558 pri_t
smt_adjust_cpu_score(kthread_t * t,struct cpu * cp,pri_t score)559 smt_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
560 {
561 	if (smt_should_run(t, cp))
562 		return (score);
563 
564 	/*
565 	 * If we're a VCPU thread scoring our current CPU, we are most likely
566 	 * asking to be rescheduled elsewhere after losing smt_acquire().  In
567 	 * this case, the current CPU is not a good choice, most likely, and we
568 	 * should go elsewhere.
569 	 */
570 	if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
571 		return ((v.v_maxsyspri + 1) * 2);
572 
573 	return (score + 1);
574 }
575 
576 static void
set_smt_prop(void)577 set_smt_prop(void)
578 {
579 	(void) e_ddi_prop_update_string(DDI_DEV_T_NONE, ddi_root_node(),
580 	    "smt_enabled", smt_enabled ? "true" : "false");
581 }
582 
583 static cpu_t *
smt_find_sibling(cpu_t * cp)584 smt_find_sibling(cpu_t *cp)
585 {
586 	for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
587 		pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
588 		group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;
589 
590 		if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
591 			continue;
592 
593 		if (GROUP_SIZE(cg) == 1)
594 			break;
595 
596 		if (GROUP_SIZE(cg) != 2) {
597 			panic("%u SMT threads unsupported", GROUP_SIZE(cg));
598 		}
599 
600 		if (GROUP_ACCESS(cg, 0) != cp)
601 			return (GROUP_ACCESS(cg, 0));
602 
603 		VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);
604 
605 		return (GROUP_ACCESS(cg, 1));
606 	}
607 
608 	return (NULL);
609 }
610 
611 /*
612  * Offline all siblings and mark as CPU_DISABLED. Note that any siblings that
613  * can't be offlined (if it would leave an empty partition, or it's a spare, or
614  * whatever) will fail the whole operation.
615  */
616 int
smt_disable(void)617 smt_disable(void)
618 {
619 	int error = 0;
620 
621 	ASSERT(MUTEX_HELD(&cpu_lock));
622 
623 	if (secpolicy_ponline(CRED()) != 0)
624 		return (EPERM);
625 
626 	if (!smt_enabled)
627 		return (0);
628 
629 	for (size_t i = 0; i < NCPU; i++) {
630 		cpu_t *sib;
631 		cpu_t *cp;
632 
633 		if ((cp = cpu_get(i)) == NULL)
634 			continue;
635 
636 		/* NB: we don't necessarily have .mcpu_smt to use here. */
637 		if ((sib = smt_find_sibling(cp)) == NULL)
638 			continue;
639 
640 		if (cp->cpu_id < sib->cpu_id)
641 			continue;
642 
643 		if (cp->cpu_flags & CPU_DISABLED) {
644 			VERIFY(cp->cpu_flags & CPU_OFFLINE);
645 			continue;
646 		}
647 
648 		if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) {
649 			error = EINVAL;
650 			break;
651 		}
652 
653 		if ((cp->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY) {
654 			cp->cpu_flags |= CPU_DISABLED;
655 			continue;
656 		}
657 
658 		if ((error = cpu_offline(cp, CPU_FORCED)) != 0)
659 			break;
660 
661 		cp->cpu_flags |= CPU_DISABLED;
662 		cpu_set_state(cp);
663 	}
664 
665 	if (error != 0)
666 		return (error);
667 
668 	smt_enabled = 0;
669 	set_smt_prop();
670 	cmn_err(CE_NOTE, "!SMT / hyper-threading explicitly disabled.");
671 	return (0);
672 }
673 
674 boolean_t
smt_can_enable(cpu_t * cp,int flags)675 smt_can_enable(cpu_t *cp, int flags)
676 {
677 	VERIFY(cp->cpu_flags & CPU_DISABLED);
678 
679 	return (!smt_boot_disable && (flags & CPU_FORCED));
680 }
681 
682 /*
683  * If we force-onlined a CPU_DISABLED CPU, then we can no longer consider the
684  * system to be SMT-disabled in toto.
685  */
686 void
smt_force_enabled(void)687 smt_force_enabled(void)
688 {
689 	VERIFY(!smt_boot_disable);
690 
691 	if (!smt_enabled)
692 		cmn_err(CE_NOTE, "!Disabled SMT sibling forced on-line.");
693 
694 	smt_enabled = 1;
695 	set_smt_prop();
696 }
697 
698 /*
699  * Initialize SMT links.  We have to be careful here not to race with
700  * smt_begin/end_intr(), which also complicates trying to do this initialization
701  * from a cross-call; hence the slightly odd approach below.
702  *
703  * If we're going to disable SMT via smt_late_init(), we will avoid paying the
704  * price here at all (we can't do it here since we're still too early in
705  * main()).
706  */
707 void
smt_init(void)708 smt_init(void)
709 {
710 	boolean_t found_sibling = B_FALSE;
711 	cpu_t *scp = CPU;
712 	cpu_t *cp = scp;
713 	ulong_t flags;
714 
715 	if (!smt_exclusion || smt_boot_disable)
716 		return;
717 
718 	mutex_enter(&cpu_lock);
719 
720 	do {
721 		thread_affinity_set(curthread, cp->cpu_id);
722 		flags = intr_clear();
723 
724 		cp->cpu_m.mcpu_smt.cs_intr_depth = 0;
725 		cp->cpu_m.mcpu_smt.cs_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
726 		cp->cpu_m.mcpu_smt.cs_sibstate = CS_MK(CM_THREAD,
727 		    GLOBAL_ZONEID);
728 		ASSERT3P(cp->cpu_m.mcpu_smt.cs_sib, ==, NULL);
729 		cp->cpu_m.mcpu_smt.cs_sib = smt_find_sibling(cp);
730 
731 		if (cp->cpu_m.mcpu_smt.cs_sib != NULL)
732 			found_sibling = B_TRUE;
733 
734 		intr_restore(flags);
735 		thread_affinity_clear(curthread);
736 	} while ((cp = cp->cpu_next_onln) != scp);
737 
738 	mutex_exit(&cpu_lock);
739 
740 	if (!found_sibling)
741 		smt_enabled = 0;
742 }
743 
744 void
smt_late_init(void)745 smt_late_init(void)
746 {
747 	if (smt_boot_disable) {
748 		int err;
749 
750 		mutex_enter(&cpu_lock);
751 
752 		err = smt_disable();
753 
754 		/*
755 		 * We're early enough in boot that nothing should have stopped
756 		 * us from offlining the siblings. As we didn't prepare our
757 		 * L1TF mitigation in this case, we need to panic.
758 		 */
759 		if (err) {
760 			cmn_err(CE_PANIC, "smt_disable() failed with %d", err);
761 		}
762 
763 		mutex_exit(&cpu_lock);
764 	}
765 
766 	if (smt_enabled)
767 		cmn_err(CE_NOTE, "!SMT enabled\n");
768 
769 	set_smt_prop();
770 }
771