/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the two interfaces use overlapping terminology in different
 * ways; the states work out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state. To
 * play nice, we don't allow it. Thus, any attempt to power a CPU on or off
 * from within Solaris fails with ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code. The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1]. When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we know
 * we can safely suspend the domain or power off a CPU.
 *
 * [1] note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

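/*
 * Per-CPU phases used while bringing all CPUs to a safe state for
 * suspend or power-off; the block comment at the top of this file
 * describes how CPUs move between them.
 */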
#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * When waiting for CPUs to enter the barrier, we can poke them at most
 * 256 times a second.
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

static taskq_t *cpu_config_tq;
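/*
 * cpu_phase[] records each CPU's current phase in the pause/power-off
 * protocol described above.
 */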
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int	ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with the hypercall if we are asking about ourselves.
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

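	/*
	 * Only RUNSTATE_running means the VCPU currently holds a physical
	 * CPU; runnable, blocked and offline all mean that it does not.
	 */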
	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

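/*
 * Xenstore state-change callback: once the store connection is up,
 * register a watch on the "cpu" subtree so that vcpu_config_event()
 * runs when the virtual CPU configuration changes.
 */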
void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit.)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

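	/*
	 * Control registers: CR0 with the FPU flags enabled, CR3 pointing
	 * at the kernel's top-level page table, and the current CR4.
	 */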
	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup().
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that, subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialization succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called, either we have successfully started the
 * cpu or our attempt to start it has failed.
 */