xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 1d03c31e0733adea0edef54f0d5d2ea9639ecd2a)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the two interfaces use the terminology differently; the
 * states work out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up, since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into the
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state.  To
 * play nice, we don't allow that.  Thus, any attempt to power a CPU on or off
 * from within Solaris fails with ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code.  The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1].  When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

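/*
 * Per-CPU phase, driven by mp_enter_barrier()/mp_leave_barrier():
 *
 * CPU_PHASE_NONE: normal operation, nothing has been requested.
 *
 * CPU_PHASE_WAIT_SAFE: the CPU has been asked to enter the safe phase
 * (see enter_safe_phase()).
 *
 * CPU_PHASE_SAFE: the CPU is spinning with interrupts disabled and a
 * saved PCB, and may be powered off or suspended.
 *
 * CPU_PHASE_POWERED_OFF: the VCPU is down in the hypervisor.
 */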
#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * During barrier entry, we can poke CPUs at most 256 times a second.
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

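/*
 * Once the xenstore connection is up, watch the "cpu" subtree so that
 * vcpu_config_event() is called whenever a VCPU's configuration changes.
 */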
void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

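/*
 * Translate the register state saved by setjmp() in a thread's PCB
 * (a label_t) into the user_regs of a vcpu_guest_context_t.
 */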
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

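/*
 * Halt this CPU: print any message, then take the VCPU down in the
 * hypervisor.
 */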
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

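/*
 * As described in the block comment at the top of this file, CPU power
 * transitions are accepted only via xenstore; requests made from within
 * the domain are refused.
 */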
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

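/*
 * Pause all other CPUs and drive each powered-on CPU into CPU_PHASE_SAFE,
 * poking any CPU that might be blocked in the hypervisor (at most once
 * every POKE_TIMEOUT nanoseconds).  On return, every other CPU is either
 * safe or powered off, so the caller may suspend the domain or power off
 * a VCPU.
 */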
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

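/*
 * Release the CPUs parked in enter_safe_phase() and let the paused CPUs
 * continue via start_cpus().
 */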
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

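/*
 * Power off a quiesced CPU: take all CPUs through the barrier, ask the
 * hypervisor to bring the VCPU down, and update the Solaris CPU state to
 * reflect P_POWEROFF.  Called with cpu_lock held.
 */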
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

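/*
 * Handle an "offline" request from xenstore: take the CPU to P_OFFLINE
 * via p_online_internal(), then power it off once it is quiesced.  If
 * another thread races the CPU back to a non-quiesced state after we
 * drop cpu_lock, retry.
 */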
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

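/*
 * Bring a previously powered-off VCPU back up and mark it ready; the
 * caller onlines it via p_online_internal().
 */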
static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

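/*
 * Report the outcome of an externally initiated state change on the
 * console, decoding the more common error values.
 */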
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

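/*
 * Taskq callback: read the target availability of the given CPU from
 * xenstore (the "cpu/<id>/availability" property, either "online" or
 * "offline") and apply it.
 */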
static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

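/*
 * Load the initial context for a VCPU into the hypervisor, decoding the
 * (heavily overloaded) error values into something readable.
 */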
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

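/*
 * Ask the hypervisor to start the given VCPU; on failure, explain why on
 * the console and return a deliberately distinctive error code.
 */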
long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

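/*
 * Ask the hypervisor to stop the given VCPU; failure here is fatal to the
 * domain.
 */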
long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}