xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 843e19887f64dde75055cf8842fc4db2171eff45)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/psw.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/hat_i86.h>
#include <sys/mman.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/xpv_panic.h>
#include <util/sscanf.h>
#include <sys/cpu.h>
#include <asm/cpu.h>

#include <xen/public/vcpu.h>
#include <xen/public/io/xs_wire.h>

struct xen_evt_data cpu0_evt_data;		/* cpu0's pending event data */

static taskq_t *cpu_config_tq;
static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

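	/*
	 * Note: in both the amd64 and i386 cases below the initial flags
	 * value is F_OFF, i.e. the vcpu comes up with interrupts disabled;
	 * the code it resumes into re-enables them once it is ready.
	 */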
#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
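	/*
	 * The kernel %gs base is pointed at this cpu's cpu_t, which is how
	 * the amd64 kernel locates its per-cpu data (and hence curthread).
	 */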
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  The CPU must either
 * have been blocked in cpu_idle() (running the idle thread), if it was
 * offline, or inside cpu_pause_thread().  Either way we can restore safely
 * from the t_pcb.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
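		/*
		 * setjmp() records this thread's state in its t_pcb so that
		 * mach_cpucontext_restore() can rebuild the vcpu context from
		 * it across a suspend/resume; membership in cpu_suspend_set
		 * marks the vcpu as safely blocked in the hypervisor.
		 */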
		(void) setjmp(&curthread->t_pcb);
		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
		(void) HYPERVISOR_block();
		CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
	}
}

void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

void
mach_cpu_pause(volatile char *safe)
{
	ulong_t flags;

	flags = intr_clear();

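	/*
	 * As in mach_cpu_idle(), the setjmp() leaves a t_pcb from which
	 * mach_cpucontext_restore() can resume this vcpu.
	 */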
	if (setjmp(&curthread->t_pcb) == 0) {
		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
		/*
		 * This cpu is now safe.
		 */
		*safe = PAUSE_WAIT;
		membar_enter();
	}

	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();

	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);

	intr_restore(flags);
}

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways; through the domain itself
 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; they work out as
 * follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state. To
 * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
 * ENOTSUP from within Solaris.
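 *
 * The xenstore node consulted is cpu/<id>/availability, whose value is
 * either "online" or "offline"; see vcpu_config() below.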
 */

/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_set_state(cp);
	}
	return (error);
}

static int
poweroff_poke(void)
{
	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
	return (0);
}

/*
 * We must ensure that the VCPU reaches a safe state (in the suspend set, and
 * thus is not going to change) before we can power it off.  The VCPU could
 * still be in mach_cpu_pause() and about to head back out; so just checking
 * cpu_suspend_set() isn't sufficient to make sure the VCPU has stopped moving.
 * Instead, we xcall it to delete itself from the set; whichever way it comes
 * back from that xcall, it won't mark itself in the set until it's safely back
 * in mach_cpu_idle().
 */
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;
	cpuset_t set;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	CPUSET_ONLY(set, cp->cpu_id);

	xc_sync(0, 0, 0, X_CALL_HIPRI, set, (xc_func_t)poweroff_poke);

	while (!CPU_IN_SET(cpu_suspend_set, cp->cpu_id))
		SMT_PAUSE();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(CPU_IN_SET(cpu_suspend_set, cp->cpu_id));
		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_set_state(cp);
	}
	return (error);
}

static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}