xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 86ef0a63)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; they work out as
 * follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state.  To
 * play nice, we don't allow that; thus, any attempt to power a CPU on or off
 * from within Solaris fails with ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code.  The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1].  When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */
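
/*
 * The controller's side of that handshake, in miniature.  This is an
 * illustrative sketch only (hence #if 0), not part of this file's logic:
 * the real loop in mp_enter_barrier() below additionally throttles
 * repeat pokes via POKE_TIMEOUT and special-cases powered-off CPUs.
 */
#if 0
	int i;

	for (i = 0; i < NCPU; i++) {
		if (cpu_get(i) == NULL || i == CPU->cpu_id)
			continue;
		/* ask the CPU to quiesce itself... */
		cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
		/* ...and kick it out of HYPERVISOR_block() if it's blocked */
		poke_cpu(i);
	}
	/* spin until every such CPU has advanced to CPU_PHASE_SAFE */
#endif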

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * While entering the barrier, we re-poke waiting CPUs at most 256 times
 * a second; POKE_TIMEOUT is the minimum interval, in nanoseconds,
 * between poke rounds (about 3.9 ms).
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int	ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with the hypercall if we are asking about ourselves
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}
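
/*
 * A usage sketch, for illustration only (hence #if 0): a hypothetical
 * caller that only busy-waits while the target VCPU is genuinely
 * executing on a physical CPU.  "target" and condition_met() are
 * made-up names, not anything defined in this file.
 */
#if 0
	while (!condition_met()) {
		if (vcpu_on_pcpu(target) == VCPU_ON_PCPU)
			SMT_PAUSE();	/* remote VCPU is really running */
		else
			break;	/* don't spin against a descheduled VCPU */
	}
#endif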

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	vgc->ctrlreg[3] =
	    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup().
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that, subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once.  Once the initialization succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
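
/*
 * How the pieces above fit together: a CPU parks itself via the
 * setjmp() in enter_safe_phase() and is revived in one of two ways.
 * Either mp_leave_barrier() resets its phase, so the spin loop simply
 * falls out and the CPU returns to whatever it was doing; or, across a
 * suspend/resume cycle, mach_cpucontext_restore() rebuilds the VCPU
 * context from the saved t_pcb with rax set to 1 and rsp bumped by one
 * word, so the CPU comes back exactly as if longjmp() had returned 1
 * from that setjmp() call.
 */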

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

int
mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
{
	char *msg = (char *)arg1;

	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
	return (0);
}

/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		if (cp->cpu_flags & CPU_ENABLE)
			ncpus_intr_enabled--;

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	ps = NULL;
	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
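		/* FALLTHROUGH */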
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}
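
/*
 * For illustration: the xenstore layout consumed above is one node per
 * virtual CPU, "cpu/<n>/availability", holding "online" or "offline",
 * relative to the domain's home path.  From the control domain, a
 * toolstack (or an administrator, by hand) might trigger these
 * transitions with something like the following, where <domid> is the
 * target domain's id (a hedged sketch; exact paths depend on the
 * toolstack in use):
 *
 *	# xenstore-write /local/domain/<domid>/cpu/1/availability offline
 */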

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}