1843e1988Sjohnlev /*
2843e1988Sjohnlev * CDDL HEADER START
3843e1988Sjohnlev *
4843e1988Sjohnlev * The contents of this file are subject to the terms of the
5843e1988Sjohnlev * Common Development and Distribution License (the "License").
6843e1988Sjohnlev * You may not use this file except in compliance with the License.
7843e1988Sjohnlev *
8843e1988Sjohnlev * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9843e1988Sjohnlev * or http://www.opensolaris.org/os/licensing.
10843e1988Sjohnlev * See the License for the specific language governing permissions
11843e1988Sjohnlev * and limitations under the License.
12843e1988Sjohnlev *
13843e1988Sjohnlev * When distributing Covered Code, include this CDDL HEADER in each
14843e1988Sjohnlev * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15843e1988Sjohnlev * If applicable, add the following below this CDDL HEADER, with the
16843e1988Sjohnlev * fields enclosed by brackets "[]" replaced with your own identifying
17843e1988Sjohnlev * information: Portions Copyright [yyyy] [name of copyright owner]
18843e1988Sjohnlev *
19843e1988Sjohnlev * CDDL HEADER END
20843e1988Sjohnlev */
21843e1988Sjohnlev
22843e1988Sjohnlev /*
23f34a7178SJoe Bonasera * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24843e1988Sjohnlev * Use is subject to license terms.
25843e1988Sjohnlev */
26843e1988Sjohnlev
27c3377ee9SJohn Levon /*
28c3377ee9SJohn Levon * Copyright 2019 Joyent, Inc.
29c3377ee9SJohn Levon */
30c3377ee9SJohn Levon
311d03c31eSjohnlev /*
321d03c31eSjohnlev * Virtual CPU management.
331d03c31eSjohnlev *
341d03c31eSjohnlev * VCPUs can be controlled in one of two ways; through the domain itself
351d03c31eSjohnlev * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
361d03c31eSjohnlev * Unfortunately, the terminology is used in different ways; they work out as
371d03c31eSjohnlev * follows:
381d03c31eSjohnlev *
391d03c31eSjohnlev * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
401d03c31eSjohnlev *
411d03c31eSjohnlev * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
421d03c31eSjohnlev * hypervisor on the idle thread). It must be up since a downed VCPU cannot
431d03c31eSjohnlev * receive interrupts, and we require this for offline CPUs in Solaris.
441d03c31eSjohnlev *
451d03c31eSjohnlev * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
461d03c31eSjohnlev * xen_vcpu_down() for it). It can't take interrupts or run anything, though
471d03c31eSjohnlev * if it has run previously, its software state (cpu_t, machcpu structures, IPI
481d03c31eSjohnlev * event channels, etc.) will still exist.
491d03c31eSjohnlev *
501d03c31eSjohnlev * The hypervisor has two notions of CPU states as represented in the store:
511d03c31eSjohnlev *
521d03c31eSjohnlev * "offline": the VCPU is down. Corresponds to P_POWEROFF.
531d03c31eSjohnlev *
541d03c31eSjohnlev * "online": the VCPU is running. Corresponds to a CPU state other than
551d03c31eSjohnlev * P_POWEROFF.
561d03c31eSjohnlev *
571d03c31eSjohnlev * Currently, only a notification via xenstore can bring a CPU into a
581d03c31eSjohnlev * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
591d03c31eSjohnlev * P_OFFLINE, etc. We need to be careful to treat xenstore notifications
601d03c31eSjohnlev * idempotently, as we'll get 'duplicate' entries when we resume a domain.
611d03c31eSjohnlev *
621d03c31eSjohnlev * Note that the xenstore configuration is strictly advisory, in that a domain
631d03c31eSjohnlev * can choose to ignore it and still power up a VCPU in the offline state. To
641d03c31eSjohnlev * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
651d03c31eSjohnlev * ENOTSUP from within Solaris.
661d03c31eSjohnlev *
671d03c31eSjohnlev * Powering off a VCPU and suspending the domain use similar code. The
681d03c31eSjohnlev * difficulty here is that we must ensure that each VCPU is in a stable
691d03c31eSjohnlev * state: it must have a saved PCB, and not be responding to interrupts
701d03c31eSjohnlev * (since we are just about to remove its ability to run on a real CPU,
711d03c31eSjohnlev * possibly forever). However, an offline CPU in Solaris can take
721d03c31eSjohnlev * cross-call interrupts, as mentioned, so we must go through a
731d03c31eSjohnlev * two-stage process. First, we use the standard Solaris pause_cpus().
741d03c31eSjohnlev * This ensures that all CPUs are either in mach_cpu_pause() or
751d03c31eSjohnlev * mach_cpu_idle(), and nothing will cross-call them.
761d03c31eSjohnlev *
771d03c31eSjohnlev * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
781d03c31eSjohnlev * bring them back up, and in state CPU_PHASE_POWERED_OFF.
791d03c31eSjohnlev *
801d03c31eSjohnlev * Running CPUs are spinning in mach_cpu_pause() waiting for either
811d03c31eSjohnlev * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
821d03c31eSjohnlev *
831d03c31eSjohnlev * Offline CPUs are either running the idle thread and periodically
841d03c31eSjohnlev * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
851d03c31eSjohnlev *
861d03c31eSjohnlev * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
871d03c31eSjohnlev * poking them to make sure they're not blocked[1]. When every CPU has
881d03c31eSjohnlev * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
891d03c31eSjohnlev * know we can suspend, or power-off a CPU, without problems.
901d03c31eSjohnlev *
911d03c31eSjohnlev * [1] note that we have to repeatedly poke offline CPUs: it's the only
921d03c31eSjohnlev * way to ensure that the CPU doesn't miss the state change before
931d03c31eSjohnlev * dropping into HYPERVISOR_block().
941d03c31eSjohnlev */
951d03c31eSjohnlev
96843e1988Sjohnlev #include <sys/types.h>
97843e1988Sjohnlev #include <sys/systm.h>
98843e1988Sjohnlev #include <sys/param.h>
99843e1988Sjohnlev #include <sys/taskq.h>
100843e1988Sjohnlev #include <sys/cmn_err.h>
101843e1988Sjohnlev #include <sys/archsystm.h>
102843e1988Sjohnlev #include <sys/machsystm.h>
103843e1988Sjohnlev #include <sys/segments.h>
104843e1988Sjohnlev #include <sys/cpuvar.h>
105843e1988Sjohnlev #include <sys/x86_archext.h>
106843e1988Sjohnlev #include <sys/controlregs.h>
107843e1988Sjohnlev #include <sys/hypervisor.h>
108843e1988Sjohnlev #include <sys/xpv_panic.h>
1091d03c31eSjohnlev #include <sys/mman.h>
1101d03c31eSjohnlev #include <sys/psw.h>
111843e1988Sjohnlev #include <sys/cpu.h>
1121d03c31eSjohnlev #include <sys/sunddi.h>
1131d03c31eSjohnlev #include <util/sscanf.h>
1141d03c31eSjohnlev #include <vm/hat_i86.h>
1151d03c31eSjohnlev #include <vm/hat.h>
1161d03c31eSjohnlev #include <vm/as.h>
117843e1988Sjohnlev
118843e1988Sjohnlev #include <xen/public/io/xs_wire.h>
1191d03c31eSjohnlev #include <xen/sys/xenbus_impl.h>
1201d03c31eSjohnlev #include <xen/public/vcpu.h>
121843e1988Sjohnlev
122f34a7178SJoe Bonasera extern cpuset_t cpu_ready_set;
123f34a7178SJoe Bonasera
1241d03c31eSjohnlev #define CPU_PHASE_NONE 0
1251d03c31eSjohnlev #define CPU_PHASE_WAIT_SAFE 1
1261d03c31eSjohnlev #define CPU_PHASE_SAFE 2
1271d03c31eSjohnlev #define CPU_PHASE_POWERED_OFF 3
1281d03c31eSjohnlev
/*
 * While entering the barrier, we may poke each CPU at most 256 times
 * per second.
 */
1331d03c31eSjohnlev #define POKE_TIMEOUT (NANOSEC / 256)
134843e1988Sjohnlev
135843e1988Sjohnlev static taskq_t *cpu_config_tq;
1361d03c31eSjohnlev static int cpu_phase[NCPU];
1371d03c31eSjohnlev
138843e1988Sjohnlev static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
139843e1988Sjohnlev static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
140843e1988Sjohnlev
141b9bc7f78Ssmaybe /*
142b9bc7f78Ssmaybe * Return whether or not the vcpu is actually running on a pcpu
143b9bc7f78Ssmaybe */
144b9bc7f78Ssmaybe int
vcpu_on_pcpu(processorid_t cpu)145b9bc7f78Ssmaybe vcpu_on_pcpu(processorid_t cpu)
146b9bc7f78Ssmaybe {
147b9bc7f78Ssmaybe struct vcpu_runstate_info runstate;
148b9bc7f78Ssmaybe int ret = VCPU_STATE_UNKNOWN;
149b9bc7f78Ssmaybe
150b9bc7f78Ssmaybe ASSERT(cpu < NCPU);
151b9bc7f78Ssmaybe /*
152b9bc7f78Ssmaybe * Don't bother with hypercall if we are asking about ourself
153b9bc7f78Ssmaybe */
154b9bc7f78Ssmaybe if (cpu == CPU->cpu_id)
155b9bc7f78Ssmaybe return (VCPU_ON_PCPU);
156b9bc7f78Ssmaybe if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
157b9bc7f78Ssmaybe goto out;
158b9bc7f78Ssmaybe
159b9bc7f78Ssmaybe switch (runstate.state) {
160b9bc7f78Ssmaybe case RUNSTATE_running:
161b9bc7f78Ssmaybe ret = VCPU_ON_PCPU;
162b9bc7f78Ssmaybe break;
163b9bc7f78Ssmaybe
164b9bc7f78Ssmaybe case RUNSTATE_runnable:
165b9bc7f78Ssmaybe case RUNSTATE_offline:
166b9bc7f78Ssmaybe case RUNSTATE_blocked:
167b9bc7f78Ssmaybe ret = VCPU_NOT_ON_PCPU;
168b9bc7f78Ssmaybe break;
169b9bc7f78Ssmaybe
170b9bc7f78Ssmaybe default:
171b9bc7f78Ssmaybe break;
172b9bc7f78Ssmaybe }
173b9bc7f78Ssmaybe
174b9bc7f78Ssmaybe out:
175b9bc7f78Ssmaybe return (ret);
176b9bc7f78Ssmaybe }
177b9bc7f78Ssmaybe
/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, no such state exists, so
 * this is a no-op that always succeeds.
 */
int
mach_cpucontext_init(void)
{
	/* Nothing to set up for virtual CPUs. */
	return (0);
}
187843e1988Sjohnlev
188843e1988Sjohnlev void
do_cpu_config_watch(int state)189843e1988Sjohnlev do_cpu_config_watch(int state)
190843e1988Sjohnlev {
191843e1988Sjohnlev static struct xenbus_watch cpu_config_watch;
192843e1988Sjohnlev
193843e1988Sjohnlev if (state != XENSTORE_UP)
194843e1988Sjohnlev return;
195843e1988Sjohnlev cpu_config_watch.node = "cpu";
196843e1988Sjohnlev cpu_config_watch.callback = vcpu_config_event;
197843e1988Sjohnlev if (register_xenbus_watch(&cpu_config_watch)) {
198843e1988Sjohnlev taskq_destroy(cpu_config_tq);
199843e1988Sjohnlev cmn_err(CE_WARN, "do_cpu_config_watch: "
200843e1988Sjohnlev "failed to set vcpu config watch");
201843e1988Sjohnlev }
202843e1988Sjohnlev
203843e1988Sjohnlev }
204843e1988Sjohnlev
/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{
	/* Single-threaded taskq on which vcpu_config() requests will run. */
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	/* Registers the watch once xenstore is up; see do_cpu_config_watch. */
	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}
219843e1988Sjohnlev
/*
 * Fill in the remaining CPU context and initialize it.
 *
 * The caller supplies vgc with the initial register/stack state; here
 * we add segment selectors, the trap table (from the IDT), the GDT
 * frame, control registers and hypervisor callbacks, then hand the
 * completed context to the hypervisor via xen_vcpu_initialize().
 * Returns 0 on success or the error from xen_vcpu_initialize().
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	/* cr3: machine address of the kernel's top-level page table. */
	vgc->ctrlreg[3] =
	    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	/* %gs base points at the cpu_t, per the kernel's convention. */
	vgc->gs_base_kernel = (uintptr_t)cp;

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}
306843e1988Sjohnlev
/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 *
 * Returns cp on success, or NULL on failure after mach_cpucontext_free()
 * has released whatever is safe to release.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	/*
	 * The GDT must be mapped read-only before handing it to the
	 * hypervisor (see mach_cpucontext_free() for the implications).
	 */
	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup). Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
	/*
	 * XXPV Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread? In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT. This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		/* err distinguishes how much cleanup is safe; see below. */
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}
381843e1988Sjohnlev
/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 *
 * cp:  the CPU whose context was set up by mach_cpucontext_alloc()
 * arg: unused
 * err: 0 on success, ETIMEDOUT, or another error from the start path
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		/* Success: the context and cpu_t are kept forever. */
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason. We arrange to -not-
		 * free any data structures it may be referencing. In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		/* Hard failure: undo mach_cpucontext_alloc()'s work. */
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}
411843e1988Sjohnlev
/*
 * Reset this CPU's context. Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? -- NOTE(review): possibly stale too; confirm */
}
422843e1988Sjohnlev
/*
 * Copy the kernel resume state saved in a thread's PCB (by the setjmp()
 * in enter_safe_phase()) into the user_regs area of a guest context, so
 * the hypervisor can restart the VCPU where it left off; see
 * mach_cpucontext_restore().
 */
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
}
435843e1988Sjohnlev
/*
 * Restore the context of a CPU during resume. This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	/* Only a paused or idle thread can be parked in enter_safe_phase(). */
	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	/* Rebuild register state from the PCB that setjmp() saved. */
	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}
467843e1988Sjohnlev
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended. Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	/*
	 * setjmp() saves our PCB so that, if this CPU is powered off and
	 * later resumed, mach_cpucontext_restore() can restart us here
	 * (appearing as a non-zero return from setjmp()).  Otherwise we
	 * spin until mp_leave_barrier() moves us out of CPU_PHASE_SAFE.
	 */
	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	/* Interrupts must have stayed off for the whole safe phase. */
	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
4871d03c31eSjohnlev
4881d03c31eSjohnlev /*
4891d03c31eSjohnlev * Offline CPUs run this code even under a pause_cpus(), so we must
4901d03c31eSjohnlev * check if we need to enter the safe phase.
4911d03c31eSjohnlev */
492843e1988Sjohnlev void
mach_cpu_idle(void)493843e1988Sjohnlev mach_cpu_idle(void)
494843e1988Sjohnlev {
495843e1988Sjohnlev if (IN_XPV_PANIC()) {
496843e1988Sjohnlev xpv_panic_halt();
497843e1988Sjohnlev } else {
498843e1988Sjohnlev (void) HYPERVISOR_block();
4991d03c31eSjohnlev if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
5001d03c31eSjohnlev enter_safe_phase();
501843e1988Sjohnlev }
502843e1988Sjohnlev }
503843e1988Sjohnlev
/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * Acknowledge the pause request; membar_enter() ensures the
	 * store is visible before we begin polling.
	 */
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		/* mp_enter_barrier() may ask us to park in the safe phase. */
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}
520843e1988Sjohnlev
521027bcc9fSToomas Soome int
mach_cpu_halt(xc_arg_t arg1,xc_arg_t arg2 __unused,xc_arg_t arg3 __unused)522027bcc9fSToomas Soome mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
5231d03c31eSjohnlev {
524027bcc9fSToomas Soome char *msg = (char *)arg1;
525027bcc9fSToomas Soome
5261d03c31eSjohnlev if (msg)
5271d03c31eSjohnlev prom_printf("%s\n", msg);
5281d03c31eSjohnlev (void) xen_vcpu_down(CPU->cpu_id);
529027bcc9fSToomas Soome return (0);
5301d03c31eSjohnlev }
531843e1988Sjohnlev
/*
 * Powering a VCPU on from within the domain is not supported; such
 * transitions may only be initiated via xenstore (see vcpu_config()).
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}
538843e1988Sjohnlev
/*
 * Powering a VCPU off from within the domain is not supported; such
 * transitions may only be initiated via xenstore (see vcpu_config()).
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}
545843e1988Sjohnlev
/*
 * Bring every other powered-on CPU into a safe, non-running state
 * (CPU_PHASE_SAFE or CPU_PHASE_POWERED_OFF); see the block comment at
 * the top of this file.  Must be called with cpu_lock held; undone by
 * mp_leave_barrier().
 */
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		/* Rate-limit pokes to at most one per POKE_TIMEOUT. */
		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				/* First pass: ask this CPU to go safe. */
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				/*
				 * Keep poking: an offline CPU can miss the
				 * state change before blocking in the
				 * hypervisor (see note [1] above).
				 */
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				/* Already safe; nothing more to do. */
				break;
			}
		}

		SMT_PAUSE();
	}
}
595843e1988Sjohnlev
/*
 * Release the CPUs parked by mp_enter_barrier() and restart them.
 * Must be called with cpu_lock held.
 */
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			/* Stays powered off; poweron_vcpu() resets phase. */
			break;

		case CPU_PHASE_SAFE:
			/* Breaks the CPU out of its enter_safe_phase() spin. */
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}
633843e1988Sjohnlev
/*
 * Take a quiesced VCPU down in the hypervisor and mark its cpu_t as
 * powered off.  Called with cpu_lock held, never for the current CPU.
 * Returns 0 on success or the error from xen_vcpu_down().
 */
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	/* Park all other CPUs so nothing can cross-call cp as it dies. */
	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		/* The barrier guarantees cp reached enter_safe_phase(). */
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		if (cp->cpu_flags & CPU_ENABLE)
			ncpus_intr_enabled--;

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}
667843e1988Sjohnlev
/*
 * Power off VCPU 'id' (in response to a xenstore "offline" request).
 * The CPU is first pushed through P_OFFLINE so it quiesces, then
 * poweroff_vcpu() takes it down.  Because cpu_lock must be dropped
 * between the two steps, another thread can re-online the CPU in the
 * window; in that case (EBUSY) we retry.  Returns 0 on success, ESRCH
 * if the CPU disappears, or an error from p_online_internal() /
 * poweroff_vcpu().
 */
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		/* Already powered off: treat the request as idempotent. */
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE. But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}
716843e1988Sjohnlev
717843e1988Sjohnlev /*
718843e1988Sjohnlev * Add a new virtual cpu to the domain.
719843e1988Sjohnlev */
720843e1988Sjohnlev static int
vcpu_config_new(processorid_t id)721843e1988Sjohnlev vcpu_config_new(processorid_t id)
722843e1988Sjohnlev {
723843e1988Sjohnlev extern int start_cpu(processorid_t);
724843e1988Sjohnlev int error;
725843e1988Sjohnlev
726843e1988Sjohnlev if (ncpus == 1) {
727843e1988Sjohnlev printf("cannot (yet) add cpus to a single-cpu domain\n");
728843e1988Sjohnlev return (ENOTSUP);
729843e1988Sjohnlev }
730843e1988Sjohnlev
731843e1988Sjohnlev affinity_set(CPU_CURRENT);
732843e1988Sjohnlev error = start_cpu(id);
733843e1988Sjohnlev affinity_clear();
734843e1988Sjohnlev return (error);
735843e1988Sjohnlev }
736843e1988Sjohnlev
7371d03c31eSjohnlev static int
poweron_vcpu(struct cpu * cp)7381d03c31eSjohnlev poweron_vcpu(struct cpu *cp)
7391d03c31eSjohnlev {
7401d03c31eSjohnlev int error;
7411d03c31eSjohnlev
7421d03c31eSjohnlev ASSERT(MUTEX_HELD(&cpu_lock));
7431d03c31eSjohnlev
7441d03c31eSjohnlev if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
7451d03c31eSjohnlev printf("poweron_vcpu: vcpu%d is not available!\n",
7461d03c31eSjohnlev cp->cpu_id);
7471d03c31eSjohnlev return (ENXIO);
7481d03c31eSjohnlev }
7491d03c31eSjohnlev
7501d03c31eSjohnlev if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
7511d03c31eSjohnlev CPUSET_ADD(cpu_ready_set, cp->cpu_id);
7521d03c31eSjohnlev cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
7531d03c31eSjohnlev cp->cpu_flags &= ~CPU_POWEROFF;
7541d03c31eSjohnlev /*
7551d03c31eSjohnlev * There are some nasty races possible here.
7561d03c31eSjohnlev * Tell the vcpu it's up one more time.
7571d03c31eSjohnlev * XXPV Is this enough? Is this safe?
7581d03c31eSjohnlev */
7591d03c31eSjohnlev (void) xen_vcpu_up(cp->cpu_id);
7601d03c31eSjohnlev
7611d03c31eSjohnlev cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
7621d03c31eSjohnlev
7631d03c31eSjohnlev cpu_set_state(cp);
7641d03c31eSjohnlev }
7651d03c31eSjohnlev return (error);
7661d03c31eSjohnlev }
7671d03c31eSjohnlev
768843e1988Sjohnlev static int
vcpu_config_poweron(processorid_t id)769843e1988Sjohnlev vcpu_config_poweron(processorid_t id)
770843e1988Sjohnlev {
771843e1988Sjohnlev cpu_t *cp;
772843e1988Sjohnlev int oldstate;
773843e1988Sjohnlev int error;
774843e1988Sjohnlev
775843e1988Sjohnlev if (id >= ncpus)
776843e1988Sjohnlev return (vcpu_config_new(id));
777843e1988Sjohnlev
778843e1988Sjohnlev mutex_enter(&cpu_lock);
779843e1988Sjohnlev
780843e1988Sjohnlev if ((cp = cpu_get(id)) == NULL) {
781843e1988Sjohnlev mutex_exit(&cpu_lock);
782843e1988Sjohnlev return (ESRCH);
783843e1988Sjohnlev }
784843e1988Sjohnlev
785843e1988Sjohnlev if (cpu_get_state(cp) != P_POWEROFF) {
786843e1988Sjohnlev mutex_exit(&cpu_lock);
787843e1988Sjohnlev return (0);
788843e1988Sjohnlev }
789843e1988Sjohnlev
790843e1988Sjohnlev if ((error = poweron_vcpu(cp)) != 0) {
791843e1988Sjohnlev mutex_exit(&cpu_lock);
792843e1988Sjohnlev return (error);
793843e1988Sjohnlev }
794843e1988Sjohnlev
795843e1988Sjohnlev mutex_exit(&cpu_lock);
796843e1988Sjohnlev
797843e1988Sjohnlev return (p_online_internal(id, P_ONLINE, &oldstate));
798843e1988Sjohnlev }
799843e1988Sjohnlev
800843e1988Sjohnlev #define REPORT_LEN 128
801843e1988Sjohnlev
802843e1988Sjohnlev static void
vcpu_config_report(processorid_t id,uint_t newstate,int error)803843e1988Sjohnlev vcpu_config_report(processorid_t id, uint_t newstate, int error)
804843e1988Sjohnlev {
805843e1988Sjohnlev char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
806843e1988Sjohnlev size_t len;
807843e1988Sjohnlev char *ps;
808843e1988Sjohnlev
8092a9992ecSToomas Soome ps = NULL;
810843e1988Sjohnlev switch (newstate) {
811843e1988Sjohnlev case P_ONLINE:
812843e1988Sjohnlev ps = PS_ONLINE;
813843e1988Sjohnlev break;
814843e1988Sjohnlev case P_POWEROFF:
815843e1988Sjohnlev ps = PS_POWEROFF;
816843e1988Sjohnlev break;
817843e1988Sjohnlev default:
818843e1988Sjohnlev cmn_err(CE_PANIC, "unknown state %u\n", newstate);
819843e1988Sjohnlev break;
820843e1988Sjohnlev }
821843e1988Sjohnlev
822843e1988Sjohnlev len = snprintf(report, REPORT_LEN,
823843e1988Sjohnlev "cpu%d: externally initiated %s", id, ps);
824843e1988Sjohnlev
825843e1988Sjohnlev if (!error) {
826843e1988Sjohnlev cmn_err(CE_CONT, "!%s\n", report);
827843e1988Sjohnlev kmem_free(report, REPORT_LEN);
828843e1988Sjohnlev return;
829843e1988Sjohnlev }
830843e1988Sjohnlev
831843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
832843e1988Sjohnlev " failed, error %d: ", error);
833843e1988Sjohnlev switch (error) {
834843e1988Sjohnlev case EEXIST:
835843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
836843e1988Sjohnlev "cpu already %s", ps ? ps : "?");
837843e1988Sjohnlev break;
838843e1988Sjohnlev case ESRCH:
839843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
840843e1988Sjohnlev "cpu not found");
841843e1988Sjohnlev break;
842843e1988Sjohnlev case EINVAL:
843843e1988Sjohnlev case EALREADY:
844843e1988Sjohnlev break;
845843e1988Sjohnlev case EPERM:
846843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
847843e1988Sjohnlev "insufficient privilege (0x%x)", id);
848843e1988Sjohnlev break;
849843e1988Sjohnlev case EBUSY:
850843e1988Sjohnlev switch (newstate) {
851843e1988Sjohnlev case P_ONLINE:
852843e1988Sjohnlev /*
853843e1988Sjohnlev * This return comes from mp_cpu_start -
854843e1988Sjohnlev * we cannot 'start' the boot CPU.
855843e1988Sjohnlev */
856843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
857843e1988Sjohnlev "already running");
858843e1988Sjohnlev break;
859843e1988Sjohnlev case P_POWEROFF:
860843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len,
861843e1988Sjohnlev "bound lwps?");
862843e1988Sjohnlev break;
863843e1988Sjohnlev default:
864843e1988Sjohnlev break;
865843e1988Sjohnlev }
866843e1988Sjohnlev default:
867843e1988Sjohnlev break;
868843e1988Sjohnlev }
869843e1988Sjohnlev
870843e1988Sjohnlev cmn_err(CE_CONT, "%s\n", report);
871843e1988Sjohnlev kmem_free(report, REPORT_LEN);
872843e1988Sjohnlev }
873843e1988Sjohnlev
874843e1988Sjohnlev static void
vcpu_config(void * arg)875843e1988Sjohnlev vcpu_config(void *arg)
876843e1988Sjohnlev {
877843e1988Sjohnlev int id = (int)(uintptr_t)arg;
878843e1988Sjohnlev int error;
879843e1988Sjohnlev char dir[16];
880843e1988Sjohnlev char *state;
881843e1988Sjohnlev
882843e1988Sjohnlev if ((uint_t)id >= max_ncpus) {
883843e1988Sjohnlev cmn_err(CE_WARN,
884843e1988Sjohnlev "vcpu_config: cpu%d does not fit in this domain", id);
885843e1988Sjohnlev return;
886843e1988Sjohnlev }
887843e1988Sjohnlev
888843e1988Sjohnlev (void) snprintf(dir, sizeof (dir), "cpu/%d", id);
889843e1988Sjohnlev state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
890843e1988Sjohnlev if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
891843e1988Sjohnlev if (strcmp(state, "online") == 0) {
892843e1988Sjohnlev error = vcpu_config_poweron(id);
893843e1988Sjohnlev vcpu_config_report(id, P_ONLINE, error);
894843e1988Sjohnlev } else if (strcmp(state, "offline") == 0) {
895843e1988Sjohnlev error = vcpu_config_poweroff(id);
896843e1988Sjohnlev vcpu_config_report(id, P_POWEROFF, error);
897843e1988Sjohnlev } else {
898843e1988Sjohnlev cmn_err(CE_WARN,
899843e1988Sjohnlev "cpu%d: unknown target state '%s'", id, state);
900843e1988Sjohnlev }
901843e1988Sjohnlev } else
902843e1988Sjohnlev cmn_err(CE_WARN,
903843e1988Sjohnlev "cpu%d: unable to read target state from xenstore", id);
904843e1988Sjohnlev
905843e1988Sjohnlev kmem_free(state, MAXPATHLEN);
906843e1988Sjohnlev }
907843e1988Sjohnlev
908843e1988Sjohnlev /*ARGSUSED*/
909843e1988Sjohnlev static void
vcpu_config_event(struct xenbus_watch * watch,const char ** vec,uint_t len)910843e1988Sjohnlev vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
911843e1988Sjohnlev {
912843e1988Sjohnlev const char *path = vec[XS_WATCH_PATH];
913843e1988Sjohnlev processorid_t id;
914843e1988Sjohnlev char *s;
915843e1988Sjohnlev
916843e1988Sjohnlev if ((s = strstr(path, "cpu/")) != NULL &&
917843e1988Sjohnlev sscanf(s, "cpu/%d", &id) == 1) {
918843e1988Sjohnlev /*
919843e1988Sjohnlev * Run the virtual CPU configuration on a separate thread to
920843e1988Sjohnlev * avoid blocking on this event for too long (and for now,
921843e1988Sjohnlev * to ensure configuration requests are serialized.)
922843e1988Sjohnlev */
923843e1988Sjohnlev (void) taskq_dispatch(cpu_config_tq,
924843e1988Sjohnlev vcpu_config, (void *)(uintptr_t)id, 0);
925843e1988Sjohnlev }
926843e1988Sjohnlev }
927843e1988Sjohnlev
928843e1988Sjohnlev static int
xen_vcpu_initialize(processorid_t id,vcpu_guest_context_t * vgc)929843e1988Sjohnlev xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
930843e1988Sjohnlev {
931843e1988Sjohnlev int err;
932843e1988Sjohnlev
933843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
934843e1988Sjohnlev char *str;
935843e1988Sjohnlev int level = CE_WARN;
936843e1988Sjohnlev
937843e1988Sjohnlev switch (err) {
938843e1988Sjohnlev case -X_EINVAL:
939843e1988Sjohnlev /*
940843e1988Sjohnlev * This interface squashes multiple error sources
941843e1988Sjohnlev * to one error code. In particular, an X_EINVAL
942843e1988Sjohnlev * code can mean:
943843e1988Sjohnlev *
944843e1988Sjohnlev * - the vcpu id is out of range
945843e1988Sjohnlev * - cs or ss are in ring 0
946843e1988Sjohnlev * - cr3 is wrong
947843e1988Sjohnlev * - an entry in the new gdt is above the
948843e1988Sjohnlev * reserved entry
949843e1988Sjohnlev * - a frame underneath the new gdt is bad
950843e1988Sjohnlev */
951843e1988Sjohnlev str = "something is wrong :(";
952843e1988Sjohnlev break;
953843e1988Sjohnlev case -X_ENOENT:
954843e1988Sjohnlev str = "no such cpu";
955843e1988Sjohnlev break;
956843e1988Sjohnlev case -X_ENOMEM:
957843e1988Sjohnlev str = "no mem to copy ctxt";
958843e1988Sjohnlev break;
959843e1988Sjohnlev case -X_EFAULT:
960843e1988Sjohnlev str = "bad address";
961843e1988Sjohnlev break;
962843e1988Sjohnlev case -X_EEXIST:
963843e1988Sjohnlev /*
964843e1988Sjohnlev * Hmm. This error is returned if the vcpu has already
965843e1988Sjohnlev * been initialized once before in the lifetime of this
966843e1988Sjohnlev * domain. This is a logic error in the kernel.
967843e1988Sjohnlev */
968843e1988Sjohnlev level = CE_PANIC;
969843e1988Sjohnlev str = "already initialized";
970843e1988Sjohnlev break;
971843e1988Sjohnlev default:
972843e1988Sjohnlev level = CE_PANIC;
973843e1988Sjohnlev str = "<unexpected>";
974843e1988Sjohnlev break;
975843e1988Sjohnlev }
976843e1988Sjohnlev
977843e1988Sjohnlev cmn_err(level, "vcpu%d: failed to init: error %d: %s",
978843e1988Sjohnlev id, -err, str);
979843e1988Sjohnlev }
980843e1988Sjohnlev return (err);
981843e1988Sjohnlev }
982843e1988Sjohnlev
983843e1988Sjohnlev long
xen_vcpu_up(processorid_t id)984843e1988Sjohnlev xen_vcpu_up(processorid_t id)
985843e1988Sjohnlev {
986843e1988Sjohnlev long err;
987843e1988Sjohnlev
988843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
989843e1988Sjohnlev char *str;
990843e1988Sjohnlev
991843e1988Sjohnlev switch (err) {
992843e1988Sjohnlev case -X_ENOENT:
993843e1988Sjohnlev str = "no such cpu";
994843e1988Sjohnlev break;
995843e1988Sjohnlev case -X_EINVAL:
996843e1988Sjohnlev /*
997843e1988Sjohnlev * Perhaps this is diagnostic overkill.
998843e1988Sjohnlev */
999843e1988Sjohnlev if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1000843e1988Sjohnlev str = "bad cpuid";
1001843e1988Sjohnlev else
1002843e1988Sjohnlev str = "not initialized";
1003843e1988Sjohnlev break;
1004843e1988Sjohnlev default:
1005843e1988Sjohnlev str = "<unexpected>";
1006843e1988Sjohnlev break;
1007843e1988Sjohnlev }
1008843e1988Sjohnlev
1009843e1988Sjohnlev printf("vcpu%d: failed to start: error %d: %s\n",
1010843e1988Sjohnlev id, -(int)err, str);
1011843e1988Sjohnlev return (EBFONT); /* deliberately silly */
1012843e1988Sjohnlev }
1013843e1988Sjohnlev return (err);
1014843e1988Sjohnlev }
1015843e1988Sjohnlev
1016843e1988Sjohnlev long
xen_vcpu_down(processorid_t id)1017843e1988Sjohnlev xen_vcpu_down(processorid_t id)
1018843e1988Sjohnlev {
1019843e1988Sjohnlev long err;
1020843e1988Sjohnlev
1021843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1022843e1988Sjohnlev /*
1023843e1988Sjohnlev * X_ENOENT: no such cpu
1024843e1988Sjohnlev * X_EINVAL: bad cpuid
1025843e1988Sjohnlev */
1026843e1988Sjohnlev panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1027843e1988Sjohnlev }
1028843e1988Sjohnlev
1029843e1988Sjohnlev return (err);
1030843e1988Sjohnlev }
1031