/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the two interfaces use the terminology differently; the
 * states work out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up, since a downed VCPU cannot
 * receive interrupts, and offline CPUs in Solaris must still be able to take
 * them.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory: a domain could
 * choose to ignore it and power up a VCPU that the store marks offline.  To
 * play nice, we don't allow that; any attempt to power a CPU on or off from
 * within Solaris fails with ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code. The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe: we hold the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, and poke
 * them to make sure they're not blocked[1]. When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] Note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * While entering the barrier, we can poke each CPU at most 256 times
 * a second.
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int	ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with the hypercall if we are asking about ourselves.
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

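/*
 * Xenstore state-change callback: once the store comes up, register a
 * watch on the "cpu" subtree so that vcpu_config_event() is invoked for
 * virtual CPU configuration changes.
 */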
void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}
}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called, either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

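/*
 * Copy the register state saved in a thread's PCB (by setjmp(), below)
 * into the user_regs portion of a vcpu guest context.
 */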
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

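/*
 * Cross-call handler used to halt a CPU: print the optional message and
 * then take this vcpu down in the hypervisor.
 */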
int
mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
{
	char *msg = (char *)arg1;

	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
	return (0);
}

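/*
 * As described in the block comment above, powering CPUs on and off from
 * within the domain is not supported; such requests must come via the
 * xenstore "cpu" entries.
 */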
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

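/*
 * Bring every other CPU into the safe phase described at the top of this
 * file: mark each powered-on CPU CPU_PHASE_WAIT_SAFE and poke it (at most
 * once per POKE_TIMEOUT) until it acknowledges by entering CPU_PHASE_SAFE.
 * Called with cpu_lock held; paired with mp_leave_barrier().
 */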
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

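/*
 * Release the CPUs gathered by mp_enter_barrier() and let them resume via
 * start_cpus().  Any CPU still outside the safe phase at this point means
 * the barrier is broken, so we tell the hypervisor we've crashed.
 */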
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

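/*
 * Power off a quiesced vcpu: bring all other CPUs to the barrier, take the
 * vcpu down in the hypervisor, and update its Solaris CPU state to reflect
 * the poweroff.  Called with cpu_lock held.
 */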
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		if (cp->cpu_flags & CPU_ENABLE)
			ncpus_intr_enabled--;

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

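/*
 * Handle a xenstore request to power off a CPU: take it P_OFFLINE first,
 * then power off the vcpu once it is quiesced.  Because cpu_lock is dropped
 * between the two steps, we may have to retry if another thread brings the
 * CPU back out of the quiesced state in the meantime.
 */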
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

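/*
 * Power a previously powered-off vcpu back on and mark it ready to run.
 * Called with cpu_lock held.
 */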
static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

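/*
 * Handle a xenstore request to bring a CPU online: either add a brand-new
 * vcpu to the domain, or power a powered-off vcpu back on and bring it to
 * the P_ONLINE state.
 */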
static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

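/*
 * Report the outcome of an externally initiated (xenstore) CPU state change
 * on the console, decoding the more likely error codes.
 */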
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	ps = NULL;
	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

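/*
 * Taskq callback: read the target availability ("online"/"offline") of the
 * given CPU from xenstore and apply it.
 */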
static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

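/*
 * Xenbus watch callback fired when anything under the "cpu" node changes:
 * extract the CPU id from the path and hand the actual work off to the
 * config taskq.
 */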
/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

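/*
 * Hand the hypervisor the initial context for a vcpu (VCPUOP_initialise),
 * decoding the error codes it can return into something readable.
 */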
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

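/*
 * Ask the hypervisor to start running the given vcpu (VCPUOP_up).  On
 * failure, print a diagnostic and return a deliberately odd errno so the
 * failure stands out.
 */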
long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

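/*
 * Ask the hypervisor to stop running the given vcpu (VCPUOP_down).  The only
 * possible failures indicate a bad CPU id, so they are fatal.
 */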
long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}