1843e1988Sjohnlev /* 2843e1988Sjohnlev * CDDL HEADER START 3843e1988Sjohnlev * 4843e1988Sjohnlev * The contents of this file are subject to the terms of the 5843e1988Sjohnlev * Common Development and Distribution License (the "License"). 6843e1988Sjohnlev * You may not use this file except in compliance with the License. 7843e1988Sjohnlev * 8843e1988Sjohnlev * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9843e1988Sjohnlev * or http://www.opensolaris.org/os/licensing. 10843e1988Sjohnlev * See the License for the specific language governing permissions 11843e1988Sjohnlev * and limitations under the License. 12843e1988Sjohnlev * 13843e1988Sjohnlev * When distributing Covered Code, include this CDDL HEADER in each 14843e1988Sjohnlev * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15843e1988Sjohnlev * If applicable, add the following below this CDDL HEADER, with the 16843e1988Sjohnlev * fields enclosed by brackets "[]" replaced with your own identifying 17843e1988Sjohnlev * information: Portions Copyright [yyyy] [name of copyright owner] 18843e1988Sjohnlev * 19843e1988Sjohnlev * CDDL HEADER END 20843e1988Sjohnlev */ 21843e1988Sjohnlev 22843e1988Sjohnlev /* 23843e1988Sjohnlev * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24843e1988Sjohnlev * Use is subject to license terms. 25843e1988Sjohnlev */ 26843e1988Sjohnlev 27*1d03c31eSjohnlev /* 28*1d03c31eSjohnlev * Virtual CPU management. 29*1d03c31eSjohnlev * 30*1d03c31eSjohnlev * VCPUs can be controlled in one of two ways; through the domain itself 31*1d03c31eSjohnlev * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()). 
32*1d03c31eSjohnlev * Unfortunately, the terminology is used in different ways; they work out as 33*1d03c31eSjohnlev * follows: 34*1d03c31eSjohnlev * 35*1d03c31eSjohnlev * P_ONLINE: the VCPU is up and running, taking interrupts and running threads 36*1d03c31eSjohnlev * 37*1d03c31eSjohnlev * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the 38*1d03c31eSjohnlev * hypervisor on the idle thread). It must be up since a downed VCPU cannot 39*1d03c31eSjohnlev * receive interrupts, and we require this for offline CPUs in Solaris. 40*1d03c31eSjohnlev * 41*1d03c31eSjohnlev * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called 42*1d03c31eSjohnlev * xen_vcpu_down() for it). It can't take interrupts or run anything, though 43*1d03c31eSjohnlev * if it has run previously, its software state (cpu_t, machcpu structures, IPI 44*1d03c31eSjohnlev * event channels, etc.) will still exist. 45*1d03c31eSjohnlev * 46*1d03c31eSjohnlev * The hypervisor has two notions of CPU states as represented in the store: 47*1d03c31eSjohnlev * 48*1d03c31eSjohnlev * "offline": the VCPU is down. Corresponds to P_POWEROFF. 49*1d03c31eSjohnlev * 50*1d03c31eSjohnlev * "online": the VCPU is running. Corresponds to a CPU state other than 51*1d03c31eSjohnlev * P_POWEROFF. 52*1d03c31eSjohnlev * 53*1d03c31eSjohnlev * Currently, only a notification via xenstore can bring a CPU into a 54*1d03c31eSjohnlev * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR, 55*1d03c31eSjohnlev * P_OFFLINE, etc. We need to be careful to treat xenstore notifications 56*1d03c31eSjohnlev * idempotently, as we'll get 'duplicate' entries when we resume a domain. 57*1d03c31eSjohnlev * 58*1d03c31eSjohnlev * Note that the xenstore configuration is strictly advisory, in that a domain 59*1d03c31eSjohnlev * can choose to ignore it and still power up a VCPU in the offline state. To 60*1d03c31eSjohnlev * play nice, we don't allow it. 
Thus, any attempt to power on/off a CPU is 61*1d03c31eSjohnlev * ENOTSUP from within Solaris. 62*1d03c31eSjohnlev * 63*1d03c31eSjohnlev * Powering off a VCPU and suspending the domain use similar code. The 64*1d03c31eSjohnlev * difficulty here is that we must ensure that each VCPU is in a stable 65*1d03c31eSjohnlev * state: it must have a saved PCB, and not be responding to interrupts 66*1d03c31eSjohnlev * (since we are just about to remove its ability to run on a real CPU, 67*1d03c31eSjohnlev * possibly forever). However, an offline CPU in Solaris can take 68*1d03c31eSjohnlev * cross-call interrupts, as mentioned, so we must go through a 69*1d03c31eSjohnlev * two-stage process. First, we use the standard Solaris pause_cpus(). 70*1d03c31eSjohnlev * This ensures that all CPUs are either in mach_cpu_pause() or 71*1d03c31eSjohnlev * mach_cpu_idle(), and nothing will cross-call them. 72*1d03c31eSjohnlev * 73*1d03c31eSjohnlev * Powered-off-CPUs are already safe, as we own the cpu_lock needed to 74*1d03c31eSjohnlev * bring them back up, and in state CPU_PHASE_POWERED_OFF. 75*1d03c31eSjohnlev * 76*1d03c31eSjohnlev * Running CPUs are spinning in mach_cpu_pause() waiting for either 77*1d03c31eSjohnlev * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE. 78*1d03c31eSjohnlev * 79*1d03c31eSjohnlev * Offline CPUs are either running the idle thread and periodically 80*1d03c31eSjohnlev * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor. 81*1d03c31eSjohnlev * 82*1d03c31eSjohnlev * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as 83*1d03c31eSjohnlev * poking them to make sure they're not blocked[1]. When every CPU has 84*1d03c31eSjohnlev * responded by reaching a safe state and setting CPU_PHASE_SAFE, we 85*1d03c31eSjohnlev * know we can suspend, or power-off a CPU, without problems. 
86*1d03c31eSjohnlev * 87*1d03c31eSjohnlev * [1] note that we have to repeatedly poke offline CPUs: it's the only 88*1d03c31eSjohnlev * way to ensure that the CPU doesn't miss the state change before 89*1d03c31eSjohnlev * dropping into HYPERVISOR_block(). 90*1d03c31eSjohnlev */ 91*1d03c31eSjohnlev 92843e1988Sjohnlev #pragma ident "%Z%%M% %I% %E% SMI" 93843e1988Sjohnlev 94843e1988Sjohnlev #include <sys/types.h> 95843e1988Sjohnlev #include <sys/systm.h> 96843e1988Sjohnlev #include <sys/param.h> 97843e1988Sjohnlev #include <sys/taskq.h> 98843e1988Sjohnlev #include <sys/cmn_err.h> 99843e1988Sjohnlev #include <sys/archsystm.h> 100843e1988Sjohnlev #include <sys/machsystm.h> 101843e1988Sjohnlev #include <sys/segments.h> 102843e1988Sjohnlev #include <sys/cpuvar.h> 103843e1988Sjohnlev #include <sys/x86_archext.h> 104843e1988Sjohnlev #include <sys/controlregs.h> 105843e1988Sjohnlev #include <sys/hypervisor.h> 106843e1988Sjohnlev #include <sys/xpv_panic.h> 107*1d03c31eSjohnlev #include <sys/mman.h> 108*1d03c31eSjohnlev #include <sys/psw.h> 109843e1988Sjohnlev #include <sys/cpu.h> 110*1d03c31eSjohnlev #include <sys/sunddi.h> 111*1d03c31eSjohnlev #include <util/sscanf.h> 112*1d03c31eSjohnlev #include <vm/hat_i86.h> 113*1d03c31eSjohnlev #include <vm/hat.h> 114*1d03c31eSjohnlev #include <vm/as.h> 115843e1988Sjohnlev 116843e1988Sjohnlev #include <xen/public/io/xs_wire.h> 117*1d03c31eSjohnlev #include <xen/sys/xenbus_impl.h> 118*1d03c31eSjohnlev #include <xen/public/vcpu.h> 119843e1988Sjohnlev 120*1d03c31eSjohnlev #define CPU_PHASE_NONE 0 121*1d03c31eSjohnlev #define CPU_PHASE_WAIT_SAFE 1 122*1d03c31eSjohnlev #define CPU_PHASE_SAFE 2 123*1d03c31eSjohnlev #define CPU_PHASE_POWERED_OFF 3 124*1d03c31eSjohnlev 125*1d03c31eSjohnlev /* 126*1d03c31eSjohnlev * We can only poke CPUs during barrier enter 256 times a second at 127*1d03c31eSjohnlev * most. 
128*1d03c31eSjohnlev */ 129*1d03c31eSjohnlev #define POKE_TIMEOUT (NANOSEC / 256) 130843e1988Sjohnlev 131843e1988Sjohnlev static taskq_t *cpu_config_tq; 132*1d03c31eSjohnlev static int cpu_phase[NCPU]; 133*1d03c31eSjohnlev 134843e1988Sjohnlev static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t); 135843e1988Sjohnlev static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *); 136843e1988Sjohnlev 137843e1988Sjohnlev /* 138843e1988Sjohnlev * These routines allocate any global state that might be needed 139843e1988Sjohnlev * while starting cpus. For virtual cpus, there is no such state. 140843e1988Sjohnlev */ 141843e1988Sjohnlev int 142843e1988Sjohnlev mach_cpucontext_init(void) 143843e1988Sjohnlev { 144843e1988Sjohnlev return (0); 145843e1988Sjohnlev } 146843e1988Sjohnlev 147843e1988Sjohnlev void 148843e1988Sjohnlev do_cpu_config_watch(int state) 149843e1988Sjohnlev { 150843e1988Sjohnlev static struct xenbus_watch cpu_config_watch; 151843e1988Sjohnlev 152843e1988Sjohnlev if (state != XENSTORE_UP) 153843e1988Sjohnlev return; 154843e1988Sjohnlev cpu_config_watch.node = "cpu"; 155843e1988Sjohnlev cpu_config_watch.callback = vcpu_config_event; 156843e1988Sjohnlev if (register_xenbus_watch(&cpu_config_watch)) { 157843e1988Sjohnlev taskq_destroy(cpu_config_tq); 158843e1988Sjohnlev cmn_err(CE_WARN, "do_cpu_config_watch: " 159843e1988Sjohnlev "failed to set vcpu config watch"); 160843e1988Sjohnlev } 161843e1988Sjohnlev 162843e1988Sjohnlev } 163843e1988Sjohnlev 164843e1988Sjohnlev /* 165843e1988Sjohnlev * This routine is called after all the "normal" MP startup has 166843e1988Sjohnlev * been done; a good place to start watching xen store for virtual 167843e1988Sjohnlev * cpu hot plug events. 
 */
void
mach_cpucontext_fini(void)
{

	/* Single-threaded taskq: vcpu config events are serialized. */
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 *
 * The caller has already set the register/stack portion of 'vgc';
 * here we fill in segments, the trap table (from our IDT), the GDT
 * frame, control registers and the hypervisor callbacks, then hand
 * the whole context to the hypervisor for cp->cpu_id.  Returns the
 * result of xen_vcpu_initialize().
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		/*
		 * Only vectors that translate cleanly are installed;
		 * the rest of trap_ctxt stays zeroed.
		 */
		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	/* gs_base_kernel holds the cpu_t pointer, per %gs convention. */
	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	/* Non-zero so an early 'goto done' takes the failure path. */
	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	/* The hypervisor requires the GDT to be read-only in the guest. */
	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	/* Success: the cpu_t itself doubles as the opaque context handle. */
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		/* Started successfully; nothing to undo. */
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

/*
 * Copy the non-volatile register state saved in a setjmp()-style
 * label_t (PC, SP, frame pointer and callee-saved registers) into the
 * hypervisor's user_regs structure.
 */
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context was
 * saved by the setjmp() inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	/* No sensible recovery during resume; the assert must hold. */
	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	/* Block interrupts so nothing can disturb us once we're "safe". */
	ulong_t flags = intr_clear();

	/*
	 * setjmp() saves our PCB; if this CPU is powered off and later
	 * resumed, mach_cpucontext_restore() restarts us here via the
	 * emulated longjmp() (non-zero return), falling out of the if.
	 */
	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		/* Spin until mp_leave_barrier() moves us out of SAFE. */
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else  {
		/* Sleep in the hypervisor until the next event/poke. */
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	/* Acknowledge the pause request before we start spinning. */
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

/*
 * Halt this CPU permanently by taking its VCPU down in the hypervisor.
 */
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

/*
 * Power transitions initiated from within Solaris are not supported;
 * see the block comment at the top of this file.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

/*
 * Drive every other CPU into a stable, safe state (see the block
 * comment at the top of this file): pause them all, then repeatedly
 * request and poke until each powered-on CPU reports CPU_PHASE_SAFE.
 */
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		/* Rate-limit pokes to POKE_TIMEOUT intervals. */
		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				/* First pass: ask this CPU to go safe. */
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				/*
				 * Keep poking: an offline CPU may have
				 * missed the state change and blocked
				 * in the hypervisor (see note [1] above).
				 */
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

/*
 * Undo mp_enter_barrier(): release every safe CPU and restart them.
 */
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

/*
 * Power off one quiesced VCPU under the barrier.  Returns 0 on
 * success or the error from xen_vcpu_down().
 */
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* We cannot power off the CPU we are running on. */
	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

/*
 * Take CPU 'id' through P_OFFLINE to powered off, retrying while some
 * other thread races it back to a non-quiesced state.
 */
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	/* Already powered off: nothing to do (idempotent, see above). */
	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	/* Stay on this CPU while the new one is brought up. */
	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

/*
 * Bring a powered-off VCPU back up in the hypervisor and mark its
 * cpu_t runnable again.  Returns 0, ENXIO if the hypervisor no longer
 * has the vcpu, or the error from xen_vcpu_up().
 */
static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

/*
 * Handle an externally requested power-on of CPU 'id': either create
 * a brand-new VCPU, or power a known one back on and online it.
 */
static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	/* Ids beyond what we've ever had must be newly created. */
	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	/* Not powered off: nothing to do (idempotent, see above). */
	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

/*
 * Log the outcome of an externally initiated state change via
 * cmn_err(), including a human-readable reason on failure.
 */
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
808843e1988Sjohnlev break; 809843e1988Sjohnlev default: 810843e1988Sjohnlev cmn_err(CE_PANIC, "unknown state %u\n", newstate); 811843e1988Sjohnlev break; 812843e1988Sjohnlev } 813843e1988Sjohnlev 814843e1988Sjohnlev len = snprintf(report, REPORT_LEN, 815843e1988Sjohnlev "cpu%d: externally initiated %s", id, ps); 816843e1988Sjohnlev 817843e1988Sjohnlev if (!error) { 818843e1988Sjohnlev cmn_err(CE_CONT, "!%s\n", report); 819843e1988Sjohnlev kmem_free(report, REPORT_LEN); 820843e1988Sjohnlev return; 821843e1988Sjohnlev } 822843e1988Sjohnlev 823843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 824843e1988Sjohnlev " failed, error %d: ", error); 825843e1988Sjohnlev switch (error) { 826843e1988Sjohnlev case EEXIST: 827843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 828843e1988Sjohnlev "cpu already %s", ps ? ps : "?"); 829843e1988Sjohnlev break; 830843e1988Sjohnlev case ESRCH: 831843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 832843e1988Sjohnlev "cpu not found"); 833843e1988Sjohnlev break; 834843e1988Sjohnlev case EINVAL: 835843e1988Sjohnlev case EALREADY: 836843e1988Sjohnlev break; 837843e1988Sjohnlev case EPERM: 838843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 839843e1988Sjohnlev "insufficient privilege (0x%x)", id); 840843e1988Sjohnlev break; 841843e1988Sjohnlev case EBUSY: 842843e1988Sjohnlev switch (newstate) { 843843e1988Sjohnlev case P_ONLINE: 844843e1988Sjohnlev /* 845843e1988Sjohnlev * This return comes from mp_cpu_start - 846843e1988Sjohnlev * we cannot 'start' the boot CPU. 
847843e1988Sjohnlev */ 848843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 849843e1988Sjohnlev "already running"); 850843e1988Sjohnlev break; 851843e1988Sjohnlev case P_POWEROFF: 852843e1988Sjohnlev len += snprintf(report + len, REPORT_LEN - len, 853843e1988Sjohnlev "bound lwps?"); 854843e1988Sjohnlev break; 855843e1988Sjohnlev default: 856843e1988Sjohnlev break; 857843e1988Sjohnlev } 858843e1988Sjohnlev default: 859843e1988Sjohnlev break; 860843e1988Sjohnlev } 861843e1988Sjohnlev 862843e1988Sjohnlev cmn_err(CE_CONT, "%s\n", report); 863843e1988Sjohnlev kmem_free(report, REPORT_LEN); 864843e1988Sjohnlev } 865843e1988Sjohnlev 866843e1988Sjohnlev static void 867843e1988Sjohnlev vcpu_config(void *arg) 868843e1988Sjohnlev { 869843e1988Sjohnlev int id = (int)(uintptr_t)arg; 870843e1988Sjohnlev int error; 871843e1988Sjohnlev char dir[16]; 872843e1988Sjohnlev char *state; 873843e1988Sjohnlev 874843e1988Sjohnlev if ((uint_t)id >= max_ncpus) { 875843e1988Sjohnlev cmn_err(CE_WARN, 876843e1988Sjohnlev "vcpu_config: cpu%d does not fit in this domain", id); 877843e1988Sjohnlev return; 878843e1988Sjohnlev } 879843e1988Sjohnlev 880843e1988Sjohnlev (void) snprintf(dir, sizeof (dir), "cpu/%d", id); 881843e1988Sjohnlev state = kmem_alloc(MAXPATHLEN, KM_SLEEP); 882843e1988Sjohnlev if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) { 883843e1988Sjohnlev if (strcmp(state, "online") == 0) { 884843e1988Sjohnlev error = vcpu_config_poweron(id); 885843e1988Sjohnlev vcpu_config_report(id, P_ONLINE, error); 886843e1988Sjohnlev } else if (strcmp(state, "offline") == 0) { 887843e1988Sjohnlev error = vcpu_config_poweroff(id); 888843e1988Sjohnlev vcpu_config_report(id, P_POWEROFF, error); 889843e1988Sjohnlev } else { 890843e1988Sjohnlev cmn_err(CE_WARN, 891843e1988Sjohnlev "cpu%d: unknown target state '%s'", id, state); 892843e1988Sjohnlev } 893843e1988Sjohnlev } else 894843e1988Sjohnlev cmn_err(CE_WARN, 895843e1988Sjohnlev "cpu%d: unable to read target state from 
xenstore", id); 896843e1988Sjohnlev 897843e1988Sjohnlev kmem_free(state, MAXPATHLEN); 898843e1988Sjohnlev } 899843e1988Sjohnlev 900843e1988Sjohnlev /*ARGSUSED*/ 901843e1988Sjohnlev static void 902843e1988Sjohnlev vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len) 903843e1988Sjohnlev { 904843e1988Sjohnlev const char *path = vec[XS_WATCH_PATH]; 905843e1988Sjohnlev processorid_t id; 906843e1988Sjohnlev char *s; 907843e1988Sjohnlev 908843e1988Sjohnlev if ((s = strstr(path, "cpu/")) != NULL && 909843e1988Sjohnlev sscanf(s, "cpu/%d", &id) == 1) { 910843e1988Sjohnlev /* 911843e1988Sjohnlev * Run the virtual CPU configuration on a separate thread to 912843e1988Sjohnlev * avoid blocking on this event for too long (and for now, 913843e1988Sjohnlev * to ensure configuration requests are serialized.) 914843e1988Sjohnlev */ 915843e1988Sjohnlev (void) taskq_dispatch(cpu_config_tq, 916843e1988Sjohnlev vcpu_config, (void *)(uintptr_t)id, 0); 917843e1988Sjohnlev } 918843e1988Sjohnlev } 919843e1988Sjohnlev 920843e1988Sjohnlev static int 921843e1988Sjohnlev xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc) 922843e1988Sjohnlev { 923843e1988Sjohnlev int err; 924843e1988Sjohnlev 925843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) { 926843e1988Sjohnlev char *str; 927843e1988Sjohnlev int level = CE_WARN; 928843e1988Sjohnlev 929843e1988Sjohnlev switch (err) { 930843e1988Sjohnlev case -X_EINVAL: 931843e1988Sjohnlev /* 932843e1988Sjohnlev * This interface squashes multiple error sources 933843e1988Sjohnlev * to one error code. 
In particular, an X_EINVAL 934843e1988Sjohnlev * code can mean: 935843e1988Sjohnlev * 936843e1988Sjohnlev * - the vcpu id is out of range 937843e1988Sjohnlev * - cs or ss are in ring 0 938843e1988Sjohnlev * - cr3 is wrong 939843e1988Sjohnlev * - an entry in the new gdt is above the 940843e1988Sjohnlev * reserved entry 941843e1988Sjohnlev * - a frame underneath the new gdt is bad 942843e1988Sjohnlev */ 943843e1988Sjohnlev str = "something is wrong :("; 944843e1988Sjohnlev break; 945843e1988Sjohnlev case -X_ENOENT: 946843e1988Sjohnlev str = "no such cpu"; 947843e1988Sjohnlev break; 948843e1988Sjohnlev case -X_ENOMEM: 949843e1988Sjohnlev str = "no mem to copy ctxt"; 950843e1988Sjohnlev break; 951843e1988Sjohnlev case -X_EFAULT: 952843e1988Sjohnlev str = "bad address"; 953843e1988Sjohnlev break; 954843e1988Sjohnlev case -X_EEXIST: 955843e1988Sjohnlev /* 956843e1988Sjohnlev * Hmm. This error is returned if the vcpu has already 957843e1988Sjohnlev * been initialized once before in the lifetime of this 958843e1988Sjohnlev * domain. This is a logic error in the kernel. 
959843e1988Sjohnlev */ 960843e1988Sjohnlev level = CE_PANIC; 961843e1988Sjohnlev str = "already initialized"; 962843e1988Sjohnlev break; 963843e1988Sjohnlev default: 964843e1988Sjohnlev level = CE_PANIC; 965843e1988Sjohnlev str = "<unexpected>"; 966843e1988Sjohnlev break; 967843e1988Sjohnlev } 968843e1988Sjohnlev 969843e1988Sjohnlev cmn_err(level, "vcpu%d: failed to init: error %d: %s", 970843e1988Sjohnlev id, -err, str); 971843e1988Sjohnlev } 972843e1988Sjohnlev return (err); 973843e1988Sjohnlev } 974843e1988Sjohnlev 975843e1988Sjohnlev long 976843e1988Sjohnlev xen_vcpu_up(processorid_t id) 977843e1988Sjohnlev { 978843e1988Sjohnlev long err; 979843e1988Sjohnlev 980843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) { 981843e1988Sjohnlev char *str; 982843e1988Sjohnlev 983843e1988Sjohnlev switch (err) { 984843e1988Sjohnlev case -X_ENOENT: 985843e1988Sjohnlev str = "no such cpu"; 986843e1988Sjohnlev break; 987843e1988Sjohnlev case -X_EINVAL: 988843e1988Sjohnlev /* 989843e1988Sjohnlev * Perhaps this is diagnostic overkill. 
990843e1988Sjohnlev */ 991843e1988Sjohnlev if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0) 992843e1988Sjohnlev str = "bad cpuid"; 993843e1988Sjohnlev else 994843e1988Sjohnlev str = "not initialized"; 995843e1988Sjohnlev break; 996843e1988Sjohnlev default: 997843e1988Sjohnlev str = "<unexpected>"; 998843e1988Sjohnlev break; 999843e1988Sjohnlev } 1000843e1988Sjohnlev 1001843e1988Sjohnlev printf("vcpu%d: failed to start: error %d: %s\n", 1002843e1988Sjohnlev id, -(int)err, str); 1003843e1988Sjohnlev return (EBFONT); /* deliberately silly */ 1004843e1988Sjohnlev } 1005843e1988Sjohnlev return (err); 1006843e1988Sjohnlev } 1007843e1988Sjohnlev 1008843e1988Sjohnlev long 1009843e1988Sjohnlev xen_vcpu_down(processorid_t id) 1010843e1988Sjohnlev { 1011843e1988Sjohnlev long err; 1012843e1988Sjohnlev 1013843e1988Sjohnlev if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) { 1014843e1988Sjohnlev /* 1015843e1988Sjohnlev * X_ENOENT: no such cpu 1016843e1988Sjohnlev * X_EINVAL: bad cpuid 1017843e1988Sjohnlev */ 1018843e1988Sjohnlev panic("vcpu%d: failed to stop: error %d", id, -(int)err); 1019843e1988Sjohnlev } 1020843e1988Sjohnlev 1021843e1988Sjohnlev return (err); 1022843e1988Sjohnlev } 1023