/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. */ /* * Copyright 2019 Joyent, Inc. */ /* * Welcome to the world of the "real mode platter". * See also startup.c, mpcore.s and apic.c for related routines. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern cpuset_t cpu_ready_set; extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot); extern void real_mode_start_cpu(void); extern void real_mode_start_cpu_end(void); extern void real_mode_stop_cpu_stage1(void); extern void real_mode_stop_cpu_stage1_end(void); extern void real_mode_stop_cpu_stage2(void); extern void real_mode_stop_cpu_stage2_end(void); void rmp_gdt_init(rm_platter_t *); /* * Fill up the real mode platter to make it easy for real mode code to * kick it off. This area should really be one passed by boot to kernel * and guaranteed to be below 1MB and aligned to 16 bytes. Should also * have identical physical and virtual address in paged mode. */ static ushort_t *warm_reset_vector = NULL; int mach_cpucontext_init(void) { ushort_t *vec; ulong_t addr; struct rm_platter *rm = (struct rm_platter *)rm_platter_va; if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR, sizeof (vec), PROT_READ | PROT_WRITE))) return (-1); /* * setup secondary cpu bios boot up vector * Write page offset to 0x467 and page frame number to 0x469. */ addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa; vec[0] = (ushort_t)(addr & PAGEOFFSET); vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4); warm_reset_vector = vec; /* Map real mode platter into kas so kernel can access it. */ hat_devload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE, btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC, HAT_LOAD_NOCONSIST); /* Copy CPU startup code to rm_platter if it's still during boot. */ if (!plat_dr_enabled()) { ASSERT((size_t)real_mode_start_cpu_end - (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE); bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code, (size_t)real_mode_start_cpu_end - (size_t)real_mode_start_cpu); } return (0); } void mach_cpucontext_fini(void) { if (warm_reset_vector) psm_unmap_phys((caddr_t)warm_reset_vector, sizeof (warm_reset_vector)); hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE, HAT_UNLOAD); } extern void *long_mode_64(void); /*ARGSUSED*/ void rmp_gdt_init(rm_platter_t *rm) { /* Use the kas address space for the CPU startup thread. */ if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) { panic("Cannot initialize CPUs; kernel's 64-bit page tables\n" "located above 4G in physical memory (@ 0x%lx)", mmu_ptob(kas.a_hat->hat_htable->ht_pfn)); } /* * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY * by code in real_mode_start_cpu(): * * GDT[0]: NULL selector * GDT[1]: 64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1 * * Clear the IDT as interrupts will be off and a limit of 0 will cause * the CPU to triple fault and reset on an NMI, seemingly as reasonable * a course of action as any other, though it may cause the entire * platform to reset in some cases... */ rm->rm_temp_gdt[0] = 0; rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL; rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1); rm->rm_temp_gdt_base = rm_platter_pa + (uint32_t)offsetof(rm_platter_t, rm_temp_gdt); rm->rm_temp_idt_lim = 0; rm->rm_temp_idt_base = 0; /* * Since the CPU needs to jump to protected mode using an identity * mapped address, we need to calculate it here. */ rm->rm_longmode64_addr = rm_platter_pa + (uint32_t)((uintptr_t)long_mode_64 - (uintptr_t)real_mode_start_cpu); } static void * mach_cpucontext_alloc_tables(struct cpu *cp) { tss_t *ntss; struct cpu_tables *ct; size_t ctsize; /* * Allocate space for stack, tss, gdt and idt. We round the size * allotted for cpu_tables up, so that the TSS is on a unique page. * This is more efficient when running in virtual machines. */ ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE); ct = kmem_zalloc(ctsize, KM_SLEEP); if ((uintptr_t)ct & PAGEOFFSET) panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables", cp->cpu_id); ntss = cp->cpu_tss = &ct->ct_tss; uintptr_t va; size_t len; /* * #DF (double fault). */ ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)]; /* * #NM (non-maskable interrupt) */ ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)]; /* * #MC (machine check exception / hardware error) */ ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)]; /* * #DB, #BP debug interrupts and KDI/kmdb */ ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp; if (kpti_enable == 1) { /* * #GP, #PF, #SS fault interrupts */ ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp; /* * Used by all other interrupts */ ntss->tss_ist6 = (uint64_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp; /* * On AMD64 we need to make sure that all of the pages of the * struct cpu_tables are punched through onto the user CPU for * kpti. * * The final page will always be the TSS, so treat that * separately. */ for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE; len >= MMU_PAGESIZE; len -= MMU_PAGESIZE, va += MMU_PAGESIZE) { /* The doublefault stack must be RW */ hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE); } ASSERT3U((uintptr_t)ntss, ==, va); hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ); } /* * Set I/O bit map offset equal to size of TSS segment limit * for no I/O permission map. This will cause all user I/O * instructions to generate #gp fault. */ ntss->tss_bitmapbase = sizeof (*ntss); /* * Setup kernel tss. */ set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss, sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL); return (ct); } void * mach_cpucontext_xalloc(struct cpu *cp, int optype) { size_t len; struct cpu_tables *ct; rm_platter_t *rm = (rm_platter_t *)rm_platter_va; static int cpu_halt_code_ready; if (optype == MACH_CPUCONTEXT_OP_STOP) { ASSERT(plat_dr_enabled()); /* * The WARM_RESET_VECTOR has a limitation that the physical * address written to it must be page-aligned. To work around * this limitation, the CPU stop code has been splitted into * two stages. * The stage 2 code, which implements the real logic to halt * CPUs, is copied to the rm_cpu_halt_code field in the real * mode platter. The stage 1 code, which simply jumps to the * stage 2 code in the rm_cpu_halt_code field, is copied to * rm_code field in the real mode platter and it may be * overwritten after the CPU has been stopped. */ if (!cpu_halt_code_ready) { /* * The rm_cpu_halt_code field in the real mode platter * is used by the CPU stop code only. So only copy the * CPU stop stage 2 code into the rm_cpu_halt_code * field on the first call. */ len = (size_t)real_mode_stop_cpu_stage2_end - (size_t)real_mode_stop_cpu_stage2; ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE); bcopy((caddr_t)real_mode_stop_cpu_stage2, (caddr_t)rm->rm_cpu_halt_code, len); cpu_halt_code_ready = 1; } /* * The rm_code field in the real mode platter is shared by * the CPU start, CPU stop, CPR and fast reboot code. So copy * the CPU stop stage 1 code into the rm_code field every time. */ len = (size_t)real_mode_stop_cpu_stage1_end - (size_t)real_mode_stop_cpu_stage1; ASSERT(len <= RM_PLATTER_CODE_SIZE); bcopy((caddr_t)real_mode_stop_cpu_stage1, (caddr_t)rm->rm_code, len); rm->rm_cpu_halted = 0; return (cp->cpu_m.mcpu_mach_ctx_ptr); } else if (optype != MACH_CPUCONTEXT_OP_START) { return (NULL); } /* * Only need to allocate tables when starting CPU. * Tables allocated when starting CPU will be reused when stopping CPU. */ ct = mach_cpucontext_alloc_tables(cp); if (ct == NULL) { return (NULL); } /* Copy CPU startup code to rm_platter for CPU hot-add operations. */ if (plat_dr_enabled()) { bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code, (size_t)real_mode_start_cpu_end - (size_t)real_mode_start_cpu); } /* * Now copy all that we've set up onto the real mode platter * for the real mode code to digest as part of starting the cpu. */ rm->rm_idt_base = cp->cpu_idt; rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1; rm->rm_gdt_base = cp->cpu_gdt; rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1; /* * CPU needs to access kernel address space after powering on. */ rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE); rm->rm_cpu = cp->cpu_id; /* * We need to mask off any bits set on our boot CPU that can't apply * while the subject CPU is initializing. If appropriate, they are * enabled later on. */ rm->rm_cr4 = getcr4(); rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE); rmp_gdt_init(rm); return (ct); } void mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype) { struct cpu_tables *ct = arg; ASSERT(&ct->ct_tss == cp->cpu_tss); if (optype == MACH_CPUCONTEXT_OP_START) { switch (err) { case 0: /* * Save pointer for reuse when stopping CPU. */ cp->cpu_m.mcpu_mach_ctx_ptr = arg; break; case ETIMEDOUT: /* * The processor was poked, but failed to start before * we gave up waiting for it. In case it starts later, * don't free anything. */ cp->cpu_m.mcpu_mach_ctx_ptr = arg; break; default: /* * Some other, passive, error occurred. */ kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE)); cp->cpu_tss = NULL; break; } } else if (optype == MACH_CPUCONTEXT_OP_STOP) { switch (err) { case 0: /* * Free resources allocated when starting CPU. */ kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE)); cp->cpu_tss = NULL; cp->cpu_m.mcpu_mach_ctx_ptr = NULL; break; default: /* * Don't touch table pointer in case of failure. */ break; } } else { ASSERT(0); } } void * mach_cpucontext_alloc(struct cpu *cp) { return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START)); } void mach_cpucontext_free(struct cpu *cp, void *arg, int err) { mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START); } /* * "Enter monitor." Called via cross-call from stop_other_cpus(). */ int mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused) { char *msg = (char *)arg1; if (msg) prom_printf("%s\n", msg); /*CONSTANTCONDITION*/ while (1) ; return (0); } void mach_cpu_idle(void) { x86_md_clear(); i86_halt(); } void mach_cpu_pause(volatile char *safe) { /* * This cpu is now safe. */ *safe = PAUSE_WAIT; membar_enter(); /* make sure stores are flushed */ /* * Now we wait. When we are allowed to continue, safe * will be set to PAUSE_IDLE. */ while (*safe != PAUSE_IDLE) SMT_PAUSE(); } /* * Power on the target CPU. */ int mp_cpu_poweron(struct cpu *cp) { int error; cpuset_t tempset; processorid_t cpuid; ASSERT(cp != NULL); cpuid = cp->cpu_id; if (use_mp == 0 || plat_dr_support_cpu() == 0) { return (ENOTSUP); } else if (cpuid < 0 || cpuid >= max_ncpus) { return (EINVAL); } /* * The currrent x86 implementaiton of mp_cpu_configure() and * mp_cpu_poweron() have a limitation that mp_cpu_poweron() could only * be called once after calling mp_cpu_configure() for a specific CPU. * It's because mp_cpu_poweron() will destroy data structure created * by mp_cpu_configure(). So reject the request if the CPU has already * been powered on once after calling mp_cpu_configure(). * This limitaiton only affects the p_online syscall and the DR driver * won't be affected because the DR driver always invoke public CPU * management interfaces in the predefined order: * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure() */ if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) { return (ENOTSUP); } /* * Check if there's at least a Mbyte of kmem available * before attempting to start the cpu. */ if (kmem_avail() < 1024 * 1024) { /* * Kick off a reap in case that helps us with * later attempts .. */ kmem_reap(); return (ENOMEM); } affinity_set(CPU->cpu_id); /* * Start the target CPU. No need to call mach_cpucontext_fini() * if mach_cpucontext_init() fails. */ if ((error = mach_cpucontext_init()) == 0) { error = mp_start_cpu_common(cp, B_FALSE); mach_cpucontext_fini(); } if (error != 0) { affinity_clear(); return (error); } /* Wait for the target cpu to reach READY state. */ tempset = cpu_ready_set; while (!CPU_IN_SET(tempset, cpuid)) { delay(1); tempset = *((volatile cpuset_t *)&cpu_ready_set); } /* Mark the target CPU as available for mp operation. */ CPUSET_ATOMIC_ADD(mp_cpus, cpuid); /* Free the space allocated to hold the microcode file */ ucode_cleanup(); affinity_clear(); return (0); } #define MP_CPU_DETACH_MAX_TRIES 5 #define MP_CPU_DETACH_DELAY 100 static int mp_cpu_detach_driver(dev_info_t *dip) { int i; int rv = EBUSY; dev_info_t *pdip; pdip = ddi_get_parent(dip); ASSERT(pdip != NULL); /* * Check if caller holds pdip busy - can cause deadlocks in * e_ddi_branch_unconfigure(), which calls devfs_clean(). */ if (DEVI_BUSY_OWNED(pdip)) { return (EDEADLOCK); } for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) { if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) { rv = 0; break; } DELAY(MP_CPU_DETACH_DELAY); } return (rv); } /* * Power off the target CPU. * Note: cpu_lock will be released and then reacquired. */ int mp_cpu_poweroff(struct cpu *cp) { int rv = 0; void *ctx; dev_info_t *dip = NULL; rm_platter_t *rm = (rm_platter_t *)rm_platter_va; extern void cpupm_start(cpu_t *); extern void cpupm_stop(cpu_t *); ASSERT(cp != NULL); ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0); ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0); if (use_mp == 0 || plat_dr_support_cpu() == 0) { return (ENOTSUP); } /* * There is no support for powering off cpu0 yet. * There are many pieces of code which have a hard dependency on cpu0. */ if (cp->cpu_id == 0) { return (ENOTSUP); }; if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) { return (ENXIO); } ASSERT(dip != NULL); if (mp_cpu_detach_driver(dip) != 0) { rv = EBUSY; goto out_online; } /* Allocate CPU context for stopping */ if (mach_cpucontext_init() != 0) { rv = ENXIO; goto out_online; } ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP); if (ctx == NULL) { rv = ENXIO; goto out_context_fini; } cpupm_stop(cp); cpu_event_fini_cpu(cp); if (cp->cpu_m.mcpu_cmi_hdl != NULL) { cmi_fini(cp->cpu_m.mcpu_cmi_hdl); cp->cpu_m.mcpu_cmi_hdl = NULL; } rv = mach_cpu_stop(cp, ctx); if (rv != 0) { goto out_enable_cmi; } /* Wait until the target CPU has been halted. */ while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) { delay(1); } rm->rm_cpu_halted = 0xffff; /* CPU_READY has been cleared by mach_cpu_stop. */ ASSERT((cp->cpu_flags & CPU_READY) == 0); ASSERT((cp->cpu_flags & CPU_RUNNING) == 0); cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF; CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id); mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP); mach_cpucontext_fini(); return (0); out_enable_cmi: { cmi_hdl_t hdl; if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) { if (is_x86_feature(x86_featureset, X86FSET_MCA)) cmi_mca_init(hdl); cp->cpu_m.mcpu_cmi_hdl = hdl; } } cpu_event_init_cpu(cp); cpupm_start(cp); mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP); out_context_fini: mach_cpucontext_fini(); out_online: (void) e_ddi_branch_configure(dip, NULL, 0); if (rv != EAGAIN && rv != ETIME) { rv = ENXIO; } return (rv); } /* * Return vcpu state, since this could be a virtual environment that we * are unaware of, return "unknown". */ /* ARGSUSED */ int vcpu_on_pcpu(processorid_t cpu) { return (VCPU_STATE_UNKNOWN); }