1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30/*
31 * This file and its contents are supplied under the terms of the
32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 * You may only use this file in accordance with the terms of version
34 * 1.0 of the CDDL.
35 *
36 * A full copy of the text of the CDDL should have accompanied this
37 * source.  A copy of the CDDL is also available via the Internet at
38 * http://www.illumos.org/license/CDDL.
39 *
40 * Copyright 2014 Pluribus Networks Inc.
41 * Copyright 2018 Joyent, Inc.
42 * Copyright 2020 Oxide Computer Company
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD$");
47
48#include <sys/param.h>
49#include <sys/pcpu.h>
50#include <sys/systm.h>
51#include <sys/sysctl.h>
52#include <sys/x86_archext.h>
53
54#include <machine/clock.h>
55#include <machine/cpufunc.h>
56#include <machine/md_var.h>
57#include <machine/segments.h>
58#include <machine/specialreg.h>
59
60#include <machine/vmm.h>
61
62#include "vmm_host.h"
63#include "vmm_ktr.h"
64#include "vmm_util.h"
65#include "x86.h"
66
SYSCTL_DECL(_hw_vmm);
#ifdef __FreeBSD__
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    NULL);
#endif

/* Highest leaf we implement in the hypervisor CPUID range (0x40000000). */
#define	CPUID_VM_HIGH		0x40000000

/* Hypervisor vendor signature returned in ebx/ecx/edx of leaf 0x40000000. */
static const char bhyve_id[12] = "bhyve bhyve ";

/* Count of guest CPUID accesses that fell through to the default leaf. */
static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");

/* Tunable: advertise Intel topology enumeration leaf 0xB to the guest. */
static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);
84
85/*
86 * Round up to the next power of two, if necessary, and then take log2.
87 * Returns -1 if argument is zero.
88 */
89static __inline int
90log2(u_int x)
91{
92
93	return (fls(x << (1 - powerof2(x))) - 1);
94}
95
/*
 * Emulate the CPUID instruction on behalf of a guest vcpu.
 *
 * The requested leaf is taken from *rax and the sub-leaf from *rcx.  The
 * emulated result is written back through rax/rbx/rcx/rdx with the upper
 * 32 bits cleared (matching real CPUID behavior in long mode).  Always
 * returns 1 to indicate the exit was handled.
 */
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx,
    uint64_t *rcx, uint64_t *rdx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, level, width = 0, x2apic_id = 0;
	unsigned int func, regs[4], logical_cpus = 0, param;
	enum x2apic_state x2apic_state;
	uint16_t cores, maxcpus, sockets, threads;

	/*
	 * The function of CPUID is controlled through the provided value of
	 * %eax (and secondarily %ecx, for certain leaf data).
	 */
	func = (uint32_t)*rax;
	param = (uint32_t)*rcx;

	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param);

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.
	 */
	if (cpu_exthigh != 0 && func >= 0x80000000) {
		if (func > cpu_exthigh)
			func = cpu_exthigh;
	} else if (func >= 0x40000000) {
		if (func > CPUID_VM_HIGH)
			func = CPUID_VM_HIGH;
	} else if (func > cpu_high) {
		func = cpu_high;
	}

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
			cpuid_count(func, param, regs);
			break;
		case CPUID_8000_0008:
			cpuid_count(func, param, regs);
			if (vmm_is_svm()) {
				/*
				 * As on Intel (0000_0007:0, EDX), mask out
				 * unsupported or unsafe AMD extended features
				 * (8000_0008 EBX).
				 */
				regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
				    AMDFEID_XSAVEERPTR);

				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				/*
				 * Here, width is ApicIdCoreIdSize, present on
				 * at least Family 15h and newer.  It
				 * represents the "number of bits in the
				 * initial apicid that indicate thread id
				 * within a package."
				 *
				 * Our topo_probe_amd() uses it for
				 * pkg_id_shift and other OSes may rely on it.
				 */
				width = MIN(0xF, log2(threads * cores));
				if (width < 0x4)
					width = 0;
				logical_cpus = MIN(0xFF, threads * cores - 1);
				regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
			}
			break;

		case CPUID_8000_0001:
			cpuid_count(func, param, regs);

			/*
			 * Hide SVM from guest.
			 */
			regs[2] &= ~AMDID2_SVM;

			/*
			 * Don't advertise extended performance counter MSRs
			 * to the guest.
			 */
			regs[2] &= ~AMDID2_PCXC;
			regs[2] &= ~AMDID2_PNXC;
			regs[2] &= ~AMDID2_PTSCEL2I;

			/*
			 * Don't advertise Instruction Based Sampling feature.
			 */
			regs[2] &= ~AMDID2_IBS;

			/* NodeID MSR not available */
			regs[2] &= ~AMDID2_NODE_ID;

			/* Don't advertise the OS visible workaround feature */
			regs[2] &= ~AMDID2_OSVW;

			/* Hide mwaitx/monitorx capability from the guest */
			regs[2] &= ~AMDID2_MWAITX;

#ifndef __FreeBSD__
			/*
			 * Detection routines for TCE and FFXSR are missing
			 * from our vm_cpuid_capability() detection logic
			 * today.  Mask them out until that is remedied.
			 * They do not appear to be in common usage, so their
			 * absence should not cause undue trouble.
			 */
			regs[2] &= ~AMDID2_TCE;
			regs[3] &= ~AMDID_FFXSR;
#endif

			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			/*
			 * AMD uses this leaf to advertise the processor's
			 * power monitoring and RAS capabilities. These
			 * features are hardware-specific and exposing
			 * them to a guest doesn't make a lot of sense.
			 *
			 * Intel uses this leaf only to advertise the
			 * "Invariant TSC" feature with all other bits
			 * being reserved (set to zero).
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/*
			 * "Invariant TSC" can be advertised to the guest if:
			 * - host TSC frequency is invariant
			 * - host TSCs are synchronized across physical cpus
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
#ifdef __FreeBSD__
			/* XXXJOY: Wire up with our own TSC logic */
			if (tsc_is_invariant && smp_tsc)
				regs[3] |= AMDPM_TSC_INVARIANT;
#endif /* __FreeBSD__ */
			break;

		case CPUID_8000_001D:
			/* AMD Cache topology, like 0000_0004 for Intel. */
			if (!vmm_is_svm())
				goto default_leaf;

			/*
			 * Similar to Intel, generate a ficticious cache
			 * topology for the guest with L3 shared by the
			 * package, and L1 and L2 local to a core.
			 */
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			switch (param) {
			case 0:
				logical_cpus = threads;
				level = 1;
				func = 1;	/* data cache */
				break;
			case 1:
				logical_cpus = threads;
				level = 2;
				func = 3;	/* unified cache */
				break;
			case 2:
				logical_cpus = threads * cores;
				level = 3;
				func = 3;	/* unified cache */
				break;
			default:
				logical_cpus = 0;
				level = 0;
				func = 0;
				break;
			}

			logical_cpus = MIN(0xfff, logical_cpus - 1);
			regs[0] = (logical_cpus << 14) | (1 << 8) |
			    (level << 5) | func;
			regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_8000_001E:
			/*
			 * AMD Family 16h+ and Hygon Family 18h additional
			 * identifiers.
			 */
			if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
				goto default_leaf;

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			regs[0] = vcpu_id;
			threads = MIN(0xFF, threads - 1);
			regs[1] = (threads << 8) |
			    (vcpu_id >> log2(threads + 1));
			/*
			 * XXX Bhyve topology cannot yet represent >1 node per
			 * processor.
			 */
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep, TME or SMX capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
			regs[2] &= ~(CPUID2_SMX);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Advertise the Machine Check and MTRR capability.
			 *
			 * Some guest OSes (e.g. Windows) will not boot if
			 * these features are absent.
			 */
			regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			logical_cpus = threads * cores;
			regs[1] &= ~CPUID_HTT_CORES;
			regs[1] |= (logical_cpus & 0xff) << 16;
			regs[3] |= CPUID_HTT;
			break;

		case CPUID_0000_0004:
			cpuid_count(func, param, regs);

			if (regs[0] || regs[1] || regs[2] || regs[3]) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				regs[0] &= 0x3ff;
				regs[0] |= (cores - 1) << 26;
				/*
				 * Cache topology:
				 * - L1 and L2 are shared only by the logical
				 *   processors in a single core.
				 * - L3 and above are shared by all logical
				 *   processors in the package.
				 */
				logical_cpus = threads;
				level = (regs[0] >> 5) & 0x7;
				if (level >= 3)
					logical_cpus *= cores;
				regs[0] |= (logical_cpus - 1) << 14;
			}
			break;

		case CPUID_0000_0007:
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* leaf 0 */
			if (param == 0) {
				cpuid_count(func, param, regs);

				/* Only leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_RDSEED |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
				regs[2] = 0;
				regs[3] &= CPUID_STDEXT3_MD_CLEAR;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
			/* Thermal/power: expose only "always running APIC timer". */
			regs[0] = CPUTPM1_ARAT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Intel processor topology enumeration
			 */
			if (vmm_is_intel()) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				if (param == 0) {
					logical_cpus = threads;
					width = log2(logical_cpus);
					level = CPUID_TYPE_SMT;
					x2apic_id = vcpu_id;
				}

				if (param == 1) {
					logical_cpus = threads * cores;
					width = log2(logical_cpus);
					level = CPUID_TYPE_CORE;
					x2apic_id = vcpu_id;
				}

				/* Sub-leaves >= 2 (or a disabled tunable) report an invalid level. */
				if (!cpuid_leaf_b || param >= 2) {
					width = 0;
					logical_cpus = 0;
					level = 0;
					x2apic_id = 0;
				}

				regs[0] = width & 0x1f;
				regs[1] = logical_cpus & 0xffff;
				regs[2] = (level << 8) | (param & 0xff);
				regs[3] = x2apic_id;
			} else {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
			}
			break;

		case CPUID_0000_000D:
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(func, param, regs);
			switch (param) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << param))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case CPUID_0000_0015:
			/*
			 * Don't report CPU TSC/Crystal ratio and clock
			 * values since guests may use these to derive the
			 * local APIC frequency..
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case 0x40000000:
			/*
			 * Hypervisor identification leaf: report the highest
			 * hypervisor leaf and the bhyve vendor signature.
			 */
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
default_leaf:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(func, param, regs);
			break;
	}

	/*
	 * CPUID clears the upper 32-bits of the long-mode registers.
	 */
	*rax = regs[0];
	*rbx = regs[1];
	*rcx = regs[2];
	*rdx = regs[3];

	return (1);
}
631
632bool
633vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
634{
635	bool rv;
636
637	KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
638	    __func__, cap));
639
640	/*
641	 * Simply passthrough the capabilities of the host cpu for now.
642	 */
643	rv = false;
644	switch (cap) {
645#ifdef __FreeBSD__
646	case VCC_NO_EXECUTE:
647		if (amd_feature & AMDID_NX)
648			rv = true;
649		break;
650	case VCC_FFXSR:
651		if (amd_feature & AMDID_FFXSR)
652			rv = true;
653		break;
654	case VCC_TCE:
655		if (amd_feature2 & AMDID2_TCE)
656			rv = true;
657		break;
658#else
659	case VCC_NO_EXECUTE:
660		if (is_x86_feature(x86_featureset, X86FSET_NX))
661			rv = true;
662		break;
663	/* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */
664	case VCC_FFXSR:
665	case VCC_TCE:
666		break;
667#endif
668	default:
669		panic("%s: unknown vm_cpu_capability %d", __func__, cap);
670	}
671	return (rv);
672}
673