summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-04-02 15:13:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-04-02 15:13:15 -0700
commit8c1b724ddb218f221612d4c649bc9c7819d8d7a6 (patch)
tree0e226f4156b554eec2690adb8f30ba54b15b68cc /arch/x86/kvm
parentf14a9532ee30c68a56ff502c382860f674cc180c (diff)
parent514ccc194971d0649e4e7ec8a9b3a6e33561d7bf (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "ARM: - GICv4.1 support - 32bit host removal PPC: - secure (encrypted) using under the Protected Execution Framework ultravisor s390: - allow disabling GISA (hardware interrupt injection) and protected VMs/ultravisor support. x86: - New dirty bitmap flag that sets all bits in the bitmap when dirty page logging is enabled; this is faster because it doesn't require bulk modification of the page tables. - Initial work on making nested SVM event injection more similar to VMX, and less buggy. - Various cleanups to MMU code (though the big ones and related optimizations were delayed to 5.8). Instead of using cr3 in function names which occasionally means eptp, KVM too has standardized on "pgd". - A large refactoring of CPUID features, which now use an array that parallels the core x86_features. - Some removal of pointer chasing from kvm_x86_ops, which will also be switched to static calls as soon as they are available. - New Tigerlake CPUID features. - More bugfixes, optimizations and cleanups. Generic: - selftests: cleanups, new MMU notifier stress test, steal-time test - CSV output for kvm_stat" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (277 commits) x86/kvm: fix a missing-prototypes "vmread_error" KVM: x86: Fix BUILD_BUG() in __cpuid_entry_get_reg() w/ CONFIG_UBSAN=y KVM: VMX: Add a trampoline to fix VMREAD error handling KVM: SVM: Annotate svm_x86_ops as __initdata KVM: VMX: Annotate vmx_x86_ops as __initdata KVM: x86: Drop __exit from kvm_x86_ops' hardware_unsetup() KVM: x86: Copy kvm_x86_ops by value to eliminate layer of indirection KVM: x86: Set kvm_x86_ops only after ->hardware_setup() completes KVM: VMX: Configure runtime hooks using vmx_x86_ops KVM: VMX: Move hardware_setup() definition below vmx_x86_ops KVM: x86: Move init-only kvm_x86_ops to separate struct KVM: Pass kvm_init()'s opaque param to additional arch funcs s390/gmap: return proper error code on ksm unsharing KVM: selftests: Fix cosmetic copy-paste error in vm_mem_region_move() KVM: Fix out of range accesses to memslots KVM: X86: Micro-optimize IPI fastpath delay KVM: X86: Delay read msr data iff writes ICR MSR KVM: PPC: Book3S HV: Add a capability for enabling secure guests KVM: arm64: GICv4.1: Expose HW-based SGIs in debugfs KVM: arm64: GICv4.1: Allow non-trapping WFI when using HW SGIs ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c944
-rw-r--r--arch/x86/kvm/cpuid.h151
-rw-r--r--arch/x86/kvm/emulate.c57
-rw-r--r--arch/x86/kvm/hyperv.c8
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h10
-rw-r--r--arch/x86/kvm/kvm_emulate.h509
-rw-r--r--arch/x86/kvm/lapic.c85
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.h10
-rw-r--r--arch/x86/kvm/mmu/mmu.c209
-rw-r--r--arch/x86/kvm/mmu/page_track.c16
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h4
-rw-r--r--arch/x86/kvm/pmu.c34
-rw-r--r--arch/x86/kvm/pmu.h11
-rw-r--r--arch/x86/kvm/svm.c407
-rw-r--r--arch/x86/kvm/trace.h50
-rw-r--r--arch/x86/kvm/vmx/capabilities.h25
-rw-r--r--arch/x86/kvm/vmx/evmcs.h7
-rw-r--r--arch/x86/kvm/vmx/nested.c183
-rw-r--r--arch/x86/kvm/vmx/nested.h8
-rw-r--r--arch/x86/kvm/vmx/ops.h27
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/vmenter.S72
-rw-r--r--arch/x86/kvm/vmx/vmx.c665
-rw-r--r--arch/x86/kvm/vmx/vmx.h8
-rw-r--r--arch/x86/kvm/x86.c787
-rw-r--r--arch/x86/kvm/x86.h28
28 files changed, 2549 insertions, 1778 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b1c469446b07..901cd1fdecd9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -24,6 +24,13 @@
#include "trace.h"
#include "pmu.h"
+/*
+ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
+ * aligned to sizeof(unsigned long) because it's not accessed via bitops.
+ */
+u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_cpu_caps);
+
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
@@ -45,23 +52,6 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-bool kvm_mpx_supported(void)
-{
- return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
- && kvm_x86_ops->mpx_supported());
-}
-EXPORT_SYMBOL_GPL(kvm_mpx_supported);
-
-u64 kvm_supported_xcr0(void)
-{
- u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
-
- if (!kvm_mpx_supported())
- xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
- return xcr0;
-}
-
#define F feature_bit
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
@@ -74,32 +64,24 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
return 0;
/* Update OSXSAVE bit */
- if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) {
- best->ecx &= ~F(OSXSAVE);
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= F(OSXSAVE);
- }
+ if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1)
+ cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+ kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
- best->edx &= ~F(APIC);
- if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
- best->edx |= F(APIC);
+ cpuid_entry_change(best, X86_FEATURE_APIC,
+ vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
if (apic) {
- if (best->ecx & F(TSC_DEADLINE_TIMER))
+ if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
apic->lapic_timer.timer_mode_mask = 3 << 17;
else
apic->lapic_timer.timer_mode_mask = 1 << 17;
}
best = kvm_find_cpuid_entry(vcpu, 7, 0);
- if (best) {
- /* Update OSPKE bit */
- if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
- best->ecx &= ~F(OSPKE);
- if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
- best->ecx |= F(OSPKE);
- }
- }
+ if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
+ cpuid_entry_change(best, X86_FEATURE_OSPKE,
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
if (!best) {
@@ -107,14 +89,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
} else {
vcpu->arch.guest_supported_xcr0 =
- (best->eax | ((u64)best->edx << 32)) &
- kvm_supported_xcr0();
+ (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
vcpu->arch.guest_xstate_size = best->ebx =
xstate_required_size(vcpu->arch.xcr0, false);
}
best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
- if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+ if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
+ cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
/*
@@ -136,12 +118,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
- if (best) {
- if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT)
- best->ecx |= F(MWAIT);
- else
- best->ecx &= ~F(MWAIT);
- }
+ if (best)
+ cpuid_entry_change(best, X86_FEATURE_MWAIT,
+ vcpu->arch.ia32_misc_enable_msr &
+ MSR_IA32_MISC_ENABLE_MWAIT);
}
/* Update physical-address width */
@@ -154,10 +134,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
static int is_efer_nx(void)
{
- unsigned long long efer = 0;
-
- rdmsrl_safe(MSR_EFER, &efer);
- return efer & EFER_NX;
+ return host_efer & EFER_NX;
}
static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
@@ -173,8 +150,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
break;
}
}
- if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
- entry->edx &= ~F(NX);
+ if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
+ cpuid_entry_clear(entry, X86_FEATURE_NX);
printk(KERN_INFO "kvm: guest NX capability removed\n");
}
}
@@ -232,7 +209,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
+ kvm_x86_ops.cpuid_update(vcpu);
r = kvm_update_cpuid(vcpu);
out:
@@ -255,7 +232,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
+ kvm_x86_ops.cpuid_update(vcpu);
r = kvm_update_cpuid(vcpu);
out:
return r;
@@ -281,15 +258,189 @@ out:
return r;
}
-static __always_inline void cpuid_mask(u32 *word, int wordnum)
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
- reverse_cpuid_check(wordnum);
- *word &= boot_cpu_data.x86_capability[wordnum];
+ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
+ struct kvm_cpuid_entry2 entry;
+
+ reverse_cpuid_check(leaf);
+ kvm_cpu_caps[leaf] &= mask;
+
+ cpuid_count(cpuid.function, cpuid.index,
+ &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
+
+ kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
+}
+
+void kvm_set_cpu_caps(void)
+{
+ unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+ unsigned int f_gbpages = F(GBPAGES);
+ unsigned int f_lm = F(LM);
+#else
+ unsigned int f_gbpages = 0;
+ unsigned int f_lm = 0;
+#endif
+
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+ sizeof(boot_cpu_data.x86_capability));
+
+ memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
+ sizeof(kvm_cpu_caps));
+
+ kvm_cpu_cap_mask(CPUID_1_ECX,
+ /*
+ * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
+ * advertised to guests via CPUID!
+ */
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C) | F(RDRAND)
+ );
+ /* KVM emulates x2apic in software irrespective of host support. */
+ kvm_cpu_cap_set(X86_FEATURE_X2APIC);
+
+ kvm_cpu_cap_mask(CPUID_1_EDX,
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */
+ );
+
+ kvm_cpu_cap_mask(CPUID_7_0_EBX,
+ F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
+ F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
+ );
+
+ kvm_cpu_cap_mask(CPUID_7_ECX,
+ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+ );
+ /* Set LA57 based on hardware capability. */
+ if (cpuid_ecx(7) & F(LA57))
+ kvm_cpu_cap_set(X86_FEATURE_LA57);
+
+ kvm_cpu_cap_mask(CPUID_7_EDX,
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+ F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM)
+ );
+
+ /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
+ kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
+ kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);
+
+ if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+
+ kvm_cpu_cap_mask(CPUID_7_1_EAX,
+ F(AVX512_BF16)
+ );
+
+ kvm_cpu_cap_mask(CPUID_D_1_EAX,
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+ );
+
+ kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+ F(TOPOEXT) | F(PERFCTR_CORE)
+ );
+
+ kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
+ );
+
+ if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
+ kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+
+ kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
+ F(CLZERO) | F(XSAVEERPTR) |
+ F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
+ );
+
+ /*
+ * AMD has separate bits for each SPEC_CTRL bit.
+ * arch/x86/kernel/cpu/bugs.c is kind enough to
+ * record that in cpufeatures so use them.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /*
+ * Hide all SVM features by default, SVM will set the cap bits for
+ * features it emulates and/or exposes for L1.
+ */
+ kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
+
+ kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN)
+ );
}
+EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
-static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
+struct kvm_cpuid_array {
+ struct kvm_cpuid_entry2 *entries;
+ const int maxnent;
+ int nent;
+};
+
+static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
+ u32 function, u32 index)
{
+ struct kvm_cpuid_entry2 *entry;
+
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ entry = &array->entries[array->nent++];
+
entry->function = function;
entry->index = index;
entry->flags = 0;
@@ -298,9 +449,6 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
switch (function) {
- case 2:
- entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- break;
case 4:
case 7:
case 0xb:
@@ -316,11 +464,18 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
break;
}
+
+ return entry;
}
-static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
- u32 func, int *nent, int maxnent)
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
{
+ struct kvm_cpuid_entry2 *entry;
+
+ if (array->nent >= array->maxnent)
+ return -E2BIG;
+
+ entry = &array->entries[array->nent];
entry->function = func;
entry->index = 0;
entry->flags = 0;
@@ -328,17 +483,17 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
switch (func) {
case 0:
entry->eax = 7;
- ++*nent;
+ ++array->nent;
break;
case 1:
entry->ecx = F(MOVBE);
- ++*nent;
+ ++array->nent;
break;
case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0;
entry->ecx = F(RDPID);
- ++*nent;
+ ++array->nent;
default:
break;
}
@@ -346,223 +501,60 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
return 0;
}
-static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
+static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
{
- unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
- unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
- unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
- unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
- unsigned f_la57;
- unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
-
- /* cpuid 7.0.ebx */
- const u32 kvm_cpuid_7_0_ebx_x86_features =
- F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
- F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
- F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
- F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
-
- /* cpuid 7.0.ecx*/
- const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
- F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
- F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
-
- /* cpuid 7.0.edx*/
- const u32 kvm_cpuid_7_0_edx_x86_features =
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
- F(MD_CLEAR);
-
- /* cpuid 7.1.eax */
- const u32 kvm_cpuid_7_1_eax_x86_features =
- F(AVX512_BF16);
-
- switch (index) {
- case 0:
- entry->eax = min(entry->eax, 1u);
- entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
- cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
- /* TSC_ADJUST is emulated */
- entry->ebx |= F(TSC_ADJUST);
-
- entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
- f_la57 = entry->ecx & F(LA57);
- cpuid_mask(&entry->ecx, CPUID_7_ECX);
- /* Set LA57 based on hardware capability. */
- entry->ecx |= f_la57;
- entry->ecx |= f_umip;
- entry->ecx |= f_pku;
- /* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
- entry->ecx &= ~F(PKU);
-
- entry->edx &= kvm_cpuid_7_0_edx_x86_features;
- cpuid_mask(&entry->edx, CPUID_7_EDX);
- if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
- entry->edx |= F(SPEC_CTRL);
- if (boot_cpu_has(X86_FEATURE_STIBP))
- entry->edx |= F(INTEL_STIBP);
- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- boot_cpu_has(X86_FEATURE_AMD_SSBD))
- entry->edx |= F(SPEC_CTRL_SSBD);
- /*
- * We emulate ARCH_CAPABILITIES in software even
- * if the host doesn't support it.
- */
- entry->edx |= F(ARCH_CAPABILITIES);
- break;
- case 1:
- entry->eax &= kvm_cpuid_7_1_eax_x86_features;
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- default:
- WARN_ON_ONCE(1);
- entry->eax = 0;
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- }
-}
-
-static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
- int *nent, int maxnent)
-{
- int r;
- unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
- unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
- ? F(GBPAGES) : 0;
- unsigned f_lm = F(LM);
-#else
- unsigned f_gbpages = 0;
- unsigned f_lm = 0;
-#endif
- unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
- unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
- unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
-
- /* cpuid 1.edx */
- const u32 kvm_cpuid_1_edx_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
- 0 /* Reserved, DS, ACPI */ | F(MMX) |
- F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
- 0 /* HTT, TM, Reserved, PBE */;
- /* cpuid 0x80000001.edx */
- const u32 kvm_cpuid_8000_0001_edx_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
- 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
- /* cpuid 1.ecx */
- const u32 kvm_cpuid_1_ecx_x86_features =
- /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
- * but *not* advertised to guests via CPUID ! */
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
- 0 /* DS-CPL, VMX, SMX, EST */ |
- 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
- F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C) | F(RDRAND);
- /* cpuid 0x80000001.ecx */
- const u32 kvm_cpuid_8000_0001_ecx_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
- F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT) | F(PERFCTR_CORE);
-
- /* cpuid 0x80000008.ebx */
- const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(CLZERO) | F(XSAVEERPTR) |
- F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
- F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
-
- /* cpuid 0xC0000001.edx */
- const u32 kvm_cpuid_C000_0001_edx_x86_features =
- F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
- F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
- F(PMM) | F(PMM_EN);
-
- /* cpuid 0xD.1.eax */
- const u32 kvm_cpuid_D_1_eax_x86_features =
- F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+ struct kvm_cpuid_entry2 *entry;
+ int r, i, max_idx;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
r = -E2BIG;
- if (WARN_ON(*nent >= maxnent))
+ entry = do_host_cpuid(array, function, 0);
+ if (!entry)
goto out;
- do_host_cpuid(entry, function, 0);
- ++*nent;
-
switch (function) {
case 0:
/* Limited to the highest leaf implemented in KVM. */
entry->eax = min(entry->eax, 0x1fU);
break;
case 1:
- entry->edx &= kvm_cpuid_1_edx_x86_features;
- cpuid_mask(&entry->edx, CPUID_1_EDX);
- entry->ecx &= kvm_cpuid_1_ecx_x86_features;
- cpuid_mask(&entry->ecx, CPUID_1_ECX);
- /* we support x2apic emulation even if host does not support
- * it since we emulate x2apic in software */
- entry->ecx |= F(X2APIC);
+ cpuid_entry_override(entry, CPUID_1_EDX);
+ cpuid_entry_override(entry, CPUID_1_ECX);
break;
- /* function 2 entries are STATEFUL. That is, repeated cpuid commands
- * may return different values. This forces us to get_cpu() before
- * issuing the first command, and also to emulate this annoying behavior
- * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
- case 2: {
- int t, times = entry->eax & 0xff;
-
- entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- for (t = 1; t < times; ++t) {
- if (*nent >= maxnent)
- goto out;
-
- do_host_cpuid(&entry[t], function, 0);
- ++*nent;
- }
+ case 2:
+ /*
+ * On ancient CPUs, function 2 entries are STATEFUL. That is,
+ * CPUID(function=2, index=0) may return different results each
+ * time, with the least-significant byte in EAX enumerating the
+ * number of times software should do CPUID(2, 0).
+ *
+ * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
+ * idiotic. Intel's SDM states that EAX & 0xff "will always
+ * return 01H. Software should ignore this value and not
+ * interpret it as an informational descriptor", while AMD's
+ * APM states that CPUID(2) is reserved.
+ *
+ * WARN if a frankenstein CPU that supports virtualization and
+ * a stateful CPUID.0x2 is encountered.
+ */
+ WARN_ON_ONCE((entry->eax & 0xff) > 1);
break;
- }
/* functions 4 and 0x8000001d have additional index. */
case 4:
- case 0x8000001d: {
- int i, cache_type;
-
- /* read more entries until cache_type is zero */
- for (i = 1; ; ++i) {
- if (*nent >= maxnent)
+ case 0x8000001d:
+ /*
+ * Read entries until the cache type in the previous entry is
+ * zero, i.e. indicates an invalid entry.
+ */
+ for (i = 1; entry->eax & 0x1f; ++i) {
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
goto out;
-
- cache_type = entry[i - 1].eax & 0x1f;
- if (!cache_type)
- break;
- do_host_cpuid(&entry[i], function, i);
- ++*nent;
}
break;
- }
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
@@ -570,22 +562,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
entry->edx = 0;
break;
/* function 7 has additional index. */
- case 7: {
- int i;
-
- for (i = 0; ; ) {
- do_cpuid_7_mask(&entry[i], i);
- if (i == entry->eax)
- break;
- if (*nent >= maxnent)
+ case 7:
+ entry->eax = min(entry->eax, 1u);
+ cpuid_entry_override(entry, CPUID_7_0_EBX);
+ cpuid_entry_override(entry, CPUID_7_ECX);
+ cpuid_entry_override(entry, CPUID_7_EDX);
+
+ /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
+ if (entry->eax == 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
goto out;
- ++i;
- do_host_cpuid(&entry[i], function, i);
- ++*nent;
+ cpuid_entry_override(entry, CPUID_7_1_EAX);
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
}
break;
- }
case 9:
break;
case 0xa: { /* Architectural Performance Monitoring */
@@ -622,79 +616,81 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
* thus they can be handled by common code.
*/
case 0x1f:
- case 0xb: {
- int i;
-
+ case 0xb:
/*
- * We filled in entry[0] for CPUID(EAX=<function>,
- * ECX=00H) above. If its level type (ECX[15:8]) is
- * zero, then the leaf is unimplemented, and we're
- * done. Otherwise, continue to populate entries
- * until the level type (ECX[15:8]) of the previously
- * added entry is zero.
+ * Populate entries until the level type (ECX[15:8]) of the
+ * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
+ * the starting entry, filled by the primary do_host_cpuid().
*/
- for (i = 1; entry[i - 1].ecx & 0xff00; ++i) {
- if (*nent >= maxnent)
+ for (i = 1; entry->ecx & 0xff00; ++i) {
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
goto out;
-
- do_host_cpuid(&entry[i], function, i);
- ++*nent;
}
break;
- }
- case 0xd: {
- int idx, i;
- u64 supported = kvm_supported_xcr0();
-
- entry->eax &= supported;
- entry->ebx = xstate_required_size(supported, false);
+ case 0xd:
+ entry->eax &= supported_xcr0;
+ entry->ebx = xstate_required_size(supported_xcr0, false);
entry->ecx = entry->ebx;
- entry->edx &= supported >> 32;
- if (!supported)
+ entry->edx &= supported_xcr0 >> 32;
+ if (!supported_xcr0)
break;
- for (idx = 1, i = 1; idx < 64; ++idx) {
- u64 mask = ((u64)1 << idx);
- if (*nent >= maxnent)
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_D_1_EAX);
+ if (entry->eax & (F(XSAVES)|F(XSAVEC)))
+ entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+ true);
+ else {
+ WARN_ON_ONCE(supported_xss != 0);
+ entry->ebx = 0;
+ }
+ entry->ecx &= supported_xss;
+ entry->edx &= supported_xss >> 32;
+
+ for (i = 2; i < 64; ++i) {
+ bool s_state;
+ if (supported_xcr0 & BIT_ULL(i))
+ s_state = false;
+ else if (supported_xss & BIT_ULL(i))
+ s_state = true;
+ else
+ continue;
+
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
goto out;
- do_host_cpuid(&entry[i], function, idx);
- if (idx == 1) {
- entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
- cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
- entry[i].ebx = 0;
- if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
- entry[i].ebx =
- xstate_required_size(supported,
- true);
- } else {
- if (entry[i].eax == 0 || !(supported & mask))
- continue;
- if (WARN_ON_ONCE(entry[i].ecx & 1))
- continue;
+ /*
+ * The supported check above should have filtered out
+ * invalid sub-leafs. Only valid sub-leafs should
+ * reach this point, and they should have a non-zero
+ * save state size. Furthermore, check whether the
+ * processor agrees with supported_xcr0/supported_xss
+ * on whether this is an XCR0- or IA32_XSS-managed area.
+ */
+ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
+ --array->nent;
+ continue;
}
- entry[i].ecx = 0;
- entry[i].edx = 0;
- ++*nent;
- ++i;
+ entry->edx = 0;
}
break;
- }
/* Intel PT */
- case 0x14: {
- int t, times = entry->eax;
-
- if (!f_intel_pt)
+ case 0x14:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
+ }
- for (t = 1; t <= times; ++t) {
- if (*nent >= maxnent)
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
goto out;
- do_host_cpuid(&entry[t], function, t);
- ++*nent;
}