summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-27 12:44:26 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-27 12:44:26 -0700
commit8bbe0dec38e147a50e9dd5f585295f7e68e0f2d0 (patch)
tree0781fd7dd090bbccd88f1f58caf6387a585b9ebe /arch
parente37e3bc7e265d05d00f14079767537699cf6bd46 (diff)
parentfd3edd4a9066f28de99a16685a586d68a9f551f8 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini: "x86 KVM changes: - The usual accuracy improvements for nested virtualization - The usual round of code cleanups from Sean - Added back optimizations that were prematurely removed in 5.2 (the bare minimum needed to fix the regression was in 5.3-rc8, here comes the rest) - Support for UMWAIT/UMONITOR/TPAUSE - Direct L2->L0 TLB flushing when L0 is Hyper-V and L1 is KVM - Tell Windows guests if SMT is disabled on the host - More accurate detection of vmexit cost - Revert a pvqspinlock pessimization" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (56 commits) KVM: nVMX: cleanup and fix host 64-bit mode checks KVM: vmx: fix build warnings in hv_enable_direct_tlbflush() on i386 KVM: x86: Don't check kvm_rebooting in __kvm_handle_fault_on_reboot() KVM: x86: Drop ____kvm_handle_fault_on_reboot() KVM: VMX: Add error handling to VMREAD helper KVM: VMX: Optimize VMX instruction error and fault handling KVM: x86: Check kvm_rebooting in kvm_spurious_fault() KVM: selftests: fix ucall on x86 Revert "locking/pvqspinlock: Don't wait if vCPU is preempted" kvm: nvmx: limit atomic switch MSRs kvm: svm: Intercept RDPRU kvm: x86: Add "significant index" flag to a few CPUID leaves KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero KVM: x86/mmu: Explicitly track only a single invalid mmu generation KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call" KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first"" KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages"" KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch"" KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages"" KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints"" ...
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h31
-rw-r--r--arch/x86/include/asm/kvm_host.h64
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/uapi/asm/svm.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kvm/cpuid.c8
-rw-r--r--arch/x86/kvm/hyperv.c16
-rw-r--r--arch/x86/kvm/lapic.c28
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c145
-rw-r--r--arch/x86/kvm/mmutrace.h42
-rw-r--r--arch/x86/kvm/svm.c79
-rw-r--r--arch/x86/kvm/vmx/capabilities.h6
-rw-r--r--arch/x86/kvm/vmx/evmcs.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c70
-rw-r--r--arch/x86/kvm/vmx/ops.h93
-rw-r--r--arch/x86/kvm/vmx/vmx.c306
-rw-r--r--arch/x86/kvm/vmx/vmx.h9
-rw-r--r--arch/x86/kvm/x86.c197
-rw-r--r--arch/x86/kvm/x86.h2
22 files changed, 731 insertions, 384 deletions
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 7a2705694f5b..7741e211f7f5 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -180,7 +180,15 @@
/* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
+/*
+ * Virtual processor will never share a physical core with another virtual
+ * processor, except for virtual processors that are reported as sibling SMT
+ * threads.
+ */
+#define HV_X64_NO_NONARCH_CORESHARING BIT(18)
+
/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */
+#define HV_X64_NESTED_DIRECT_FLUSH BIT(17)
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
@@ -524,14 +532,24 @@ struct hv_timer_message_payload {
__u64 delivery_time; /* When the message was delivered */
} __packed;
+struct hv_nested_enlightenments_control {
+ struct {
+ __u32 directhypercall:1;
+ __u32 reserved:31;
+ } features;
+ struct {
+ __u32 reserved;
+ } hypercallControls;
+} __packed;
+
/* Define virtual processor assist page structure. */
struct hv_vp_assist_page {
__u32 apic_assist;
- __u32 reserved;
- __u64 vtl_control[2];
- __u64 nested_enlightenments_control[2];
- __u32 enlighten_vmentry;
- __u32 padding;
+ __u32 reserved1;
+ __u64 vtl_control[3];
+ struct hv_nested_enlightenments_control nested_control;
+ __u8 enlighten_vmentry;
+ __u8 reserved2[7];
__u64 current_nested_vmcs;
} __packed;
@@ -882,4 +900,7 @@ struct hv_tlb_flush_ex {
u64 gva_list[];
} __packed;
+struct hv_partition_assist_pg {
+ u32 tlb_lock_count;
+};
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3a3ec73fa2f..23edf56cf577 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+ u8 mmu_valid_gen;
bool mmio_cached;
/*
@@ -335,7 +336,6 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
- unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
@@ -844,6 +844,8 @@ struct kvm_hv {
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
+
+ struct hv_partition_assist_pg *hv_pa_pg;
};
enum kvm_irqchip_mode {
@@ -857,12 +859,13 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
- unsigned long mmu_valid_gen;
+ u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
+ struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
@@ -1213,6 +1216,7 @@ struct kvm_x86_ops {
bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
+ int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1312,18 +1316,42 @@ extern u64 kvm_default_tsc_scaling_ratio;
extern u64 kvm_mce_cap_supported;
-enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
- EMULATE_FAIL, /* can't emulate this instruction */
-};
-
+/*
+ * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
+ * userspace I/O) to indicate that the emulation context
+ * should be resued as is, i.e. skip initialization of
+ * emulation context, instruction fetch and decode.
+ *
+ * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
+ * Indicates that only select instructions (tagged with
+ * EmulateOnUD) should be emulated (to minimize the emulator
+ * attack surface). See also EMULTYPE_TRAP_UD_FORCED.
+ *
+ * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
+ * decode the instruction length. For use *only* by
+ * kvm_x86_ops->skip_emulated_instruction() implementations.
+ *
+ * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to
+ * retry native execution under certain conditions.
+ *
+ * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
+ * triggered by KVM's magic "force emulation" prefix,
+ * which is opt in via module param (off by default).
+ * Bypasses EmulateOnUD restriction despite emulating
+ * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
+ * Used to test the full emulator from userspace.
+ *
+ * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * backdoor emulation, which is opt in via module param.
+ * VMware backoor emulation handles select instructions
+ * and reinjects the #GP for all other cases.
+ */
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
#define EMULTYPE_ALLOW_RETRY (1 << 3)
-#define EMULTYPE_NO_UD_ON_FAIL (1 << 4)
-#define EMULTYPE_VMWARE (1 << 5)
+#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
+#define EMULTYPE_VMWARE_GP (1 << 5)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len);
@@ -1506,7 +1534,7 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
-asmlinkage void __noreturn kvm_spurious_fault(void);
+asmlinkage void kvm_spurious_fault(void);
/*
* Hardware virtualization extension instructions may fault if a
@@ -1514,24 +1542,14 @@ asmlinkage void __noreturn kvm_spurious_fault(void);
* Usually after catching the fault we just panic; during reboot
* instead the instruction is ignored.
*/
-#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
+#define __kvm_handle_fault_on_reboot(insn) \
"666: \n\t" \
insn "\n\t" \
"jmp 668f \n\t" \
"667: \n\t" \
"call kvm_spurious_fault \n\t" \
"668: \n\t" \
- ".pushsection .fixup, \"ax\" \n\t" \
- "700: \n\t" \
- cleanup_insn "\n\t" \
- "cmpb $0, kvm_rebooting\n\t" \
- "je 667b \n\t" \
- "jmp 668b \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(666b, 700b)
-
-#define __kvm_handle_fault_on_reboot(insn) \
- ____kvm_handle_fault_on_reboot(insn, "")
+ _ASM_EXTABLE(666b, 667b)
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index dec9c1e84c78..6ece8561ba66 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -52,6 +52,7 @@ enum {
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
+ INTERCEPT_RDPRU,
};
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b15e6465870f..1835767aa335 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PT_USE_GPA 0x01000000
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
@@ -110,6 +111,7 @@
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
#define VMX_MISC_ZERO_LEN_INS 0x40000000
+#define VMX_MISC_MSR_LIST_MULTIPLIER 512
/* VMFUNC functions */
#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index a9731f8a480f..2e8a30f06c74 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -75,6 +75,7 @@
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
+#define SVM_EXIT_RDPRU 0x08e
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index f01950aa7fae..3eb8411ab60e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -86,6 +86,8 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_UMWAIT 67
+#define EXIT_REASON_TPAUSE 68
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -144,7 +146,9 @@
{ EXIT_REASON_RDSEED, "RDSEED" }, \
{ EXIT_REASON_PML_FULL, "PML_FULL" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
- { EXIT_REASON_XRSTORS, "XRSTORS" }
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_UMWAIT, "UMWAIT" }, \
+ { EXIT_REASON_TPAUSE, "TPAUSE" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 32b4dc9030aa..c222f283b456 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -17,6 +17,12 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+u32 get_umwait_control_msr(void)
+{
+ return umwait_control_cached;
+}
+EXPORT_SYMBOL_GPL(get_umwait_control_msr);
+
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index dd5985eb61b4..63316036f85a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
case 7:
case 0xb:
case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1f:
case 0x8000001d:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
break;
@@ -360,7 +366,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index fff790a3f4ee..23ff65504d7e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
#include "ioapic.h"
#include "hyperv.h"
+#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
@@ -645,7 +646,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
.vector = stimer->config.apic_vector
};
- return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
}
static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
@@ -1852,7 +1855,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
- ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ /*
+ * Direct Synthetic timers only make sense with in-kernel
+ * LAPIC
+ */
+ if (lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
break;
@@ -1864,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
if (evmcs_ver)
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
-
+ if (!cpu_smt_possible())
+ ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
/*
* Default number of spinlock retry attempts, matches
* HyperV 2016.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8675458c2205..3a3a6854dcca 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -65,7 +65,9 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+static bool lapic_timer_advance_dynamic __read_mostly;
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000
#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1485,26 +1487,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 ns;
+ /* Do not adjust for tiny fluctuations or large random spikes. */
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
+ return;
+
/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
} else {
/* too late */
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}
- if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
- if (unlikely(timer_advance_ns > 5000)) {
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX))
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
- }
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
@@ -1524,7 +1525,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (lapic_timer_advance_dynamic)
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
@@ -2302,13 +2303,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
+ lapic_timer_advance_dynamic = true;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
- apic->lapic_timer.timer_advance_adjust_done = true;
+ lapic_timer_advance_dynamic = false;
}
-
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
* thinking that APIC state has changed.
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2b8b7b..2aad7e226fc0 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -35,7 +35,6 @@ struct kvm_timer {
s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
- bool timer_advance_adjust_done;
};
struct kvm_lapic {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a10af9c87f8a..5269aa057dfa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
- page_header(__pa(sptep))->mmio_cached = true;
-
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
}
@@ -2103,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
* depends on valid pages being added to the head of the list. See
* comments in kvm_zap_obsolete_pages().
*/
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2252,7 +2251,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2311,7 +2310,8 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+ return sp->role.invalid ||
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2538,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
@@ -2753,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- if (!sp->role.invalid)
+ /*
+ * Obsolete pages cannot be used on any vCPUs, see the comment
+ * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
+ * treats invalid shadow pages as being obsolete.
+ */
+ if (!is_obsolete_sp(kvm, sp))
kvm_reload_remote_mmus(kvm);
}
@@ -5383,7 +5387,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = 0;
- enum emulation_result er;
bool direct = vcpu->arch.mmu->direct_map;
/* With shadow page tables, fault_address contains a GVA or nGPA. */
@@ -5450,19 +5453,8 @@ emulate:
return 1;
}
- er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_USER_EXIT:
- ++vcpu->stat.mmio_exits;
- /* fall through */
- case EMULATE_FAIL:
- return 0;
- default:
- BUG();
- }
+ return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+ insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -5684,12 +5676,11 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return ret;
}
-
+#define BATCH_ZAP_PAGES 10
static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
- int ign;
+ int nr_zapped, batch = 0;
restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5702,46 +5693,39 @@ restart:
break;
/*
- * Do not repeatedly zap a root page to avoid unnecessary
- * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
- * progress:
- * vcpu 0 vcpu 1
- * call vcpu_enter_guest():
- * 1): handle KVM_REQ_MMU_RELOAD
- * and require mmu-lock to
- * load mmu
- * repeat:
- * 1): zap root page and
- * send KVM_REQ_MMU_RELOAD
- *
- * 2): if (cond_resched_lock(mmu-lock))
- *
- * 2): hold mmu-lock and load mmu
- *
- * 3): see KVM_REQ_MMU_RELOAD bit
- * on vcpu->requests is set
- * then return 1 to call
- * vcpu_enter_guest() again.
- * goto repeat;
- *
- * Since we are reversely walking the list and the invalid
- * list will be moved to the head, skip the invalid page
- * can help us to avoid the infinity list walking.
+ * Skip invalid pages with a non-zero root count, zapping pages
+ * with a non-zero root count will never succeed, i.e. the page
+ * will get thrown back on active_mmu_pages and we'll get stuck
+ * in an infinite loop.
*/
- if (sp->role.invalid)
+ if (sp->role.invalid && sp->root_count)
continue;
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- cond_resched_lock(&kvm->mmu_lock);
+ /*
+ * No need to flush the TLB since we're only zapping shadow
+ * pages with an obsolete generation number and all vCPUS have
+ * loaded a new root, i.e. the shadow pages being zapped cannot
+ * be in active use by the guest.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
goto restart;
}
- if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+ if (__kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
+ batch += nr_zapped;
goto restart;
+ }
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ /*
+ * Trigger a remote TLB flush before freeing the page tables to ensure
+ * KVM is not in the middle of a lockless shadow page table walk, which
+ * may reference the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
/*
@@ -5755,13 +5739,39 @@ restart:
*/
static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
+ lockdep_assert_held(&kvm->slots_lock);
+
spin_lock(&kvm->mmu_lock);
- kvm->arch.mmu_valid_gen++;
+ trace_kvm_mmu_zap_all_fast(kvm);
+
+ /*
+ * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
+ * held for the entire duration of zapping obsolete pages, it's
+ * impossible for there to be multiple invalid generations associated
+ * with *valid* shadow pages at any given time, i.e. there is exactly
+ * one valid generation and (at most) one invalid generation.
+ */
+ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+ /*
+ * Notify all vcpus to reload its shadow page table and flush TLB.
+ * Then all vcpus will switch to new shadow page table with the new
+ * mmu_valid_gen.
+ *
+ * Note: we need to do this under the protection of mmu_lock,
+ * otherwise, vcpu would purge shadow page but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
}
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
@@ -5959,7 +5969,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
-static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
+void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -5968,14 +5978,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (mmio_only && !sp->mmio_cached)
- continue;
if (sp->role.invalid && sp->root_count)
continue;
- if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
- WARN_ON_ONCE(mmio_only);
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
- }
if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
@@ -5984,11 +5990,6 @@ restart:
spin_unlock(&kvm->mmu_lock);
}
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- return __kvm_mmu_zap_all(kvm, false);
-}
-
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -6010,7 +6011,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
*/
if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
- __kvm_mmu_zap_all(kvm, true);
+ kvm_mmu_zap_all_fast(kvm);
}
}
@@ -6041,16 +6042,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
- if (!kvm->arch.n_used_mmu_pages)
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index d8001b4bca05..7ca8831c7d1a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,16 +8,18 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define KVM_MMU_PAGE_FIELDS \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u8, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
__field(bool, unsync)
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -29,8 +31,9 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
@@ -280,6 +283,27 @@ TRACE_EVENT(
);
TRACE_EVENT(
+ kvm_mmu_zap_all_fast,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(__u8, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(