diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-19 10:38:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-19 10:38:36 -0700 |
commit | e61cf2e3a5b452cfefcb145021f5a8ea88735cc1 (patch) | |
tree | bbabaf0d4753d6880ecbaddd8daa0164d49c1c61 /arch/x86 | |
parent | 1009aa1205c2c5e9101437dcadfa195708d863bf (diff) | |
parent | 28a1f3ac1d0c8558ee4453d9634dad891a6e922e (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull first set of KVM updates from Paolo Bonzini:
"PPC:
- minor code cleanups
x86:
- PCID emulation and CR3 caching for shadow page tables
- nested VMX live migration
- nested VMCS shadowing
- optimized IPI hypercall
- some optimizations
ARM will come next week"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits)
kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c
KVM: X86: Implement PV IPIs in linux guest
KVM: X86: Add kvm hypervisor init time platform setup callback
KVM: X86: Implement "send IPI" hypercall
KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs()
KVM: x86: Skip pae_root shadow allocation if tdp enabled
KVM/MMU: Combine flushing remote tlb in mmu_set_spte()
KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible
KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible
KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup
KVM: vmx: move struct host_state usage to struct loaded_vmcs
KVM: vmx: compute need to reload FS/GS/LDT on demand
KVM: nVMX: remove a misleading comment regarding vmcs02 fields
KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state()
KVM: vmx: add dedicated utility to access guest's kernel_gs_base
KVM: vmx: track host_state.loaded using a loaded_vmcs pointer
KVM: vmx: refactor segmentation code in vmx_save_host_state()
kvm: nVMX: Fix fault priority for VMX operations
kvm: nVMX: Fix fault vector for VMX operation at CPL > 0
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/hyperv/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/hyperv/nested.c | 56 | ||||
-rw-r--r-- | arch/x86/include/asm/hyperv-tlfs.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 56 | ||||
-rw-r--r-- | arch/x86/include/asm/mshyperv.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/trace/hyperv.h | 14 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 37 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm_para.h | 1 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 112 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 40 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 531 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 24 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 28 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 1120 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 108 |
20 files changed, 1837 insertions, 348 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b173d404e3df..b21ee65c4101 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,2 @@ -obj-y := hv_init.o mmu.o +obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000000..b8e60cc50461 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> + */ + + +#include <linux/types.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/tlbflush.h> + +#include <asm/trace/hyperv.h> + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6ced78af48da..e977b6b3a538 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -310,6 +310,7 @@ struct ms_hyperv_tsc_page { #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 /* Nested features (CPUID 0x4000000A) EAX */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) struct hv_reenlightenment_control { @@ -351,6 +352,7 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -742,6 +744,12 @@ struct ipi_arg_ex { struct hv_vpset vp_set; }; +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +}; + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ struct hv_tlb_flush { u64 address_space; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index acebb808c4b5..00ddb0c9e612 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,6 +55,7 @@ #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) @@ -76,13 +77,13 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -326,6 +327,16 @@ struct rsvd_bits_validate { u64 bad_mt_xwr; }; +struct kvm_mmu_root_info { + gpa_t cr3; + hpa_t hpa; +}; + +#define KVM_MMU_ROOT_INFO_INVALID \ + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -345,7 +356,7 @@ struct kvm_mmu { struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; @@ -354,6 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -978,6 +990,15 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + int (*tlb_remote_flush)(struct kvm *kvm); + + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. + */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); @@ -1090,6 +1111,14 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); @@ -1122,6 +1151,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) return kvm_x86_ops->vm_free(kvm); } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + if (kvm_x86_ops->tlb_remote_flush && + !kvm_x86_ops->tlb_remote_flush(kvm)) + return 0; + else + return -ENOTSUPP; +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -1273,6 +1312,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); @@ -1284,7 +1327,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1303,7 +1346,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); @@ -1418,6 +1462,10 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index e1771a1987dd..f37704497d8f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -347,6 +347,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); +int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -366,6 +367,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) { return NULL; } +static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 9c0d4b588e3f..2e6245a023ef 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -28,6 +28,20 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others, __entry->addr, __entry->end) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + TRACE_EVENT(hyperv_send_ipi_mask, TP_PROTO(const struct cpumask *cpus, int vector), diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 0ede697c3961..19980ec1a316 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 09aaabb2bbf1..0f471bd93417 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -444,6 +444,98 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + __send_ipi_mask(local_mask, vector); +} + +static void kvm_send_ipi_allbutself(int vector) +{ + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void kvm_send_ipi_all(int vector) +{ + __send_ipi_mask(cpu_online_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; + apic->send_IPI_all = kvm_send_ipi_all; + pr_info("KVM setup pv IPIs\n"); +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -611,13 +703,27 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } +static void __init kvm_apic_init(void) +{ +#if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); +#endif +} + +static void __init kvm_init_platform(void) +{ + kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; +} + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, - .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, }; static __init int activate_jump_labels(void) @@ -736,6 +842,10 @@ void __init kvm_spinlock_init(void) if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e042e3d47fd..7bcfa61375c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | + (1 << KVM_FEATURE_PV_SEND_IPI); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c4f4263420c..106482da6388 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = 36; rsvd = rsvd_bits(maxphyaddr, 63); if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~CR3_PCID_INVD; + rsvd &= ~X86_CR3_PCID_NOFLUSH; } if (new_val & rsvd) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af8caf965baa..01d209ab5481 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active) + if (!synic->active && !host) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, + bool host) { int ret; - if (!synic->active) + if (!synic->active && !host) return 1; ret = 0; @@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_TSC_EMULATION_STATUS: hv->hv_tsc_emulation_status = data; break; + case HV_X64_MSR_TIME_REF_COUNT: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), data, host); } + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: @@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return kvm_hv_set_msr(vcpu, msr, data, host); } -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else - return kvm_hv_get_msr(vcpu, msr, pdata); + return kvm_hv_get_msr(vcpu, msr, pdata, host); } static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 837465d69c6d..d6aa969e20f1 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d536d457517b..0cefba28c864 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit) +{ + int i; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + struct kvm_lapic_irq irq = {0}; + int cluster_size = op_64_bit ? 64 : 32; + int count = 0; + + irq.vector = icr & APIC_VECTOR_MASK; + irq.delivery_mode = icr & APIC_MODE_MASK; + irq.level = (icr & APIC_INT_ASSERT) != 0; + irq.trig_mode = icr & APIC_INT_LEVELTRIG; + + if (icr & APIC_DEST_MASK) + return -KVM_EINVAL; + if (icr & APIC_SHORT_MASK) + return -KVM_EINVAL; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + /* Bits above cluster_size are masked in the caller. */ + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + min += cluster_size; + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + rcu_read_unlock(); + return count; +} + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a44e568363a4..a282321329b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator { unsigned index; }; -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ +static const union kvm_mmu_page_role mmu_base_role_mask = { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + .smm = 1, + .guest_mode = 1, + .ad_disabled = 1, +}; + +#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ + (_root), (_addr)); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) @@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK; static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + static void mmu_spte_set(u64 *sptep, u64 spte); +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, { unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | + shadow_nonpresent_or_rsvd_mask; + u64 gpa = spte & ~mask; + + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + & shadow_nonpresent_or_rsvd_mask; + + return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) @@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_reset_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void) shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + */ + if (boot_cpu_data.x86_phys_bits < + 52 - shadow_nonpresent_or_rsvd_mask_len) + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(boot_cpu_data.x86_phys_bits - + shadow_nonpresent_or_rsvd_mask_len, + boot_cpu_data.x86_phys_bits - 1); } static int is_cpuid_PSE36(void) @@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) { } @@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + if (sp->role.cr4_pae != !!is_pae(vcpu) + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2392,11 +2440,12 @@ out: return sp; } -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, hpa_t root, + u64 addr) { iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && @@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { + /* + * prev_root is currently only used for 64-bit hosts. So only + * the active root_hpa is valid here. + */ + BUG_ON(root != vcpu->arch.mmu.root_hpa); + iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; @@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, } } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + addr); +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_unsync_page(vcpu, sp); } + /* + * We need to ensure that the marking of unsync pages is visible + * before the SPTE is updated to allow writes because + * kvm_mmu_sync_roots() checks the unsync flags without holding + * the MMU lock and so can race with this. If the SPTE was updated + * before the page had been marked as unsync-ed, something like the + * following could happen: + * + * CPU 1 CPU 2 + * ---------- |