diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 974 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 88 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 21 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 50 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 318 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 252 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 182 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 60 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 1157 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 48 |
10 files changed, 2443 insertions, 707 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71aa3da2a0b7..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,10 +19,12 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -45,7 +47,6 @@ #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> -#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; @@ -104,45 +105,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +125,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,42 +132,11 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include <trace/events/kvm.h> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault and mmu.page_fault: - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; @@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. - */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - - access = mask & ACC_ALL; - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } @@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. - */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -796,12 +428,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -976,34 +587,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -1640,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1666,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1710,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } @@ -1769,13 +1362,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1919,12 +1507,26 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1973,12 +1575,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG @@ -2577,13 +2191,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2615,8 +2223,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,23 +2239,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); - } - return true; - } - if (is_mmio_spte(pte)) + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); + } + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2692,7 +2312,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ @@ -2885,8 +2505,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2946,132 +2566,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. - */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte = 0; - int ret = 0; + u64 spte; struct kvm_mmu_page *sp; + int ret; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) { + if (spte & PT_WRITABLE_MASK) kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= spte_shadow_dirty_mask(spte); - } - if (speculative) - spte = mark_spte_for_access_track(spte); - -set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3113,6 +2643,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3161,7 +2700,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } @@ -3239,8 +2778,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3248,6 +2788,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3272,7 +2814,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3285,14 +2834,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; - u64 spte = *it.sptep; + int level = *goal_levelp; - if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3302,26 +2849,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; |