diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/hyperv.c | 280 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 45 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 393 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 13 | ||||
-rw-r--r-- | arch/x86/kvm/mmu_audit.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 64 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 42 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 2287 | ||||
-rw-r--r-- | arch/x86/kvm/vmx_shadow_fields.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 244 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 2 |
14 files changed, 2358 insertions, 1050 deletions
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 01d209ab5481..4e80080f277a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -36,6 +36,8 @@ #include "trace.h" +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) + static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); @@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) struct kvm_vcpu *vcpu = NULL; int i; - if (vpidx < KVM_MAX_VCPUS) - vcpu = kvm_get_vcpu(kvm, vpidx); + if (vpidx >= KVM_MAX_VCPUS) + return NULL; + + vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) stimer_cleanup(&hv_vcpu->stimer[i]); } +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + return false; + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} +EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); + +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page) +{ + if (!kvm_hv_assist_page_enabled(vcpu)) + return false; + return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + assist_page, sizeof(*assist_page)); +} +EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); + static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; @@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: - if (!host) + case HV_X64_MSR_VP_INDEX: { + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + int vcpu_idx = kvm_vcpu_get_idx(vcpu); + u32 new_vp_index = (u32)data; + + if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; - hv->vp_index = (u32)data; + + if (new_vp_index == hv_vcpu->vp_index) + return 0; + + /* + * The VP index is initialized to vcpu_index by + * kvm_hv_vcpu_postcreate so they initially match. Now the + * VP index is changing, adjust num_mismatched_vp_indexes if + * it now matches or no longer matches vcpu_idx. + */ + if (hv_vcpu->vp_index == vcpu_idx) + atomic_inc(&hv->num_mismatched_vp_indexes); + else if (new_vp_index == vcpu_idx) + atomic_dec(&hv->num_mismatched_vp_indexes); + + hv_vcpu->vp_index = new_vp_index; break; + } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { - hv->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + hv_vcpu->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) + + /* + * Clear apic_assist portion of f(struct hv_vp_assist_page + * only, there can be valuable data in the rest which needs + * to be preserved e.g. on migration. + */ + if (__clear_user((void __user *)addr, sizeof(u32))) return 1; - hv->hv_vapic = data; + hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + gfn_to_gpa(gfn) | KVM_MSR_ENABLED, + sizeof(struct hv_vp_assist_page))) return 1; break; } @@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; - hv->runtime_offset = data - current_task_runtime_100ns(); + hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { case HV_X64_MSR_VP_INDEX: - data = hv->vp_index; + data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); @@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: - data = hv->hv_vapic; + data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: - data = current_task_runtime_100ns() + hv->runtime_offset; + data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) +static __always_inline unsigned long *sparse_set_to_vcpu_mask( + struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, + u64 *vp_bitmap, unsigned long *vcpu_bitmap) { - int i = 0, j; + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + int i, bank, sbank = 0; - if (!(valid_bank_mask & BIT_ULL(bank_no))) - return -1; + memset(vp_bitmap, 0, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS) + vp_bitmap[bank] = sparse_banks[sbank++]; - for (j = 0; j < bank_no; j++) - if (valid_bank_mask & BIT_ULL(j)) - i++; + if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { + /* for all vcpus vp_index == vcpu_idx */ + return (unsigned long *)vp_bitmap; + } - return i; + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, + (unsigned long *)vp_bitmap)) + __set_bit(i, vcpu_bitmap); + } + return vcpu_bitmap; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, u16 rep_cnt, bool ex) { struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_current = ¤t_vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = ¤t_vcpu->arch.hyperv; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - struct kvm_vcpu *vcpu; - unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0}; - unsigned long valid_bank_mask = 0; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + u64 valid_bank_mask; u64 sparse_banks[64]; - int sparse_banks_len, i; + int sparse_banks_len; bool all_cpus; if (!ex) { @@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags); + valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; } else { @@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sparse_banks_len = + bitmap_weight((unsigned long *)&valid_bank_mask, 64) * sizeof(sparse_banks[0]); if (!sparse_banks_len && !all_cpus) @@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, return HV_STATUS_INVALID_HYPERCALL_INPUT; } - cpumask_clear(&hv_current->tlb_lush); - - kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - int bank = hv->vp_index / 64, sbank = 0; - - if (!all_cpus) { - /* Banks >64 can't be represented */ - if (bank >= 64) - continue; - - /* Non-ex hypercalls can only address first 64 vCPUs */ - if (!ex && bank) - continue; - - if (ex) { - /* - * Check is the bank of this vCPU is in sparse - * set and get the sparse bank number. - */ - sbank = get_sparse_bank_no(valid_bank_mask, - bank); - - if (sbank < 0) - continue; - } - - if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64))) - continue; - } + cpumask_clear(&hv_vcpu->tlb_flush); - /* - * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we - * can't analyze it here, flush TLB regardless of the specified - * address space. - */ - __set_bit(i, vcpu_bitmap); - } + vcpu_mask = all_cpus ? NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + /* + * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't + * analyze it here, flush TLB regardless of the specified address space. + */ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_bitmap, &hv_current->tlb_lush); + vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. */ @@ -1370,6 +1407,99 @@ ret_success: ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } +static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vector + }; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + continue; + + /* We fail only when APIC is disabled */ + kvm_apic_set_irq(vcpu, &irq, NULL); + } +} + +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, + bool ex, bool fast) +{ + struct kvm *kvm = current_vcpu->kvm; + struct hv_send_ipi_ex send_ipi_ex; + struct hv_send_ipi send_ipi; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + unsigned long valid_bank_mask; + u64 sparse_banks[64]; + int sparse_banks_len; + u32 vector; + bool all_cpus; + + if (!ex) { + if (!fast) { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + sizeof(send_ipi)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = send_ipi.cpu_mask; + vector = send_ipi.vector; + } else { + /* 'reserved' part of hv_send_ipi should be 0 */ + if (unlikely(ingpa >> 32 != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = outgpa; + vector = (u32)ingpa; + } + all_cpus = false; + valid_bank_mask = BIT_ULL(0); + + trace_kvm_hv_send_ipi(vector, sparse_banks[0]); + } else { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, + sizeof(send_ipi_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, + send_ipi_ex.vp_set.format, + send_ipi_ex.vp_set.valid_bank_mask); + + vector = send_ipi_ex.vector; + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sizeof(sparse_banks[0]); + + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + + if (!sparse_banks_len) + goto ret_success; + + if (!all_cpus && + kvm_read_guest(kvm, + ingpa + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents), + sparse_banks, + sparse_banks_len)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + vcpu_mask = all_cpus ? NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + +ret_success: + return HV_STATUS_SUCCESS; +} + bool kvm_hv_hypercall_enabled(struct kvm *kvm) { return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; @@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); break; + case HVCALL_SEND_IPI: + if (unlikely(rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); + break; + case HVCALL_SEND_IPI_EX: + if (unlikely(fast || rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index d6aa969e20f1..0e66c12ed2c3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page); + static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, int timer_index) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fbb0e6df121b..3cd227ff807f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,6 +70,11 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul +static bool lapic_timer_advance_adjust_done = false; +#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +/* step-by-step approximation to mitigate fluctuation */ +#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); - if (ret) + if (ret) { + *r = 0; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; - if (*r < 0) - *r = 0; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } + } rcu_read_unlock(); return ret; @@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 guest_tsc, tsc_deadline; + u64 guest_tsc, tsc_deadline, ns; if (!lapic_in_kernel(vcpu)) return; @@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __delay(min(tsc_deadline - guest_tsc, nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + + if (!lapic_timer_advance_adjust_done) { + /* too early */ + if (guest_tsc < tsc_deadline) { + ns = (tsc_deadline - guest_tsc) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns -= min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } else { + /* too late */ + ns = (guest_tsc - tsc_deadline) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns += min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } + if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + lapic_timer_advance_adjust_done = true; + } } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; + struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; + unsigned long new_len; + if (!IS_ALIGNED(addr, 4)) return 1; vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr, sizeof(u8)); + + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; + + return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); } void kvm_apic_accept_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ed0ed39abd36..ff6ef9c3d760 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_init(void); void kvm_lapic_exit(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e843ec46609d..cf5f572f2305 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); if (!obj) - return -ENOMEM; + return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; } return 0; @@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) - return -ENOMEM; + return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = page; } return 0; @@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; if (!rmap_head->val) { - printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); + pr_err("%s: %p 0->BUG\n", __func__, spte); BUG(); } else if (!(rmap_head->val & 1)) { - rmap_printk("pte_list_remove: %p 1->0\n", spte); + rmap_printk("%s: %p 1->0\n", __func__, spte); if ((u64 *)rmap_head->val != spte) { - printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); + pr_err("%s: %p 1->BUG\n", __func__, spte); BUG(); } rmap_head->val = 0; } else { - rmap_printk("pte_list_remove: %p many->many\n", spte); + rmap_printk("%s: %p many->many\n", __func__, spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { @@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) prev_desc = desc; desc = desc->more; } - pr_err("pte_list_remove: %p many->many\n", spte); + pr_err("%s: %p many->many\n", __func__, spte); BUG(); } } +static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +{ + mmu_spte_clear_track_bits(sptep); + __pte_list_remove(sptep, rmap_head); +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmap_head); + __pte_list_remove(spte, rmap_head); } /* @@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); flush = true; } @@ -1721,7 +1727,7 @@ restart: need_flush = 1; if (pte_write(*ptep)) { - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; @@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + __pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu) - || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu.base_role; + role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; if (role.direct) role.cr4_pae = 0; role.access = access; - if (!vcpu->arch.mmu.direct_map - && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu->direct_map + && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato { iterator->addr = addr; iterator->shadow_addr = root; - iterator->level = vcpu->arch.mmu.shadow_root_level; + iterator->level = vcpu->arch.mmu->shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL && - !vcpu->arch.mmu.direct_map) + vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && + !vcpu->arch.mmu->direct_map) --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { @@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ - BUG_ON(root != vcpu->arch.mmu.root_hpa); + BUG_ON(root != vcpu->arch.mmu->root_hpa); iterator->shadow_addr - = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) @@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { - shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, addr); } @@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, int emulate = 0; gfn_t pseudo_gfn; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return 0; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { @@ -3301,7 +3307,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return false; if (!page_fault_can_be_fast(error_code)) @@ -3471,11 +3477,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + ulong roots_to_free) { int i; LIST_HEAD(invalid_list); - struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); @@ -3535,20 +3541,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; unsigned i; - if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, - vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + vcpu->arch.mmu->root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); @@ -3561,9 +3567,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); @@ -3577,7 +3583,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; int i; - root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3586,8 +3592,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; MMU_WARN_ON(VALID_PAGE(root)); @@ -3597,11 +3603,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = root; + vcpu->arch.mmu->root_hpa = root; return 0; } @@ -3611,17 +3617,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); + if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { + pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu.pae_root[i] = 0; + vcpu->arch.mmu->pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; @@ -3639,16 +3645,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); /* * If we shadow a 32 bit page table with a long mode page * table we enter this path. */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu.lm_root == NULL) { + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->lm_root == NULL) { /* * The additional page necessary for this is only * allocated on demand. @@ -3660,12 +3666,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (lm_root == NULL) return 1; - lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; - vcpu->arch.mmu.lm_root = lm_root; + vcpu->arch.mmu->lm_root = lm_root; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } return 0; @@ -3673,7 +3679,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return mmu_alloc_direct_roots(vcpu); else return mmu_alloc_shadow_roots(vcpu); @@ -3684,17 +3690,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; sp = page_header(root); /* @@ -3725,7 +3730,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; @@ -3799,7 +3804,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) goto exit; walk_shadow_page_lockless_begin(vcpu); @@ -3816,7 +3821,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, iterator.level); } @@ -3895,7 +3900,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; walk_shadow_page_lockless_begin(vcpu); @@ -3922,7 +3927,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); |