diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 76 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 171 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 78 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 51 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 489 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 97 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 4 |
11 files changed, 771 insertions, 240 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f4f30d0c25c4..5720e78b2f7b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_7_0_ecx_x86_features = F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG); + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b3705ae52824..4c4f4263420c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -812,6 +812,19 @@ static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) return assign_eip_near(ctxt, ctxt->_eip + rel); } +static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear, + void *data, unsigned size) +{ + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true); +} + +static int linear_write_system(struct x86_emulate_ctxt *ctxt, + ulong linear, void *data, + unsigned int size) +{ + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true); +} + static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, @@ -823,7 +836,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false); } static int segmented_write_std(struct x86_emulate_ctxt *ctxt, @@ -837,7 +850,7 @@ static int segmented_write_std(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false); } /* @@ -1496,8 +1509,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, index << 3 | 0x2); addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, - &ctxt->exception); + return linear_read_system(ctxt, addr, desc, sizeof *desc); } static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, @@ -1560,8 +1572,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), - &ctxt->exception); + return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc)); } /* allowed just for 8 bytes segments */ @@ -1575,8 +1586,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, - &ctxt->exception); + return linear_write_system(ctxt, addr, desc, sizeof *desc); } static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, @@ -1737,8 +1747,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { - ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, - sizeof(base3), &ctxt->exception); + ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3)); if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | @@ -2051,11 +2060,11 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; - rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); + rc = linear_read_system(ctxt, cs_addr, &cs, 2); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); + rc = linear_read_system(ctxt, eip_addr, &eip, 2); if (rc != X86EMUL_CONTINUE) return rc; @@ -2919,12 +2928,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 base |= ((u64)base3) << 32; #endif - r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); + r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); + r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -3053,35 +3062,30 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); if (ret != X86EMUL_CONTINUE) return ret; save_state_to_tss16(ctxt, &tss_seg); - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); if (ret != X86EMUL_CONTINUE) return ret; - ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg); if (ret != X86EMUL_CONTINUE) return ret; if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(ctxt, new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - &ctxt->exception); + ret = linear_write_system(ctxt, new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link); if (ret != X86EMUL_CONTINUE) return ret; } @@ -3197,38 +3201,34 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); u32 eip_offset = offsetof(struct tss_segment_32, eip); u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); - ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg); if (ret != X86EMUL_CONTINUE) return ret; save_state_to_tss32(ctxt, &tss_seg); /* Only GP registers and segment selectors are saved */ - ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, - ldt_sel_offset - eip_offset, &ctxt->exception); + ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip, + ldt_sel_offset - eip_offset); if (ret != X86EMUL_CONTINUE) return ret; - ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg); if (ret != X86EMUL_CONTINUE) return ret; if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(ctxt, new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - &ctxt->exception); + ret = linear_write_system(ctxt, new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link); if (ret != X86EMUL_CONTINUE) return ret; } @@ -4189,7 +4189,9 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = eax & 0xff; else maxphyaddr = 36; - rsvd = rsvd_bits(maxphyaddr, 62); + rsvd = rsvd_bits(maxphyaddr, 63); + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) + rsvd &= ~CR3_PCID_INVD; } if (new_val & rsvd) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 46ff64da44ca..af8caf965baa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1242,6 +1242,121 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_get_msr(vcpu, msr, pdata); } +static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) +{ + int i = 0, j; + + if (!(valid_bank_mask & BIT_ULL(bank_no))) + return -1; + + for (j = 0; j < bank_no; j++) + if (valid_bank_mask & BIT_ULL(j)) + i++; + + return i; +} + +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, + u16 rep_cnt, bool ex) +{ + struct kvm *kvm = current_vcpu->kvm; + struct kvm_vcpu_hv *hv_current = ¤t_vcpu->arch.hyperv; + struct hv_tlb_flush_ex flush_ex; + struct hv_tlb_flush flush; + struct kvm_vcpu *vcpu; + unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0}; + unsigned long valid_bank_mask = 0; + u64 sparse_banks[64]; + int sparse_banks_len, i; + bool all_cpus; + + if (!ex) { + if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_flush_tlb(flush.processor_mask, + flush.address_space, flush.flags); + + sparse_banks[0] = flush.processor_mask; + all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + } else { + if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, + sizeof(flush_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, + flush_ex.hv_vp_set.format, + flush_ex.address_space, + flush_ex.flags); + + valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; + all_cpus = flush_ex.hv_vp_set.format != + HV_GENERIC_SET_SPARSE_4K; + + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sizeof(sparse_banks[0]); + + if (!sparse_banks_len && !all_cpus) + goto ret_success; + + if (!all_cpus && + kvm_read_guest(kvm, + ingpa + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents), + sparse_banks, + sparse_banks_len)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + cpumask_clear(&hv_current->tlb_lush); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + int bank = hv->vp_index / 64, sbank = 0; + + if (!all_cpus) { + /* Banks >64 can't be represented */ + if (bank >= 64) + continue; + + /* Non-ex hypercalls can only address first 64 vCPUs */ + if (!ex && bank) + continue; + + if (ex) { + /* + * Check is the bank of this vCPU is in sparse + * set and get the sparse bank number. + */ + sbank = get_sparse_bank_no(valid_bank_mask, + bank); + + if (sbank < 0) + continue; + } + + if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64))) + continue; + } + + /* + * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we + * can't analyze it here, flush TLB regardless of the specified + * address space. + */ + __set_bit(i, vcpu_bitmap); + } + + kvm_make_vcpus_request_mask(kvm, + KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, + vcpu_bitmap, &hv_current->tlb_lush); + +ret_success: + /* We always do full TLB flush, set rep_done = rep_cnt. */ + return (u64)HV_STATUS_SUCCESS | + ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); +} + bool kvm_hv_hypercall_enabled(struct kvm *kvm) { return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; @@ -1315,7 +1430,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; uint16_t code, rep_idx, rep_cnt; - bool fast, longmode; + bool fast, longmode, rep; /* * hypercall generates UD from non zero cpl and real mode @@ -1345,31 +1460,34 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) #endif code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; + fast = !!(param & HV_HYPERCALL_FAST_BIT); + rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; + rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; + rep = !!(rep_cnt || rep_idx); trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - /* Hypercall continuation is not supported yet */ - if (rep_cnt || rep_idx) { - ret = HV_STATUS_INVALID_HYPERCALL_CODE; - goto out; - } - switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: + if (unlikely(rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: + if (unlikely(rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); if (ret != HV_STATUS_INVALID_PORT_ID) break; /* maybe userspace knows this conn_id: fall through */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (!vcpu_to_synic(vcpu)->active) { - ret = HV_STATUS_INVALID_HYPERCALL_CODE; + if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } vcpu->run->exit_reason = KVM_EXIT_HYPERV; @@ -1380,12 +1498,39 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + if (unlikely(fast || !rep_cnt || rep_idx)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + break; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + if (unlikely(fast || rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + break; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + if (unlikely(fast || !rep_cnt || rep_idx)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + break; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + if (unlikely(fast || rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } -out: return kvm_hv_hypercall_complete(vcpu, ret); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3773c4625114..b5cd8465d44f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2002,13 +2002,11 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } - if ((old_value ^ value) & X2APIC_ENABLE) { - if (value & X2APIC_ENABLE) { - kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); - } else - kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); - } + if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + kvm_x86_ops->set_virtual_apic_mode(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index edce055e9fd7..ed0ed39abd36 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -16,6 +16,13 @@ #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +enum lapic_mode { + LAPIC_MODE_DISABLED = 0, + LAPIC_MODE_INVALID = X2APIC_ENABLE, + LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE, + LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, +}; + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -89,6 +96,7 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -220,4 +228,10 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); + +static inline enum lapic_mode kvm_apic_mode(u64 apic_base) +{ + return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d634f0332c0f..d594690d8b95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -222,7 +222,6 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; static void mmu_spte_set(u64 *sptep, u64 spte); -static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -3343,51 +3342,48 @@ out_unlock: return RET_PF_RETRY; } - -static void mmu_free_roots(struct kvm_vcpu *vcpu) +static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, + struct list_head *invalid_list) { - int i; struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(*root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL && - (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL || - vcpu->arch.mmu.direct_map)) { - hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - spin_lock(&vcpu->kvm->mmu_lock); - sp = page_header(root); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - } - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + *root_hpa = INVALID_PAGE; +} + +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + LIST_HEAD(invalid_list); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + if (!VALID_PAGE(mmu->root_hpa)) return; - } spin_lock(&vcpu->kvm->mmu_lock); - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - if (root) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); - } - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); + } else { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], + &invalid_list); + mmu->root_hpa = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; } +EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { @@ -3720,7 +3716,6 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, u32 error_code, gfn_t gfn) @@ -3812,6 +3807,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, struct kvm_memory_slot *slot; bool async; + /* + * Don't expose private memslots to L2. + */ + if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + *pfn = KVM_PFN_NOSLOT; + return false; + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); @@ -3951,7 +3954,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { - mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu); } static unsigned long get_cr3(struct kvm_vcpu *vcpu) @@ -4473,6 +4476,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.guest_mode = is_guest_mode(vcpu); context->base_role.smm = is_smm(vcpu); context->base_role.ad_disabled = (shadow_accessed_mask == 0); context->page_fault = tdp_page_fault; @@ -4539,6 +4543,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.guest_mode = is_guest_mode(vcpu); context->base_role.smm = is_smm(vcpu); reset_shadow_zero_bits_mask(vcpu, context); } @@ -4564,7 +4569,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_hpa = INVALID_PAGE; context->direct_map = false; context->base_role.ad_disabled = !accessed_dirty; - + context->base_role.guest_mode = 1; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4664,7 +4669,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4825,6 +4830,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.guest_mode = 1; mask.ad_disabled = 1; /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 950ec50f77c3..695b0bd02220 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1768,7 +1768,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long npages, npinned, size; unsigned long locked, lock_limit; struct page **pages; - int first, last; + unsigned long first, last; + + if (ulen == 0 || uaddr + ulen < uaddr) + return NULL; /* Calculate number of pages. */ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; @@ -1855,13 +1858,13 @@ static void __unregister_enc_region_locked(struct kvm *kvm, static struct kvm *svm_vm_alloc(void) { - struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL); + struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); return &kvm_svm->kvm; } static void svm_vm_free(struct kvm *kvm) { - kfree(to_kvm_svm(kvm)); + vfree(to_kvm_svm(kvm)); } static void sev_vm_destroy(struct kvm *kvm) @@ -5062,7 +5065,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; } @@ -6949,6 +6952,9 @@ static int svm_register_enc_region(struct kvm *kvm, if (!sev_guest(kvm)) return -ENOTTY; + if (range->addr > ULONG_MAX || range->size > ULONG_MAX) + return -EINVAL; + region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) return -ENOMEM; @@ -7100,7 +7106,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, - .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .set_virtual_apic_mode = svm_set_virtual_apic_mode, .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 9807c314c478..0f997683404f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1367,6 +1367,57 @@ TRACE_EVENT(kvm_hv_timer_state, __entry->vcpu_id, __entry->hv_timer_in_use) ); + +/* + * Tracepoint for kvm_hv_flush_tlb. + */ +TRACE_EVENT(kvm_hv_flush_tlb, + TP_PROTO(u64 processor_mask, u64 address_space, u64 flags), + TP_ARGS(processor_mask, address_space, flags), + + TP_STRUCT__entry( + __field(u64, processor_mask) + __field(u64, address_space) + __field(u64, flags) + ), + + TP_fast_assign( + __entry->processor_mask = processor_mask; + __entry->address_space = address_space; + __entry->flags = flags; + ), + + TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx", + __entry->processor_mask, __entry->address_space, + __entry->flags) +); + +/* + * Tracepoint for kvm_hv_flush_tlb_ex. + */ +TRACE_EVENT(kvm_hv_flush_tlb_ex, + TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags), + TP_ARGS(valid_bank_mask, format, address_space, flags), + + TP_STRUCT__entry( + __field(u64, valid_bank_mask) + __field(u64, format) + __field(u64, address_space) + __field(u64, flags) + ), + + TP_fast_assign( + __entry->valid_bank_mask = valid_bank_mask; + __entry->format = format; + __entry->address_space = address_space; + __entry->flags = flags; + ), + + TP_printk("valid_bank_mask 0x%llx format 0x%llx " + "address_space 0x%llx flags 0x%llx", + __entry->valid_bank_mask, __entry->format, + __entry->address_space, __entry->flags) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 40aa29204baf..fc61e25966e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -242,7 +242,11 @@ struct shared_msr_entry { * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). - * If there are changes in this struct, VMCS12_REVISION must be changed. + * + * IMPORTANT: Changing the layout of existing fields in this structure + * will break save/restore compatibility with older kvm releases. When + * adding new fields, either use space in the reserved padding* arrays + * or add the new fields to the end of the structure. */ typedef u64 natural_width; struct __packed vmcs12 { @@ -265,17 +269,14 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 posted_intr_desc_addr; - u64 vm_function_control; u64 ept_pointer; u64 eoi_exit_bitmap0; u64 eoi_exit_bitmap1; u64 eoi_exit_bitmap2; u64 eoi_exit_bitmap3; - u64 eptp_list_address; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; - u64 pml_address; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; @@ -288,7 +289,12 @@ struct __packed vmcs12 { u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; - u64 padding64[8]; /* room for future expansion */ + u64 vmread_bitmap; + u64 vmwrite_bitmap; + u64 vm_function_control; + u64 eptp_list_address; + u64 pml_address; + u64 padding64[3]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -397,7 +403,6 @@ struct __packed vmcs12 { u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; - u16 guest_pml_index; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -405,12 +410,172 @@ struct __packed vmcs12 { u16 host_fs_selector; u16 host_gs_selector; u16 host_tr_selector; + u16 guest_pml_index; }; /* + * For save/restore compatibility, the vmcs12 field offsets must not change. + */ +#define CHECK_OFFSET(field, loc) \ + BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ + "Offset of " #field " in struct vmcs12 has changed.") + +static inline void vmx_check_vmcs12_offsets(void) { + CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(abort, 4); + CHECK_OFFSET(launch_state, 8); + CHECK_OFFSET(io_bitmap_a, 40); + CHECK_OFFSET(io_bitmap_b, 48); + CHECK_OFFSET(msr_bitmap, 56); + CHECK_OFFSET(vm_exit_msr_store_addr, 64); + CHECK_OFFSET(vm_exit_msr_load_addr, 72); + CHECK_OFFSET(vm_entry_msr_load_addr, 80); + CHECK_OFFSET(tsc_offset, 88); + CHECK_OFFSET(virtual_apic_page_addr, 96); + CHECK_OFFSET(apic_access_addr, 104); + CHECK_OFFSET(posted_intr_desc_addr, 112); + CHECK_OFFSET(ept_pointer, 120); + CHECK_OFFSET(eoi_exit_bitmap0, 128); + CHECK_OFFSET(eoi_exit_bitmap1, 136); + CHECK_OFFSET(eoi_exit_bitmap2, 144); + CHECK_OFFSET(eoi_exit_bitmap3, 152); + CHECK_OFFSET(xss_exit_bitmap, 160); + CHECK_OFFSET(guest_physical_address, 168); + CHECK_OFFSET(vmcs_link_pointer, 176); + CHECK_OFFSET(guest_ia32_debugctl, 184); + CHECK_OFFSET(guest_ia32_pat, 192); + CHECK_OFFSET(guest_ia32_efer, 200); + CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); + CHECK_OFFSET(guest_pdptr0, 216); + CHECK_OFFSET(guest_pdptr1, 224); + CHECK_OFFSET(guest_pdptr2, 232); + CHECK_OFFSET(guest_pdptr3, 240); + CHECK_OFFSET(guest_bndcfgs, 248); + CHECK_OFFSET(host_ia32_pat, 256); + CHECK_OFFSET(host_ia32_efer, 264); + CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); + CHECK_OFFSET(vmread_bitmap, 280); + CHECK_OFFSET(vmwrite_bitmap, 288); + CHECK_OFFSET(vm_function_control, 296); + CHECK_OFFSET(eptp_list_address, 304); + CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(cr0_guest_host_mask, 344); + CHECK_OFFSET(cr4_guest_host_mask, 352); + CHECK_OFFSET(cr0_read_shadow, 360); + CHECK_OFFSET(cr4_read_shadow, 368); + CHECK_OFFSET(cr3_target_value0, 376); + CHECK_OFFSET(cr3_target_value1, 384); + CHECK_OFFSET(cr3_target_value2, 392); + CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(exit_qualification, 408); + CHECK_OFFSET(guest_linear_address, 416); + CHECK_OFFSET(guest_cr0, 424); + CHECK_OFFSET(guest_cr3, 432); + CHECK_OFFSET(guest_cr4, 440); + CHECK_OFFSET(guest_es_base, 448); + CHECK_OFFSET(guest_cs_base, 456); + CHECK_OFFSET(guest_ss_base, 464); + CHECK_OFFSET(guest_ds_base, 472); + CHECK_OFFSET(guest_fs_base, 480); + CHECK_OFFSET(guest_gs_base, 488); + CHECK_OFFSET(guest_ldtr_base, 496); + CHECK_OFFSET(guest_tr_base, 504); + CHECK_OFFSET(guest_gdtr_base, 512); + CHECK_OFFSET(guest_idtr_base, 520); + CHECK_OFFSET(guest_dr7, 528); + CHECK_OFFSET(guest_rsp, 536); + CHECK_OFFSET(guest_rip, 544); + CHECK_OFFSET(guest_rflags, 552); + CHECK_OFFSET(guest_pending_dbg_exceptions, 560); + CHECK_OFFSET(guest_sysenter_esp, 568); + CHECK_OFFSET(guest_sysenter_eip, 576); + CHECK_OFFSET(host_cr0, 584); + CHECK_OFFSET(host_cr3, 592); + CHECK_OFFSET(host_cr4, 600); + CHECK_OFFSET(host_fs_base, 608); + CHECK_OFFSET(host_gs_base, 616); + CHECK_OFFSET(host_tr_base, 624); + CHECK_OFFSET(host_gdtr_base, 632); + CHECK_OFFSET(host_idtr_base, 640); + CHECK_OFFSET(host_ia32_sysenter_esp, 648); + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); + CHECK_OFFSET(page_fault_error_code_mask, 756); + CHECK_OFFSET(page_fault_error_code_match, 760); + CHECK_OFFSET(cr3_target_count, 764); + CHECK_OFFSET(vm_exit_controls, 768); + CHECK_OFFSET(vm_exit_msr_store_count, 772); + CHECK_OFFSET(vm_exit_msr_load_count, 776); + CHECK_OFFSET(vm_entry_controls, 780); + CHECK_OFFSET(vm_entry_msr_load_count, 784); + CHECK_OFFSET(vm_entry_intr_info_field, 788); + CHECK_OFFSET(vm_entry_exception_error_code, 792); + CHECK_OFFSET(vm_entry_instruction_len, 796); + CHECK_OFFSET(tpr_threshold, 800); + CHECK_OFFSET(secondary_vm_exec_control, 804); + CHECK_OFFSET(vm_instruction_error, 808); + CHECK_OFFSET(vm_exit_reason, 812); + CHECK_OFFSET(vm_exit_intr_info, 816); + CHECK_OFFSET(vm_exit_intr_error_code, 820); + CHECK_OFFSET(idt_vectoring_info_field, 824); + CHECK_OFFSET(idt_vectoring_error_code, 828); + CHECK_OFFSET(vm_exit_instruction_len, 832); + CHECK_OFFSET(vmx_instruction_info, 836); + CHECK_OFFSET(guest_es_limit, 840); + CHECK_OFFSET(guest_cs_limit, 844); + CHECK_OFFSET(guest_ss_limit, 848); + CHECK_OFFSET(guest_ds_limit, 852); + CHECK_OFFSET(guest_fs_limit, 856); + CHECK_OFFSET(guest_gs_limit, 860); + CHECK_OFFSET(guest_ldtr_limit, 864); + CHECK_OFFSET(guest_tr_limit, 868); + CHECK_OFFSET(guest_gdtr_limit, 872); + CHECK_OFFSET(guest_idtr_limit, 876); + CHECK_OFFSET(gue |