Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini: "PPC: - Better machine check handling for HV KVM - Ability to support guests with threads=2, 4 or 8 on POWER9 - Fix for a race that could cause delayed recognition of signals - Fix for a bug where POWER9 guests could sleep with interrupts pending. ARM: - VCPU request overhaul - allow timer and PMU to have their interrupt number selected from userspace - workaround for Cavium erratum 30115 - handling of memory poisonning - the usual crop of fixes and cleanups s390: - initial machine check forwarding - migration support for the CMMA page hinting information - cleanups and fixes x86: - nested VMX bugfixes and improvements - more reliable NMI window detection on AMD - APIC timer optimizations Generic: - VCPU request overhaul + documentation of common code patterns - kvm_stat improvements" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (124 commits) Update my email address kvm: vmx: allow host to access guest MSR_IA32_BNDCFGS x86: kvm: mmu: use ept a/d in vmcs02 iff used in vmcs12 kvm: x86: mmu: allow A/D bits to be disabled in an mmu x86: kvm: mmu: make spte mmio mask more explicit x86: kvm: mmu: dead code thanks to access tracking KVM: PPC: Book3S: Fix typo in XICS-on-XIVE state saving code KVM: PPC: Book3S HV: Close race with testing for signals on guest entry KVM: PPC: Book3S HV: Simplify dynamic micro-threading code KVM: x86: remove ignored type attribute KVM: LAPIC: Fix lapic timer injection delay KVM: lapic: reorganize restart_apic_timer KVM: lapic: reorganize start_hv_timer kvm: nVMX: Check memory operand to INVVPID KVM: s390: Inject machine check into the nested guest KVM: s390: Inject machine check into the guest tools/kvm_stat: add new interactive command 'b' tools/kvm_stat: add new command line switch '-i' tools/kvm_stat: fix error on interactive command 'g' KVM: SVM: suppress unnecessary NMI singlestep on GIF=0 and nested exit ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-07-06 18:38:31 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-07-06 18:38:31 -0700
commit: c136b84393d4e340e1b53fc7f737dd5827b19ee5 (patch)
tree: 985a1bdfafe7ec5ce2d3c738f601cad3998d8ce9 /arch/x86/kvm
parent: e0f25a3f2d052e36ff67a9b4db835c3e27e950d8 (diff)
parent: 1372324b328cd5dabaef5e345e37ad48c63df2a9 (diff)
10 files changed, 357 insertions, 208 deletions
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a6fd40aade7c..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
 
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
 static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80890dee66ce..fb0055953fbc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	ctxt->_eip += sizeof(_type);					\
-	_x = *(_type __aligned(1) *) ctxt->fetch.ptr;			\
+	memcpy(&_x, ctxt->fetch.ptr, sizeof(_type));			\
 	ctxt->fetch.ptr += sizeof(_type);				\
 	_x;								\
 })
@@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
 }
 
 /*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+	return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+	bool cr4_osfxsr;
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return __fxstate_size(16);
+
+	cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+	return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
  * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
  *  1) 16 bit mode
  *  2) 32 bit mode
@@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
 static int em_fxsave(struct x86_emulate_ctxt *ctxt)
 {
 	struct fxregs_state fx_state;
-	size_t size;
 	int rc;
 
 	rc = check_fxsr(ctxt);
@@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
-		size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
-	else
-		size = offsetof(struct fxregs_state, xmm_space[0]);
-
-	return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
-}
-
-static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
-		struct fxregs_state *new)
-{
-	int rc = X86EMUL_CONTINUE;
-	struct fxregs_state old;
-
-	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	/*
-	 * 64 bit host will restore XMM 8-15, which is not correct on non-64
-	 * bit guests.  Load the current values in order to preserve 64 bit
-	 * XMMs after fxrstor.
-	 */
-#ifdef CONFIG_X86_64
-	/* XXX: accessing XMM 8-15 very awkwardly */
-	memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
-#endif
-
-	/*
-	 * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
-	 * does save and restore MXCSR.
-	 */
-	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
-		memcpy(new->xmm_space, old.xmm_space, 8 * 16);
-
-	return rc;
+	return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+		                   fxstate_size(ctxt));
 }
 
 static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
 {
 	struct fxregs_state fx_state;
 	int rc;
+	size_t size;
 
 	rc = check_fxsr(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
+	ctxt->ops->get_fpu(ctxt);
 
-	if (fx_state.mxcsr >> 16)
-		return emulate_gp(ctxt, 0);
+	size = fxstate_size(ctxt);
+	if (size < __fxstate_size(16)) {
+		rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+		if (rc != X86EMUL_CONTINUE)
+			goto out;
+	}
 
-	ctxt->ops->get_fpu(ctxt);
+	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+	if (rc != X86EMUL_CONTINUE)
+		goto out;
 
-	if (ctxt->mode < X86EMUL_MODE_PROT64)
-		rc = fxrstor_fixup(ctxt, &fx_state);
+	if (fx_state.mxcsr >> 16) {
+		rc = emulate_gp(ctxt, 0);
+		goto out;
+	}
 
 	if (rc == X86EMUL_CONTINUE)
 		rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
 
+out:
 	ctxt->ops->put_fpu(ctxt);
 
 	return rc;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d24c8742d9b0..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
 static void cancel_hv_timer(struct kvm_lapic *apic)
 {
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
 	preempt_disable();
 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
 	apic->lapic_timer.hv_timer_in_use = false;
@@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
 
 static bool start_hv_timer(struct kvm_lapic *apic)
 {
-	u64 tscdeadline = apic->lapic_timer.tscdeadline;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+	int r;
 
-	if ((atomic_read(&apic->lapic_timer.pending) &&
-		!apic_lvtt_period(apic)) ||
-		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
-		if (apic->lapic_timer.hv_timer_in_use)
-			cancel_hv_timer(apic);
-	} else {
-		apic->lapic_timer.hv_timer_in_use = true;
-		hrtimer_cancel(&apic->lapic_timer.timer);
+	if (!kvm_x86_ops->set_hv_timer)
+		return false;
+
+	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+		return false;
 
-		/* In case the sw timer triggered in the window */
-		if (atomic_read(&apic->lapic_timer.pending) &&
-			!apic_lvtt_period(apic))
-			cancel_hv_timer(apic);
+	r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+	if (r < 0)
+		return false;
+
+	ktimer->hv_timer_in_use = true;
+	hrtimer_cancel(&ktimer->timer);
+
+	/*
+	 * Also recheck ktimer->pending, in case the sw timer triggered in
+	 * the window.  For periodic timer, leave the hv timer running for
+	 * simplicity, and the deadline will be recomputed on the next vmexit.
+	 */
+	if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+		if (r)
+			apic_timer_expired(apic);
+		return false;
 	}
-	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
-			apic->lapic_timer.hv_timer_in_use);
-	return apic->lapic_timer.hv_timer_in_use;
+
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+	return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+	if (apic->lapic_timer.hv_timer_in_use)
+		cancel_hv_timer(apic);
+	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+		return;
+
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+		start_sw_period(apic);
+	else if (apic_lvtt_tscdeadline(apic))
+		start_sw_tscdeadline(apic);
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+	if (!start_hv_timer(apic))
+		start_sw_timer(apic);
 }
 
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 
 	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
 		advance_periodic_target_expiration(apic);
-		if (!start_hv_timer(apic))
-			start_sw_period(apic);
+		restart_apic_timer(apic);
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
 
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	WARN_ON(apic->lapic_timer.hv_timer_in_use);
-
-	start_hv_timer(apic);
+	restart_apic_timer(vcpu->arch.apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
@@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	/* Possibly the TSC deadline timer is not enabled yet */
-	if (!apic->lapic_timer.hv_timer_in_use)
-		return;
-
-	cancel_hv_timer(apic);
+	if (apic->lapic_timer.hv_timer_in_use)
+		start_sw_timer(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
-	if (atomic_read(&apic->lapic_timer.pending))
-		return;
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
-		start_sw_period(apic);
-	else if (apic_lvtt_tscdeadline(apic))
-		start_sw_tscdeadline(apic);
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	restart_apic_timer(apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
-		if (set_target_expiration(apic) &&
-			!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
-			start_sw_period(apic);
-	} else if (apic_lvtt_tscdeadline(apic)) {
-		if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
-			start_sw_tscdeadline(apic);
-	}
+	if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+	    && !set_target_expiration(apic))
+		return;
+
+	restart_apic_timer(apic);
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
  * LAPIC interface
  *----------------------------------------------------------------------
  */
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	if (!lapic_in_kernel(vcpu))
-		return 0;
-
-	return apic->lapic_timer.tscdeadline;
-}
-
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb8225969255..aafd399cf8c6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
 
 /*
- * The mask/value to distinguish a PTE that has been marked not-present for
- * access tracking purposes.
- * The mask would be either 0 if access tracking is disabled, or
- * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
  */
 static u64 __read_mostly shadow_acc_track_mask;
 static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 {
+	BUG_ON((mmio_mask & mmio_value) != mmio_value);
+	shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
 	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+	return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return !(spte & shadow_acc_track_value);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
 static inline bool is_access_track_spte(u64 spte)
 {
-	/* Always false if shadow_acc_track_mask is zero.  */
-	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
 /*
@@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 	u64 mask = generation_mmio_spte_mask(gen);
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
-	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
+	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
 
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
@@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 
 static bool is_mmio_spte(u64 spte)
 {
-	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+	return (spte & shadow_mmio_mask) == shadow_mmio_value;
 }
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 	return likely(kvm_gen == spte_gen);
 }
 
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ *  - Setting either @accessed_mask or @dirty_mask requires setting both
+ *  - At least one of @accessed_mask or @acc_track_mask must be set
+ */
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 		u64 acc_track_mask)
 {
-	if (acc_track_mask != 0)
-		acc_track_mask |= SPTE_SPECIAL_MASK;
+	BUG_ON(!dirty_mask != !accessed_mask);
+	BUG_ON(!accessed_mask && !acc_track_mask);
+	BUG_ON(acc_track_mask & shadow_acc_track_value);
 
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
@@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
 	shadow_acc_track_mask = acc_track_mask;
-	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
 	    is_access_track_spte(spte))
 		return true;
 
-	if (shadow_accessed_mask) {
+	if (spte_ad_enabled(spte)) {
 		if ((spte & shadow_accessed_mask) == 0 ||
 	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
 			return true;
@@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
 
 static bool is_accessed_spte(u64 spte)
 {
-	return shadow_accessed_mask ? spte & shadow_accessed_mask
-				    : !is_access_track_spte(spte);
+	u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+	return accessed_mask ? spte & accessed_mask
+			     : !is_access_track_spte(spte);
 }
 
 static bool is_dirty_spte(u64 spte)
 {
-	return shadow_dirty_mask ? spte & shadow_dirty_mask
-				 : spte & PT_WRITABLE_MASK;
+	u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 }
 
 /* Rules for using mmu_spte_set:
@@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static u64 mark_spte_for_access_track(u64 spte)
 {
-	if (shadow_accessed_mask != 0)
+	if (spte_ad_enabled(spte))
 		return spte & ~shadow_accessed_mask;
 
-	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+	if (is_access_track_spte(spte))
 		return spte;
 
 	/*
@@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
 	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 		shadow_acc_track_saved_bits_shift;
 	spte &= ~shadow_acc_track_mask;
-	spte |= shadow_acc_track_value;
 
 	return spte;
 }
@@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
 	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
 			 & shadow_acc_track_saved_bits_mask;
 
+	WARN_ON_ONCE(spte_ad_enabled(spte));
 	WARN_ON_ONCE(!is_access_track_spte(spte));
 
 	new_spte &= ~shadow_acc_track_mask;
@@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
 	if (!is_accessed_spte(spte))
 		return false;
 
-	if (shadow_accessed_mask) {
+	if (spte_ad_enabled(spte)) {
 		clear_bit((ffs(shadow_accessed_mask) - 1),
 			  (unsigned long *)sptep);
 	} else {
@@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
 	return mmu_spte_update(sptep, spte);
 }
 
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+					       (unsigned long *)sptep);
+	if (was_writable)
+		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+	return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ *	- D bit on ad-enabled SPTEs, and
+ *	- W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
 	u64 *sptep;
@@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_clear_dirty(sptep);
+		if (spte_ad_enabled(*sptep))
+			flush |= spte_clear_dirty(sptep);
+		else
+			flush |= wrprot_ad_disabled_spte(sptep);
 
 	return flush;
 }
@@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_set_dirty(sptep);
+		if (spte_ad_enabled(*sptep))
+			flush |= spte_set_dirty(sptep);
 
 	return flush;
 }
@@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 }
 
 /**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
  * @kvm: kvm instance
  * @slot: slot to clear D-bit
  * @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1766,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	u64 *sptep;
 	struct rmap_iterator iter;
 
-	/*
-	 * If there's no access bit in the secondary pte set by the hardware and
-	 * fast access tracking is also not enabled, it's up to gup-fast/gup to
-	 * set the access bit in the primary pte or in the page structure.
-	 */
-	if (!shadow_accessed_mask && !shadow_acc_track_mask)
-		goto out;
-
 	for_each_rmap_spte(rmap_head, &iter, sptep)
 		if (is_accessed_spte(*sptep))
 			return 1;
-out:
 	return 0;
 }
 
@@ -1798,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	/*
-	 * In case of absence of EPT Access and Dirty Bits supports,
-	 * emulate the accessed bit for EPT, by checking if this page has
-	 * an EPT mapping, and clearing it if it does. On the next access,
-	 * a new EPT mapping will be established.
-	 * This has some overhead, but not as much as the cost of swapping
-	 * out actively used pages or breaking up actively used hugepages.
-	 */
-	if (!shadow_accessed_mask && !shadow_acc_track_mask)
-		return kvm_handle_hva_range(kvm, start, end, 0,
-					    kvm_unmap_rmapp);
-
 	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
 }
 
@@ -2398,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
 	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+	       shadow_user_mask | shadow_x_mask;
+
+	if (sp_ad_disabled(sp))
+		spte |= shadow_acc_track_value;
+	else
+		spte |= shadow_accessed_mask;
 
 	mmu_spte_set(sptep, spte);
 
@@ -2666,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 {
 	u64 spte = 0;
 	int ret = 0;
+	struct kvm_mmu_page *sp;
 
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
+	sp = page_header(__pa(sptep));
+	if (sp_ad_disabled(sp))
+		spte |= shadow_acc_track_value;
+
 	/*
 	 * For the EPT case, shadow_present_mask is 0 if hardware
 	 * supports exec-only page table entries.  In that case,
@@ -2678,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 */
 	spte |= shadow_present_mask;
 	if (!speculative)
-		spte |= shadow_accessed_mask;
+		spte |= spte_shadow_accessed_mask(spte);
 
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
@@ -2735,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	if (pte_access & ACC_WRITE_MASK) {
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
-		spte |= shadow_dirty_mask;
+		spte |= spte_shadow_dirty_mask(spte);
 	}
 
 	if (speculative)
@@ -2877,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
 
+	sp = page_header(__pa(sptep));
+
 	/*
-	 * Since it's no accessed bit on EPT, it's no way to
-	 * distinguish between actually accessed translations
-	 * and prefetched, so disable pte prefetch if EPT is
-	 * enabled.
+	 * Without accessed bits, there's no way to distinguish between
+	 * actually accessed translations and prefetched, so disable pte
+	 * prefetch if accessed bits aren't available.
 	 */
-	if (!shadow_accessed_mask)
+	if (sp_ad_disabled(sp))
 		return;
 
-	sp = page_header(__pa(sptep));
 	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
 		return;
 
@@ -4290,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 	context->base_role.word = 0;
 	context->base_role.smm = is_smm(vcpu);
+	context->base_role.ad_disabled = (shadow_accessed_mask == 0);
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
@@ -4377,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->root_level = context->shadow_root_level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
+	context->base_role.ad_disabled = !accessed_dirty;
 
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
@@ -4636,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	mask.smep_andnot_wp = 1;
 	mask.smap_andnot_wp = 1;
 	mask.smm = 1;
+	mask.ad_disabled = 1;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 330bf3a811fb..a276834950c1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
 	return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..8b97a6cba8d1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"	\
-			 " %snxe root %u %s%c",	__entry->mmu_valid_gen,	\
+	trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s"	\
+			 " %snxe %sad root %u %s%c",			\
+			 __entry->mmu_valid_gen,			\
 			 __entry->gfn, role.level,			\
 			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
@@ -39,6 +40,7 @@
 			 access_str[role.access],			\
 			 role.invalid ? " invalid" : "",		\
 			 role.nxe ? "" : "!",				\
+			 role.ad_disabled ? "!" : "",			\
 			 __entry->root_count,				\
 			 __entry->unsync ? "unsync" : "sync", 0);	\
 	saved_ptr;							\
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 33460fcdeef9..905ea6052517 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -190,6 +190,7 @@ struct vcpu_svm {
 	struct nested_state nested;
 
 	bool nmi_singlestep;
+	u64 nmi_singlestep_guest_rflags;
 
 	unsigned int3_injected;
 	unsigned long int3_rip;
@@ -964,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
 	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+	svm->nmi_singlestep = false;
+	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+		/* Clear our flags if they were not set by the guest */
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+	}
+}
+
 /* Note:
  * This hash table is used to map VM_ID to a struct kvm_arch,
  * when handling AMD IOMMU GALOG notification to schedule in
@@ -1713,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
-	return to_svm(vcpu)->vmcb->save.rflags;
+	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned long rflags = svm->vmcb->save.rflags;
+
+	if (svm->nmi_singlestep) {
+		/* Hide our flags if they were not set by the guest */
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+			rflags &= ~X86_EFLAGS_TF;
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+			rflags &= ~X86_EFLAGS_RF;
+	}
+	return rflags;
 }
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	if (to_svm(vcpu)->nmi_singlestep)
+		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
        /*
         * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
@@ -2112,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm)
 	}
 
 	if (svm->nmi_singlestep) {
-		svm->nmi_singlestep = false;
-		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
-			svm->vmcb->save.rflags &=
-				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		disable_nmi_singlestep(svm);
 	}
 
 	if (svm->vcpu.guest_debug &
@@ -2370,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
-	if (!(svm->vcpu.arch.efer & EFER_SVME)
-	    || !is_paging(&svm->vcpu)) {
+	if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+	    !is_paging(&svm->vcpu)) {
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -2381,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 		return 1;
 	}
 
-       return 0;
+	return 0;
 }
 
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2534,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+	unsigned long dr6;
+
+	/* if we're not singlestepping, it's not ours */
+	if (!svm->nmi_singlestep)
+		return NESTED_EXIT_DONE;
+
+	/* if it's not a singlestep exception, it's not ours */
+	if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+		return NESTED_EXIT_DONE;
+	if (!(dr6 & DR6_BS))
+		return NESTED_EXIT_DONE;
+
+	/* if the guest is singlestepping, it should get the vmexit */
+	if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+		disable_nmi_singlestep(svm);
+		return NESTED_EXIT_DONE;
+	}
+
+	/* it's ours, the nested hypervisor must not see this one */
+	return NESTED_EXIT_HOST;
+}
+
 static int nested_svm_exit_special(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
@@ -2589,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	}
 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-		if (svm->nested.intercept_exceptions & excp_bits)
-			vmexit = NESTED_EXIT_DONE;
+		if (svm->nested.intercept_exceptions & excp_bits) {
+			if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+				vmexit = nested_svm_intercept_db(svm);
+			else
+				vmexit = NESTED_EXIT_DONE;
+		}
 		/* async page fault always cause vmexit */
 		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
 			 svm->apf_reason != 0)
@@ -4627,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	    == HF_NMI_MASK)
 		return; /* IRET will cause a vm exit */
 
+	if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+		return; /* STGI will cause a vm exit */
+
+	if (svm->nested.exit_required)
+		return; /* we're not going to run the guest yet */
+
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
+	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 }
@@ -4771,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->nested.exit_required))
 		return;
 
+	/*
+	 * Disable singlestep if we're injecting an interrupt/exception.
+	 * We don't want our modified rflags to be pushed on the stack where
+	 * we might not be able to easily reset them if we disabled NMI
+	 * singlestep later.
+	 */
+	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+		/*
+		 * Event injection happens before external interrupts cause a
+		 * vmexit and interrupts are disabled here, so smp_send_reschedule
+		 * is enough to force an immediate vmexit.
+		 */
+		disable_nmi_singlestep(svm);
+		smp_send_reschedule(vcpu->cpu);
+	}
+
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dcc4873e435..f76efad248ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -913,8 +913,9 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
-static u64 construct_eptp(unsigned long root_hpa);
+static u64 construct_eptp(struct kvm_vcpu *vcpu
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-07-06 18:38:31 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-07-06 18:38:31 -0700
commit	c136b84393d4e340e1b53fc7f737dd5827b19ee5 (patch)
tree	985a1bdfafe7ec5ce2d3c738f601cad3998d8ce9 /arch/x86/kvm
parent	e0f25a3f2d052e36ff67a9b4db835c3e27e950d8 (diff)
parent	1372324b328cd5dabaef5e345e37ad48c63df2a9 (diff)