Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                  |   8
-rw-r--r--  arch/x86/include/asm/kvm_host.h   |   3
-rw-r--r--  arch/x86/kernel/kvm.c             |   1
-rw-r--r--  arch/x86/kvm/cpuid.c              |  31
-rw-r--r--  arch/x86/kvm/debugfs.c            |  10
-rw-r--r--  arch/x86/kvm/emulate.c            |   8
-rw-r--r--  arch/x86/kvm/i8254.c              |   1
-rw-r--r--  arch/x86/kvm/svm/nested.c         |   2
-rw-r--r--  arch/x86/kvm/svm/svm.c            |   4
-rw-r--r--  arch/x86/kvm/vmx/nested.c         |  82
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c      |   2
-rw-r--r--  arch/x86/kvm/vmx/vmx.c            |  38
-rw-r--r--  arch/x86/kvm/vmx/vmx.h            |   2
-rw-r--r--  arch/x86/kvm/x86.c                | 139
14 files changed, 170 insertions(+), 161 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6ad579c7d4cd..d41812aba393 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -823,14 +823,6 @@ config PVH
This option enables the PVH entry point for guest virtual machines
as specified in the x86/HVM direct boot ABI.
-config KVM_DEBUG_FS
- bool "Enable debug information for KVM Guests in debugfs"
- depends on KVM_GUEST && DEBUG_FS
- ---help---
- This option enables collection of various statistics for KVM guest.
- Statistics are displayed in debugfs filesystem. Enabling this option
- may incur significant overhead.
-
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1da5858501ca..f8998e97457f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1306,7 +1306,6 @@ struct kvm_arch_async_pf {
extern u64 __read_mostly host_efer;
extern struct kvm_x86_ops kvm_x86_ops;
-extern struct kmem_cache *x86_fpu_cache;
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
@@ -1671,7 +1670,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap);
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d6f22a3a1f7d..7e6403a8d861 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -21,7 +21,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
-#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <asm/timer.h>
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 253b8e875ccd..8a294f9747aa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -181,17 +181,14 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = -E2BIG;
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
goto out;
- r = -ENOMEM;
if (cpuid->nent) {
- cpuid_entries =
- vmalloc(array_size(sizeof(struct kvm_cpuid_entry),
- cpuid->nent));
- if (!cpuid_entries)
- goto out;
- r = -EFAULT;
- if (copy_from_user(cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ cpuid_entries = vmemdup_user(entries,
+ array_size(sizeof(struct kvm_cpuid_entry),
+ cpuid->nent));
+ if (IS_ERR(cpuid_entries)) {
+ r = PTR_ERR(cpuid_entries);
goto out;
+ }
}
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
@@ -211,8 +208,8 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
kvm_x86_ops.cpuid_update(vcpu);
r = kvm_update_cpuid(vcpu);
+ kvfree(cpuid_entries);
out:
- vfree(cpuid_entries);
return r;
}
@@ -325,7 +322,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_ECX,
- F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
@@ -334,6 +331,13 @@ void kvm_set_cpu_caps(void)
if (cpuid_ecx(7) & F(LA57))
kvm_cpu_cap_set(X86_FEATURE_LA57);
+ /*
+ * PKU not yet implemented for shadow paging and requires OSPKE
+ * to be set on the host. Clear it if that is not the case
+ */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ kvm_cpu_cap_clear(X86_FEATURE_PKU);
+
kvm_cpu_cap_mask(CPUID_7_EDX,
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
@@ -426,7 +430,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
struct kvm_cpuid_array {
struct kvm_cpuid_entry2 *entries;
- const int maxnent;
+ int maxnent;
int nent;
};
@@ -870,7 +874,6 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_array array = {
.nent = 0,
- .maxnent = cpuid->nent,
};
int r, i;
@@ -887,6 +890,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
if (!array.entries)
return -ENOMEM;
+ array.maxnent = cpuid->nent;
+
for (i = 0; i < ARRAY_SIZE(funcs); i++) {
r = get_cpuid_func(&array, funcs[i], type);
if (r)
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 018aebce33ff..7e818d64bb4d 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -43,22 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
-void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
{
- debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu,
+ debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
&vcpu_tsc_offset_fops);
if (lapic_in_kernel(vcpu))
debugfs_create_file("lapic_timer_advance_ns", 0444,
- vcpu->debugfs_dentry, vcpu,
+ debugfs_dentry, vcpu,
&vcpu_timer_advance_ns_fops);
if (kvm_has_tsc_control) {
debugfs_create_file("tsc-scaling-ratio", 0444,
- vcpu->debugfs_dentry, vcpu,
+ debugfs_dentry, vcpu,
&vcpu_tsc_scaling_fops);
debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
- vcpu->debugfs_dentry, vcpu,
+ debugfs_dentry, vcpu,
&vcpu_tsc_scaling_frac_fops);
}
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de5476f8683e..d0e2825ae617 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4800,8 +4800,12 @@ static const struct opcode twobyte_table[256] = {
GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
N, N, N, N, N, N,
- D(ImplicitOps | ModRM | SrcMem | NoAccess),
- N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
/* 0x20 - 0x2F */
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index febca334c320..a6e218c6140d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -462,7 +462,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
if (channel == 3) {
/* Read-Back Command. */
for (channel = 0; channel < 3; channel++) {
- s = &pit_state->channels[channel];
if (val & (2 << channel)) {
if (!(val & 0x20))
pit_latch_count(pit, channel);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8a6db11dcb43..6bceafb19108 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -258,7 +258,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
- is_intercept(svm, SVM_EXIT_VINTR)) {
+ is_intercept(svm, INTERCEPT_VINTR)) {
/*
* In order to request an interrupt window, L0 is usurping
* svm->vmcb->control.int_ctl and possibly setting V_IRQ
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9e333b91ff78..c8f5e87615d5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1378,6 +1378,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
if (is_guest_mode(&svm->vcpu)) {
+ svm->nested.hsave->control.int_ctl &= mask;
+
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
@@ -1999,7 +2001,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
*/
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
- if (is_intercept(svm, SVM_EXIT_VINTR))
+ if (is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
enable_gif(svm);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 9c74a732b08d..adb11b504d5c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4624,19 +4624,24 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
}
}
-static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
+ int *ret)
{
gva_t gva;
struct x86_exception e;
+ int r;
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmcs_read32(VMX_INSTRUCTION_INFO), false,
- sizeof(*vmpointer), &gva))
- return 1;
+ sizeof(*vmpointer), &gva)) {
+ *ret = 1;
+ return -EINVAL;
+ }
- if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
+ r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
+ if (r != X86EMUL_CONTINUE) {
+ *ret = vmx_handle_memory_failure(vcpu, r, &e);
+ return -EINVAL;
}
return 0;
@@ -4764,8 +4769,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
+ return ret;
/*
* SDM 3: 24.11.5
@@ -4838,12 +4843,13 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
u32 zero = 0;
gpa_t vmptr;
u64 evmcs_gpa;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
@@ -4902,7 +4908,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
u64 value;
gva_t gva = 0;
short offset;
- int len;
+ int len, r;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4943,10 +4949,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
instr_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
}
return nested_vmx_succeed(vcpu);
@@ -4987,7 +4992,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
unsigned long field;
short offset;
gva_t gva;
- int len;
+ int len, r;
/*
* The value to write might be 32 or 64 bits, depending on L1's long
@@ -5017,10 +5022,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
instr_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
}
field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
@@ -5103,12 +5107,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
@@ -5170,6 +5175,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
struct x86_exception e;
gva_t gva;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -5181,11 +5187,11 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
true, sizeof(gpa_t), &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
- sizeof(gpa_t), &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
+
return nested_vmx_succeed(vcpu);
}
@@ -5209,7 +5215,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
struct {
u64 eptp, gpa;
} operand;
- int i;
+ int i, r;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
@@ -5236,10 +5242,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
/*
* Nested EPT roots are always held through guest_mmu,
@@ -5291,6 +5296,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
u64 gla;
} operand;
u16 vpid02;
+ int r;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -5318,10 +5324,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
+
if (operand.vpid >> 16)
return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
@@ -5666,7 +5672,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
u32 intr_info;
- switch (exit_reason) {
+ switch ((u16)exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5727,7 +5733,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 intr_info;
- switch (exit_reason) {
+ switch ((u16)exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index d33d890b605f..bdcce65c7a1d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -181,7 +181,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = pmu->version > 1;
break;
case MSR_IA32_PERF_CAPABILITIES:
- ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM);
+ ret = 1;
break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 170cc76a581f..08e26a9518c2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1600,6 +1600,32 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+ * while handling a VMX instruction KVM could've handled the request
+ * correctly by exiting to userspace and performing I/O but there
+ * doesn't seem to be a real use-case behind such requests, just return
+ * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return 0;
+}
/*
* Recognizes a pending MTF VM-exit and records the nested state for later
@@ -5486,6 +5512,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
u64 pcid;
u64 gla;
} operand;
+ int r;
if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5508,10 +5535,9 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_emulated_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return vmx_handle_memory_failure(vcpu, r, &e);
if (operand.pcid >> 12 != 0) {
kvm_inject_gp(vcpu, 0);
@@ -7282,10 +7308,6 @@ static __init void vmx_set_cpu_caps(void)
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
- /* PKU is not yet implemented for shadow paging. */
- if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
- kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
-
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 672c28f17e49..8a83b5edc820 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -355,6 +355,8 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
+int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e);
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9e41b5135340..00c88c2f34e4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -239,8 +239,7 @@ u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);
-struct kmem_cache *x86_fpu_cache;
-EXPORT_SYMBOL_GPL(x86_fpu_cache);
+static struct kmem_cache *x86_fpu_cache;
static struct kmem_cache *x86_emulator_cache;
@@ -5647,13 +5646,6 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
- /*
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
- * is returned, but our callers are not ready for that and they blindly
- * call kvm_inject_page_fault. Ensure that they at least do not leak
- * uninitialized kernel stack memory into cr2 and error code.
- */
- memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -7018,7 +7010,7 @@ restart:
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r && ctxt->tf)
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops.update_emulated_instruction)
kvm_x86_ops.update_emulated_instruction(vcpu);
@@ -8277,9 +8269,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- bool blockable)
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
{
unsigned long apic_address;
@@ -8290,8 +8281,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-
- return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -9962,13 +9951,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
if (!slot || !slot->npages)
return 0;
- /*
- * Stuff a non-canonical value to catch use-after-delete. This
- * ends up being 0 on 32-bit KVM, but there's no better
- * alternative.
- */
- hva = (unsigned long)(0xdeadull << 48);
old_npages = slot->npages;
+ hva = 0;
}
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -10140,43 +10124,65 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
- struct kvm_memory_slot *new)
+ struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- /* Still write protect RO slot */
- if (new->flags & KVM_MEM_READONLY) {
- kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
+ /*
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
+ * See comments below.
+ */
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
return;
- }
/*
- * Call kvm_x86_ops dirty logging hooks when they are valid.
- *
- * kvm_x86_ops.slot_disable_log_dirty is called when:
- *
- * - KVM_MR_CREATE with dirty logging is disabled
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
- *
- * The reason is, in case of PML, we need to set D-bit for any slots
- * with dirty logging disabled in order to eliminate unnecessary GPA
- * logging in PML buffer (and potential PML buffer full VMEXIT). This
- * guarantees leaving PML enabled during guest's lifetime won't have
- * any additional overhead from PML when guest is running with dirty
- * logging disabled for memory slots.
+ * Dirty logging tracks sptes in 4k granularity, meaning that large
+ * sptes have to be split. If live migration is successful, the guest
+ * in the source machine will be destroyed and large sptes will be
+ * created in the destination. However, if the guest continues to run
+ * in the source machine (for example if live migration fails), small
+ * sptes will remain around and cause bad performance.
*
- * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot
- * to dirty logging mode.
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
*
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ * There is no need to do this in any of the following cases:
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot()
+ */
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+
+ /*
+ * Enable or disable dirty logging for the slot.
*
- * In case of write protect:
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
+ * slot have been zapped so no dirty logging updates are needed for
+ * the old slot.
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
+ * any mappings that might be created in it will consume the
+ * properties of the new slot and do not need to be updated here.
*
- * Write protect all pages for dirty logging.
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
+ * called to enable/disable dirty logging.
*
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
+ * When disabling dirty logging with PML enabled, the D-bit is set
+ * for sptes in the slot in order to prevent unnecessary GPA
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
+ * This guarantees leaving PML enabled for the guest's lifetime
+ * won't have any additional overhead from PML when the guest is
+ * running with dirty logging disabled.
*
+ * When enabling dirty logging, large sptes are write-protected
+ * so they can be split on first write. New large sptes cannot
+ * be created for this slot until the end of the logging.
* See the comments in fast_page_fault().
+ * For small sptes, nothing is done if the dirty log is in the
+ * initial-all-set state. Otherwise, depending on whether pml
+ * is enabled the D-bit or the W-bit will be cleared.
*/
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (kvm_x86_ops.slot_enable_log_dirty) {
@@ -10213,39 +10219,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_mmu_calculate_default_mmu_pages(kvm));
/*
- * Dirty logging tracks sptes in 4k granularity, meaning that large
- * sptes have to be split. If live migration is successful, the guest
- * in the source machine will be destroyed and large sptes will be
- * created in the destination. However, if the guest continues to run
- * in the source machine (for example if live migration fails), small
- * sptes will remain around and cause bad performance.
- *
- * Scan sptes if dirty logging has been stopped, dropping those
- * which can be collapsed into a single large-page spte. Later
- * page faults will create the large-page sptes.
- *
- * There is no need to do this in any of the following cases:
- * CREATE: No dirty mappings will already exist.
- * MOVE/DELETE: The old mappings will already have been cleaned up by
- * kvm_arch_flush_shadow_memslot()
- */
- if (change == KVM_MR_FLAGS_ONLY &&
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
- kvm_mmu_zap_collapsible_sptes(kvm, new);
-
- /*
- * Set up write protection and/or dirty logging for the new slot.
- *
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
- * been zapped so no dirty logging staff is needed for old slot. For
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
- * new and it's also covered when dealing with the new slot.
- *
* FIXME: const-ify all uses of struct kvm_memory_slot.
*/
- if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
/* Free the arrays associated with the old memslot. */
if (change == KVM_MR_MOVE)
@@ -10530,7 +10506,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
return kvm_arch_interrupt_allowed(vcpu);
}
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
@@ -10547,6 +10523,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
+ return true;
} else {
/*
* It is not possible to deliver a paravirtualized asynchronous
@@ -10557,6 +10534,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
* fault is retried, hopefully the page will be ready in the host.
*/
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return false;
}
}
@@ -10574,7 +10552,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
- if (kvm_pv_async_pf_enabled(vcpu) &&
+ if ((work->wakeup_all || work->notpresent_injected) &&
+ kvm_pv_async_pf_enabled(vcpu) &&
!apf_put_user_ready(vcpu, work->arch.token)) {
vcpu->arch.apf.pageready_pending = true;
kvm_apic_set_irq(vcpu, &irq, NULL);