diff options
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 15295 |
1 files changed, 15295 insertions, 0 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c new file mode 100644 index 000000000000..589230c923e2 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.c @@ -0,0 +1,15295 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/frame.h> +#include <linux/highmem.h> +#include <linux/hrtimer.h> +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mod_devicetable.h> +#include <linux/mm.h> +#include <linux/nospec.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/tboot.h> +#include <linux/trace_events.h> + +#include <asm/apic.h> +#include <asm/asm.h> +#include <asm/cpu.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/fpu/internal.h> +#include <asm/io.h> +#include <asm/irq_remapping.h> +#include <asm/kexec.h> +#include <asm/perf_event.h> +#include <asm/mce.h> +#include <asm/mmu_context.h> +#include <asm/mshyperv.h> +#include <asm/spec-ctrl.h> +#include <asm/virtext.h> +#include <asm/vmx.h> + +#include "cpuid.h" +#include "hyperv.h" +#include "irq.h" +#include "kvm_cache_regs.h" +#include "lapic.h" +#include "mmu.h" +#include "pmu.h" +#include "trace.h" +#include "vmx_evmcs.h" +#include "x86.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) +#define __ex_clear(x, reg) \ + ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + +static bool __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); + +static bool __read_mostly enable_vnmi = 1; +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); + +static bool __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); + +static bool __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); + +static bool __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); + +static bool __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + +static bool __read_mostly enable_apicv = 1; +module_param(enable_apicv, bool, S_IRUGO); + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); +/* + * If nested=1, nested virtualization is supported, i.e., guests may use + * VMX and be a hypervisor for its own guests. If nested=0, guests may not + * use VMX instructions. + */ +static bool __read_mostly nested = 1; +module_param(nested, bool, S_IRUGO); + +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + +static u64 __read_mostly host_xss; + +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define MSR_BITMAP_MODE_X2APIC 1 +#define MSR_BITMAP_MODE_X2APIC_APICV 2 + +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ + X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) + +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually smaller than 128 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; +module_param(ple_gap, uint, 0444); + +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, uint, 0444); + +/* Default doubles per-vcpu window every exit. */ +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, uint, 0444); + +/* Default resets per-vcpu window every exit to ple_window. */ +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, uint, 0444); + +/* Default is to compute the maximum so we can never overflow. */ +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, uint, 0444); + +extern const ulong vmx_return; +extern const ulong vmx_early_consistency_check_return; + +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + bool for_parse; +} vmentry_l1d_param[] = { + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; +}; + +#define NR_AUTOLOAD_MSRS 8 + +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + +struct vmcs { + struct vmcs_hdr hdr; + u32 abort; + char data[0]; +}; + +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + bool launched; + bool nmi_known_unmasked; + bool hv_timer_armed; + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; +}; + +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). + * + * IMPORTANT: Changing the layout of existing fields in this structure + * will break save/restore compatibility with older kvm releases. When + * adding new fields, either use space in the reserved padding* arrays + * or add the new fields to the end of the structure. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + struct vmcs_hdr hdr; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 vmread_bitmap; + u64 vmwrite_bitmap; + u64 vm_function_control; + u64 eptp_list_address; + u64 pml_address; + u64 padding64[3]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * unsigned long fields with no explict size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + u16 guest_pml_index; +}; + +/* + * For save/restore compatibility, the vmcs12 field offsets must not change. + */ +#define CHECK_OFFSET(field, loc) \ + BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ + "Offset of " #field " in struct vmcs12 has changed.") + +static inline void vmx_check_vmcs12_offsets(void) { + CHECK_OFFSET(hdr, 0); + CHECK_OFFSET(abort, 4); + CHECK_OFFSET(launch_state, 8); + CHECK_OFFSET(io_bitmap_a, 40); + CHECK_OFFSET(io_bitmap_b, 48); + CHECK_OFFSET(msr_bitmap, 56); + CHECK_OFFSET(vm_exit_msr_store_addr, 64); + CHECK_OFFSET(vm_exit_msr_load_addr, 72); + CHECK_OFFSET(vm_entry_msr_load_addr, 80); + CHECK_OFFSET(tsc_offset, 88); + CHECK_OFFSET(virtual_apic_page_addr, 96); + CHECK_OFFSET(apic_access_addr, 104); + CHECK_OFFSET(posted_intr_desc_addr, 112); + CHECK_OFFSET(ept_pointer, 120); + CHECK_OFFSET(eoi_exit_bitmap0, 128); + CHECK_OFFSET(eoi_exit_bitmap1, 136); + CHECK_OFFSET(eoi_exit_bitmap2, 144); + CHECK_OFFSET(eoi_exit_bitmap3, 152); + CHECK_OFFSET(xss_exit_bitmap, 160); + CHECK_OFFSET(guest_physical_address, 168); + CHECK_OFFSET(vmcs_link_pointer, 176); + CHECK_OFFSET(guest_ia32_debugctl, 184); + CHECK_OFFSET(guest_ia32_pat, 192); + CHECK_OFFSET(guest_ia32_efer, 200); + CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); + CHECK_OFFSET(guest_pdptr0, 216); + CHECK_OFFSET(guest_pdptr1, 224); + CHECK_OFFSET(guest_pdptr2, 232); + CHECK_OFFSET(guest_pdptr3, 240); + CHECK_OFFSET(guest_bndcfgs, 248); + CHECK_OFFSET(host_ia32_pat, 256); + CHECK_OFFSET(host_ia32_efer, 264); + CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); + CHECK_OFFSET(vmread_bitmap, 280); + CHECK_OFFSET(vmwrite_bitmap, 288); + CHECK_OFFSET(vm_function_control, 296); + CHECK_OFFSET(eptp_list_address, 304); + CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(cr0_guest_host_mask, 344); + CHECK_OFFSET(cr4_guest_host_mask, 352); + CHECK_OFFSET(cr0_read_shadow, 360); + CHECK_OFFSET(cr4_read_shadow, 368); + CHECK_OFFSET(cr3_target_value0, 376); + CHECK_OFFSET(cr3_target_value1, 384); + CHECK_OFFSET(cr3_target_value2, 392); + CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(exit_qualification, 408); + CHECK_OFFSET(guest_linear_address, 416); + CHECK_OFFSET(guest_cr0, 424); + CHECK_OFFSET(guest_cr3, 432); + CHECK_OFFSET(guest_cr4, 440); + CHECK_OFFSET(guest_es_base, 448); + CHECK_OFFSET(guest_cs_base, 456); + CHECK_OFFSET(guest_ss_base, 464); + CHECK_OFFSET(guest_ds_base, 472); + CHECK_OFFSET(guest_fs_base, 480); + CHECK_OFFSET(guest_gs_base, 488); + CHECK_OFFSET(guest_ldtr_base, 496); + CHECK_OFFSET(guest_tr_base, 504); + CHECK_OFFSET(guest_gdtr_base, 512); + CHECK_OFFSET(guest_idtr_base, 520); + CHECK_OFFSET(guest_dr7, 528); + CHECK_OFFSET(guest_rsp, 536); + CHECK_OFFSET(guest_rip, 544); + CHECK_OFFSET(guest_rflags, 552); + CHECK_OFFSET(guest_pending_dbg_exceptions, 560); + CHECK_OFFSET(guest_sysenter_esp, 568); + CHECK_OFFSET(guest_sysenter_eip, 576); + CHECK_OFFSET(host_cr0, 584); + CHECK_OFFSET(host_cr3, 592); + CHECK_OFFSET(host_cr4, 600); + CHECK_OFFSET(host_fs_base, 608); + CHECK_OFFSET(host_gs_base, 616); + CHECK_OFFSET(host_tr_base, 624); + CHECK_OFFSET(host_gdtr_base, 632); + CHECK_OFFSET(host_idtr_base, 640); + CHECK_OFFSET(host_ia32_sysenter_esp, 648); + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); + CHECK_OFFSET(page_fault_error_code_mask, 756); + CHECK_OFFSET(page_fault_error_code_match, 760); + CHECK_OFFSET(cr3_target_count, 764); + CHECK_OFFSET(vm_exit_controls, 768); + CHECK_OFFSET(vm_exit_msr_store_count, 772); + CHECK_OFFSET(vm_exit_msr_load_count, 776); + CHECK_OFFSET(vm_entry_controls, 780); + CHECK_OFFSET(vm_entry_msr_load_count, 784); + CHECK_OFFSET(vm_entry_intr_info_field, 788); + CHECK_OFFSET(vm_entry_exception_error_code, 792); + CHECK_OFFSET(vm_entry_instruction_len, 796); + CHECK_OFFSET(tpr_threshold, 800); + CHECK_OFFSET(secondary_vm_exec_control, 804); + CHECK_OFFSET(vm_instruction_error, 808); + CHECK_OFFSET(vm_exit_reason, 812); + CHECK_OFFSET(vm_exit_intr_info, 816); + CHECK_OFFSET(vm_exit_intr_error_code, 820); + CHECK_OFFSET(idt_vectoring_info_field, 824); + CHECK_OFFSET(idt_vectoring_error_code, 828); + CHECK_OFFSET(vm_exit_instruction_len, 832); + CHECK_OFFSET(vmx_instruction_info, 836); + CHECK_OFFSET(guest_es_limit, 840); + CHECK_OFFSET(guest_cs_limit, 844); + CHECK_OFFSET(guest_ss_limit, 848); + CHECK_OFFSET(guest_ds_limit, 852); + CHECK_OFFSET(guest_fs_limit, 856); + CHECK_OFFSET(guest_gs_limit, 860); + CHECK_OFFSET(guest_ldtr_limit, 864); + CHECK_OFFSET(guest_tr_limit, 868); + CHECK_OFFSET(guest_gdtr_limit, 872); + CHECK_OFFSET(guest_idtr_limit, 876); + CHECK_OFFSET(guest_es_ar_bytes, 880); + CHECK_OFFSET(guest_cs_ar_bytes, 884); + CHECK_OFFSET(guest_ss_ar_bytes, 888); + CHECK_OFFSET(guest_ds_ar_bytes, 892); + CHECK_OFFSET(guest_fs_ar_bytes, 896); + CHECK_OFFSET(guest_gs_ar_bytes, 900); + CHECK_OFFSET(guest_ldtr_ar_bytes, 904); + CHECK_OFFSET(guest_tr_ar_bytes, 908); + CHECK_OFFSET(guest_interruptibility_info, 912); + CHECK_OFFSET(guest_activity_state, 916); + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); + CHECK_OFFSET(guest_cs_selector, 966); + CHECK_OFFSET(guest_ss_selector, 968); + CHECK_OFFSET(guest_ds_selector, 970); + CHECK_OFFSET(guest_fs_selector, 972); + CHECK_OFFSET(guest_gs_selector, 974); + CHECK_OFFSET(guest_ldtr_selector, 976); + CHECK_OFFSET(guest_tr_selector, 978); + CHECK_OFFSET(guest_intr_status, 980); + CHECK_OFFSET(host_es_selector, 982); + CHECK_OFFSET(host_cs_selector, 984); + CHECK_OFFSET(host_ss_selector, 986); + CHECK_OFFSET(host_ds_selector, 988); + CHECK_OFFSET(host_fs_selector, 990); + CHECK_OFFSET(host_gs_selector, 992); + CHECK_OFFSET(host_tr_selector, 994); + CHECK_OFFSET(guest_pml_index, 996); +} + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * + * IMPORTANT: Changing this value will break save/restore compatibility with + * older kvm releases. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 + +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + bool pml_full; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMCLEAR and VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. + */ + bool need_vmcs12_sync; + bool dirty_vmcs12; + + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + + bool change_vmcs01_virtual_apic_mode; + + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. + */ + bool enlightened_vmcs_enabled; + + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + + struct loaded_vmcs vmcs02; + + /* + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; + + u16 vpid02; + u16 last_vpid; + + struct nested_vmx_msrs msrs; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; + + gpa_t hv_evmcs_vmptr; + struct page *hv_evmcs_page; + struct hv_enlightened_vmcs *hv_evmcs; +}; + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + unsigned long host_rsp; + u8 fail; + u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; + struct shared_msr_entry *guest_msrs; + int nmsrs; + int save_nmsrs; + bool guest_msrs_dirty; + unsigned long host_idt_base; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + + u64 arch_capabilities; + u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + u32 secondary_exec_control; + + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; + bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { + struct vmx_msrs guest; + struct vmx_msrs host; + } msr_autoload; + + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + u32 exit_reason; + + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; + + bool req_immediate_exit; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + + u64 current_tsc_ratio; + + u32 host_pkru; + + unsigned long host_debugctlmsr; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + + +static u16 shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x) x, +#include "vmx_shadow_fields.h" +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static u16 shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x) x, +#include "vmx_shadow_fields.h" +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +static const unsigned short vmcs_field_to_offset_table[] = { + FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), + FIELD(GUEST_ES_SELECTOR, guest_es_selector), + FIELD(GUEST_CS_SELECTOR, guest_cs_selector), + FIELD(GUEST_SS_SELECTOR, guest_ss_selector), + FIELD(GUEST_DS_SELECTOR, guest_ds_selector), + FIELD(GUEST_FS_SELECTOR, guest_fs_selector), + FIELD(GUEST_GS_SELECTOR, guest_gs_selector), + FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), + FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), + FIELD(HOST_ES_SELECTOR, host_es_selector), + FIELD(HOST_CS_SELECTOR, host_cs_selector), + FIELD(HOST_SS_SELECTOR, host_ss_selector), + FIELD(HOST_DS_SELECTOR, host_ds_selector), + FIELD(HOST_FS_SELECTOR, host_fs_selector), + FIELD(HOST_GS_SELECTOR, host_gs_selector), + FIELD(HOST_TR_SELECTOR, host_tr_selector), |