diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 33 | ||||
-rw-r--r-- | arch/x86/xen/Makefile | 16 | ||||
-rw-r--r-- | arch/x86/xen/efi.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 1833 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_hvm.c | 214 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_pv.c | 1513 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_pvh.c | 106 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 2788 | ||||
-rw-r--r-- | arch/x86/xen/mmu_hvm.c | 79 | ||||
-rw-r--r-- | arch/x86/xen/mmu_pv.c | 2730 | ||||
-rw-r--r-- | arch/x86/xen/pmu.h | 5 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 517 | ||||
-rw-r--r-- | arch/x86/xen/smp.h | 16 | ||||
-rw-r--r-- | arch/x86/xen/smp_hvm.c | 63 | ||||
-rw-r--r-- | arch/x86/xen/smp_pv.c | 490 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 54 | ||||
-rw-r--r-- | arch/x86/xen/suspend_hvm.c | 22 | ||||
-rw-r--r-- | arch/x86/xen/suspend_pv.c | 46 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 8 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 4 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 22 |
21 files changed, 5397 insertions, 5164 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 76b6dbd627df..027987638e98 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,6 @@ config XEN bool "Xen guest support" depends on PARAVIRT select PARAVIRT_CLOCK - select XEN_HAVE_PVMMU - select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -15,18 +13,41 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. -config XEN_DOM0 +config XEN_PV + bool "Xen PV guest support" + default y + depends on XEN + select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU + help + Support running as a Xen PV guest. + +config XEN_PV_SMP def_bool y - depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on XEN_PV && SMP + +config XEN_DOM0 + bool "Xen PV Dom0 support" + default y + depends on XEN_PV && PCI_XEN && SWIOTLB_XEN depends on X86_IO_APIC && ACPI && PCI + help + Support running as a Xen PV Dom0 guest. config XEN_PVHVM - def_bool y + bool "Xen PVHVM guest support" + default y depends on XEN && PCI && X86_LOCAL_APIC + help + Support running as a Xen PVHVM guest. + +config XEN_PVHVM_SMP + def_bool y + depends on XEN_PVHVM && SMP config XEN_512GB bool "Limit Xen pv-domain memory to 512GB" - depends on XEN && X86_64 + depends on XEN_PV && X86_64 default y help Limit paravirtualized user domains to 512GB of RAM. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index cb0164aee156..fffb0a16f9e3 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -7,17 +7,23 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_enlighten.o := $(nostackp) -CFLAGS_mmu.o := $(nostackp) +CFLAGS_enlighten_pv.o := $(nostackp) +CFLAGS_mmu_pv.o := $(nostackp) -obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ +obj-y := enlighten.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o pmu.o + grant-table.o suspend.o platform-pci-unplug.o + +obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o +obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \ + p2m.o enlighten_pv.o mmu_pv.o +obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o +obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 3be012115853..30bb2e80cfe7 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -81,7 +81,7 @@ static const struct efi efi_xen __initconst = { .update_capsule = xen_efi_update_capsule, .query_capsule_caps = xen_efi_query_capsule_caps, .get_next_high_mono_count = xen_efi_get_next_high_mono_count, - .reset_system = NULL, /* Functionality provided by Xen. */ + .reset_system = xen_efi_reset_system, .set_virtual_address_map = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30822e8e64ac..a5ffcbb20cc0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,95 +1,16 @@ -/* - * Core of Xen paravirt_ops implementation. - * - * This file contains the xen_paravirt_ops structure itself, and the - * implementations for: - * - privileged instructions - * - interrupt flags - * - segment operations - * - booting and setup - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - #include <linux/cpu.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/preempt.h> -#include <linux/hardirq.h> -#include <linux/percpu.h> -#include <linux/delay.h> -#include <linux/start_kernel.h> -#include <linux/sched.h> -#include <linux/kprobes.h> -#include <linux/bootmem.h> -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/highmem.h> -#include <linux/console.h> -#include <linux/pci.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/edd.h> -#include <linux/frame.h> - #include <linux/kexec.h> -#include <xen/xen.h> -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/version.h> -#include <xen/interface/physdev.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/memory.h> -#include <xen/interface/nmi.h> -#include <xen/interface/xen-mca.h> -#include <xen/interface/hvm/start_info.h> #include <xen/features.h> #include <xen/page.h> -#include <xen/hvm.h> -#include <xen/hvc-console.h> -#include <xen/acpi.h> -#include <asm/paravirt.h> -#include <asm/apic.h> -#include <asm/page.h> -#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -#include <asm/xen/cpuid.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/msr-index.h> -#include <asm/traps.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/reboot.h> -#include <asm/stackprotector.h> -#include <asm/hypervisor.h> -#include <asm/mach_traps.h> -#include <asm/mwait.h> -#include <asm/pci_x86.h> #include <asm/cpu.h> #include <asm/e820/api.h> -#ifdef CONFIG_ACPI -#include <linux/acpi.h> -#include <asm/acpi.h> -#include <acpi/pdc_intel.h> -#include <acpi/processor.h> -#include <xen/interface/platform.h> -#endif - #include "xen-ops.h" -#include "mmu.h" #include "smp.h" -#include "multicalls.h" #include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -136,13 +57,8 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; -void *xen_initial_gdt; - -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - -static int xen_cpu_up_prepare(unsigned int cpu); -static int xen_cpu_up_online(unsigned int cpu); -static int xen_cpu_dead(unsigned int cpu); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. We map the real shared_info @@ -163,34 +79,32 @@ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +int xen_have_vcpu_info_placement = 1; -struct tls_descs { - struct desc_struct desc[3]; -}; +static int xen_cpu_up_online(unsigned int cpu) +{ + xen_init_lock_cpu(cpu); + return 0; +} -/* - * Updating the 3 TLS descriptors in the GDT on every task switch is - * surprisingly expensive so we avoid updating them if they haven't - * changed. Since Xen writes different descriptors than the one - * passed in the update_descriptor hypercall we keep shadow copies to - * compare against. - */ -static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), + int (*cpu_dead_cb)(unsigned int)) +{ + int rc; -#ifdef CONFIG_XEN_PVH -/* - * PVH variables. - * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. - */ -bool xen_pvh __attribute__((section(".data"))) = 0; -struct boot_params pvh_bootparams __attribute__((section(".data"))); + rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, + "x86/xen/hvm_guest:prepare", + cpu_up_prepare_cb, cpu_dead_cb); + if (rc >= 0) { + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/xen/hvm_guest:online", + xen_cpu_up_online, NULL); + if (rc < 0) + cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); + } -struct hvm_start_info pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); -#endif + return rc >= 0 ? 0 : rc; +} static void clamp_max_cpus(void) { @@ -227,7 +141,7 @@ void xen_vcpu_setup(int cpu) per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!have_vcpu_info_placement) { + if (!xen_have_vcpu_info_placement) { if (cpu >= MAX_VIRT_CPUS) clamp_max_cpus(); return; @@ -250,7 +164,7 @@ void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - have_vcpu_info_placement = 0; + xen_have_vcpu_info_placement = 0; clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if @@ -259,1043 +173,7 @@ void xen_vcpu_setup(int cpu) } } -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... - */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel %son %s\n", - xen_feature(XENFEAT_auto_translated_physmap) ? - "with PVH extensions " : "", pv_info.name); - printk(KERN_INFO "Xen version: %d.%d%s%s\n", - version >> 16, version & 0xffff, extra.extraversion, - xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); -} -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - -#define CPUID_THERM_POWER_LEAF 6 -#define APERFMPERF_PRESENT 0 - -static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; -static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; - -static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; -static __read_mostly unsigned int cpuid_leaf5_ecx_val; -static __read_mostly unsigned int cpuid_leaf5_edx_val; - -static void xen_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - unsigned maskebx = ~0; - unsigned maskecx = ~0; - unsigned maskedx = ~0; - unsigned setecx = 0; - /* - * Mask out inconvenient features, to try and disable as many - * unsupported kernel subsystems as possible. - */ - switch (*ax) { - case 1: - maskecx = cpuid_leaf1_ecx_mask; - setecx = cpuid_leaf1_ecx_set_mask; - maskedx = cpuid_leaf1_edx_mask; - break; - - case CPUID_MWAIT_LEAF: - /* Synthesize the values.. */ - *ax = 0; - *bx = 0; - *cx = cpuid_leaf5_ecx_val; - *dx = cpuid_leaf5_edx_val; - return; - - case CPUID_THERM_POWER_LEAF: - /* Disabling APERFMPERF for kernel usage */ - maskecx = ~(1 << APERFMPERF_PRESENT); - break; - - case 0xb: - /* Suppress extended topology stuff */ - maskebx = 0; - break; - } - - asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*ax), - "=b" (*bx), - "=c" (*cx), - "=d" (*dx) - : "0" (*ax), "2" (*cx)); - - *bx &= maskebx; - *cx &= maskecx; - *cx |= setecx; - *dx &= maskedx; -} -STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ - -static bool __init xen_check_mwait(void) -{ -#ifdef CONFIG_ACPI - struct xen_platform_op op = { - .cmd = XENPF_set_processor_pminfo, - .u.set_pminfo.id = -1, - .u.set_pminfo.type = XEN_PM_PDC, - }; - uint32_t buf[3]; - unsigned int ax, bx, cx, dx; - unsigned int mwait_mask; - - /* We need to determine whether it is OK to expose the MWAIT - * capability to the kernel to harvest deeper than C3 states from ACPI - * _CST using the processor_harvest_xen.c module. For this to work, we - * need to gather the MWAIT_LEAF values (which the cstate.c code - * checks against). The hypervisor won't expose the MWAIT flag because - * it would break backwards compatibility; so we will find out directly - * from the hardware and hypercall. - */ - if (!xen_initial_domain()) - return false; - - /* - * When running under platform earlier than Xen4.2, do not expose - * mwait, to avoid the risk of loading native acpi pad driver - */ - if (!xen_running_on_version_or_later(4, 2)) - return false; - - ax = 1; - cx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - mwait_mask = (1 << (X86_FEATURE_EST % 32)) | - (1 << (X86_FEATURE_MWAIT % 32)); - - if ((cx & mwait_mask) != mwait_mask) - return false; - - /* We need to emulate the MWAIT_LEAF and for that we need both - * ecx and edx. The hypercall provides only partial information. - */ - - ax = CPUID_MWAIT_LEAF; - bx = 0; - cx = 0; - dx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, - * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. - */ - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); - - set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - - if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { - cpuid_leaf5_ecx_val = cx; - cpuid_leaf5_edx_val = dx; - } - return true; -#else - return false; -#endif -} -static void __init xen_init_cpuid_mask(void) -{ - unsigned int ax, bx, cx, dx; - unsigned int xsave_mask; - - cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ - - if (!xen_initial_domain()) - cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ - - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); - - ax = 1; - cx = 0; - cpuid(1, &ax, &bx, &cx, &dx); - - xsave_mask = - (1 << (X86_FEATURE_XSAVE % 32)) | - (1 << (X86_FEATURE_OSXSAVE % 32)); - - /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ - if ((cx & xsave_mask) != xsave_mask) - cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ - if (xen_check_mwait()) - cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); -} - -static void xen_set_debugreg(int reg, unsigned long val) -{ - HYPERVISOR_set_debugreg(reg, val); -} - -static unsigned long xen_get_debugreg(int reg) -{ - return HYPERVISOR_get_debugreg(reg); -} - -static void xen_end_context_switch(struct task_struct *next) -{ - xen_mc_flush(); - paravirt_end_context_switch(next); -} - -static unsigned long xen_store_tr(void) -{ - return 0; -} - -/* - * Set the page permissions for a particular virtual address. If the - * address is a vmalloc mapping (or other non-linear mapping), then - * find the linear mapping of the page and also set its protections to - * match. - */ -static void set_aliased_prot(void *v, pgprot_t prot) -{ - int level; - pte_t *ptep; - pte_t pte; - unsigned long pfn; - struct page *page; - unsigned char dummy; - - ptep = lookup_address((unsigned long)v, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - page = pfn_to_page(pfn); - - pte = pfn_pte(pfn, prot); - - /* - * Careful: update_va_mapping() will fail if the virtual address - * we're poking isn't populated in the page tables. We don't - * need to worry about the direct map (that's always in the page - * tables), but we need to be careful about vmap space. In - * particular, the top level page table can lazily propagate - * entries between processes, so if we've switched mms since we - * vmapped the target in the first place, we might not have the - * top-level page table entry populated. - * - * We disable preemption because we want the same mm active when - * we probe the target and when we issue the hypercall. We'll - * have the same nominal mm, but if we're a kernel thread, lazy - * mm dropping could change our pgd. - * - * Out of an abundance of caution, this uses __get_user() to fault - * in the target address just in case there's some obscure case - * in which the target address isn't readable. - */ - - preempt_disable(); - - probe_kernel_read(&dummy, v, 1); - - if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) - BUG(); - - if (!PageHighMem(page)) { - void *av = __va(PFN_PHYS(pfn)); - - if (av != v) - if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) - BUG(); - } else - kmap_flush_unused(); - - preempt_enable(); -} - -static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - /* - * We need to mark the all aliases of the LDT pages RO. We - * don't need to call vm_flush_aliases(), though, since that's - * only responsible for flushing aliases out the TLBs, not the - * page tables, and Xen will flush the TLB for us if needed. - * - * To avoid confusing future readers: none of this is necessary - * to load the LDT. The hypervisor only checks this when the - * LDT is faulted in due to subsequent descriptor access. - */ - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL_RO); -} - -static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL); -} - -static void xen_set_ldt(const void *addr, unsigned entries) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - trace_xen_cpu_set_ldt(addr, entries); - - op = mcs.args; - op->cmd = MMUEXT_SET_LDT; - op->arg1.linear_addr = (unsigned long)addr; - op->arg2.nr_ents = entries; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_load_gdt(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - int level; - pte_t *ptep; - unsigned long pfn, mfn; - void *virt; - - /* - * The GDT is per-cpu and is in the percpu data area. - * That can be virtually mapped, so we need to do a - * page-walk to get the underlying MFN for the - * hypercall. The page can also be in the kernel's - * linear range, so we need to RO that mapping too. - */ - ptep = lookup_address(va, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - mfn = pfn_to_mfn(pfn); - virt = __va(PFN_PHYS(pfn)); - - frames[f] = mfn; - - make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(virt); - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -/* - * load_gdt for early boot, when the gdt is only mapped once - */ -static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - pte_t pte; - unsigned long pfn, mfn; - - pfn = virt_to_pfn(va); - mfn = pfn_to_mfn(pfn); - - pte = pfn_pte(pfn, PAGE_KERNEL_RO); - - if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) - BUG(); - - frames[f] = mfn; - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -static inline bool desc_equal(const struct desc_struct *d1, - const struct desc_struct *d2) -{ - return d1->a == d2->a && d1->b == d2->b; -} - -static void load_TLS_descriptor(struct thread_struct *t, - unsigned int cpu, unsigned int i) -{ - struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; - struct desc_struct *gdt; - xmaddr_t maddr; - struct multicall_space mc; - - if (desc_equal(shadow, &t->tls_array[i])) - return; - - *shadow = t->tls_array[i]; - - gdt = get_cpu_gdt_rw(cpu); - maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - mc = __xen_mc_entry(0); - - MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); -} - -static void xen_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone - * and lazy gs handling is enabled, it means we're in a - * context switch, and %gs has just been saved. This means we - * can zero it out to prevent faults on exit from the - * hypervisor if the next process has no %gs. Either way, it - * has been saved, and the new value will get loaded properly. - * This will go away as soon as Xen has been modified to not - * save/restore %gs for normal hypercalls. - * - * On x86_64, this hack is not used for %gs, because gs points - * to KERNEL_GS_BASE (and uses it for PDA references), so we - * must not zero %gs on x86_64 - * - * For x86_64, we need to zero %fs, otherwise we may get an - * exception between the new %fs descriptor being loaded and - * %fs being effectively cleared at __switch_to(). - */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { -#ifdef CONFIG_X86_32 - lazy_load_gs(0); -#else - loadsegment(fs, 0); -#endif - } - - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -#ifdef CONFIG_X86_64 -static void xen_load_gs_index(unsigned int idx) -{ - if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) - BUG(); -} -#endif - -static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - const void *ptr) -{ - xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); - u64 entry = *(u64 *)ptr; - - trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); - - preempt_disable(); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) - BUG(); - - preempt_enable(); -} - -static int cvt_gate_to_trap(int vector, const gate_desc *val, - struct trap_info *info) -{ - unsigned long addr; - - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) - return 0; - - info->vector = vector; - - addr = gate_offset(*val); -#ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ - return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } -#endif /* CONFIG_X86_64 */ - info->address = addr; - - info->cs = gate_segment(*val); - info->flags = val->dpl; - /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) - info->flags |= 1 << 2; - - return 1; -} - -/* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct desc_ptr, idt_desc); - -/* Set an IDT entry. If the entry is part of the current IDT, then - also update Xen. */ -static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) -{ - unsigned long p = (unsigned long)&dt[entrynum]; - unsigned long start, end; - - trace_xen_cpu_write_idt_entry(dt, entrynum, g); - - preempt_disable(); - - start = __this_cpu_read(idt_desc.address); - end = start + __this_cpu_read(idt_desc.size) + 1; - - xen_mc_flush(); - - native_write_idt_entry(dt, entrynum, g); - - if (p >= start && (p + 8) <= end) { - struct trap_info info[2]; - - info[1].address = 0; - - if (cvt_gate_to_trap(entrynum, g, &info[0])) - if (HYPERVISOR_set_trap_table(info)) - BUG(); - } - - preempt_enable(); -} - -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) -{ - unsigned in, out, count; - - count = (desc->size+1) / sizeof(gate_desc); - BUG_ON(count > 256); - - for (in = out = 0; in < count; in++) { - gate_desc *entry = (gate_desc*)(desc->address) + in; - - if (cvt_gate_to_trap(in, entry, &traps[out])) - out++; - } - traps[out].address = 0; -} - -void xen_copy_trap_info(struct trap_info *traps) -{ - const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - - xen_convert_trap_info(desc, traps); -} - -/* Load a new IDT into Xen. In principle this can be per-CPU, so we - hold a spinlock to protect the static traps[] array (static because - it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct desc_ptr *desc) -{ - static DEFINE_SPINLOCK(lock); - static struct trap_info traps[257]; - - trace_xen_cpu_load_idt(desc); - - spin_lock(&lock); - - memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - - xen_convert_trap_info(desc, traps); - - xen_mc_flush(); - if (HYPERVISOR_set_trap_table(traps)) - BUG(); - - spin_unlock(&lock); -} - -/* Write a GDT descriptor entry. Ignore LDT descriptors, since - they're handled differently. */ -static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - preempt_disable(); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - BUG(); - } - - } - - preempt_enable(); -} - -/* - * Version of write_gdt_entry for use at early boot-time needed to - * update an entry as simply as possible. - */ -static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); - - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - dt[entry] = *(struct desc_struct *)desc; - } - - } -} - -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; -} - -void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -static void xen_io_delay(void) -{ -} - -static DEFINE_PER_CPU(unsigned long, xen_cr0_value); - -static unsigned long xen_read_cr0(void) -{ - unsigned long cr0 = this_cpu_read(xen_cr0_value); - - if (unlikel |