summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-hypervisor-pmu23
-rw-r--r--Documentation/kernel-parameters.txt7
-rw-r--r--arch/arm/include/asm/xen/events.h6
-rw-r--r--arch/arm/include/asm/xen/page.h16
-rw-r--r--arch/arm/xen/enlighten.c7
-rw-r--r--arch/arm64/include/asm/xen/events.h6
-rw-r--r--arch/x86/include/asm/xen/events.h11
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/interface.h219
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/xen/Kconfig21
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c6
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/mmu.c399
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c570
-rw-r--r--arch/x86/xen/pmu.h15
-rw-r--r--arch/x86/xen/setup.c496
-rw-r--r--arch/x86/xen/smp.c29
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h7
-rw-r--r--drivers/block/xen-blkfront.c146
-rw-r--r--drivers/xen/Kconfig11
-rw-r--r--drivers/xen/balloon.c6
-rw-r--r--drivers/xen/events/events_base.c6
-rw-r--r--drivers/xen/sys-hypervisor.c136
-rw-r--r--drivers/xen/xenfs/Makefile1
-rw-r--r--drivers/xen/xenfs/super.c3
-rw-r--r--drivers/xen/xenfs/xenfs.h1
-rw-r--r--drivers/xen/xenfs/xensyms.c152
-rw-r--r--include/asm-generic/early_ioremap.h2
-rw-r--r--include/asm-generic/fixmap.h3
-rw-r--r--include/xen/events.h1
-rw-r--r--include/xen/interface/platform.h18
-rw-r--r--include/xen/interface/xen.h37
-rw-r--r--include/xen/interface/xenpmu.h94
-rw-r--r--include/xen/page.h4
-rw-r--r--mm/early_ioremap.c12
42 files changed, 2229 insertions, 363 deletions
diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu
new file mode 100644
index 000000000000..224faa105e18
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-hypervisor-pmu
@@ -0,0 +1,23 @@
+What: /sys/hypervisor/pmu/pmu_mode
+Date: August 2015
+KernelVersion: 4.3
+Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+ Describes mode that Xen's performance-monitoring unit (PMU)
+ uses. Accepted values are
+ "off" -- PMU is disabled
+ "self" -- The guest can profile itself
+ "hv" -- The guest can profile itself and, if it is
+ privileged (e.g. dom0), the hypervisor
+ "all" -- The guest can profile itself, the hypervisor
+ and all other guests. Only available to
+ privileged guests.
+
+What: /sys/hypervisor/pmu/pmu_features
+Date: August 2015
+KernelVersion: 4.3
+Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+ Describes Xen PMU features (as an integer). A set bit indicates
+ that the corresponding feature is enabled. See
+ include/xen/interface/xenpmu.h for available features
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f0c950598582..22a4b687ea5b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -4106,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
plus one apbt timer for broadcast timer.
x86_intel_mid_timer=apbt_only | lapic_and_apbt
+ xen_512gb_limit [KNL,X86-64,XEN]
+ Restricts the kernel running paravirtualized under Xen
+ to use only up to 512 GB of RAM. The reason to do so is
+ crash analysis tools and Xen tools for doing domain
+ save/restore/migration must be enabled to handle larger
+ domains.
+
xen_emul_unplug= [HW,X86,XEN]
Unplug Xen emulated devices
Format: [unplug0,][unplug1]
diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h
index 8b1f37bfeeec..71e473d05fcc 100644
--- a/arch/arm/include/asm/xen/events.h
+++ b/arch/arm/include/asm/xen/events.h
@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
atomic64_t, \
counter), (val))
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return true;
+}
+
#endif /* _ASM_ARM_XEN_EVENTS_H */
diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h
index 1bee8ca12494..98b1084f8282 100644
--- a/arch/arm/include/asm/xen/page.h
+++ b/arch/arm/include/asm/xen/page.h
@@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
-static inline xmaddr_t phys_to_machine(xpaddr_t phys)
-{
- unsigned offset = phys.paddr & ~PAGE_MASK;
- return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
-}
-
-static inline xpaddr_t machine_to_phys(xmaddr_t machine)
-{
- unsigned offset = machine.maddr & ~PAGE_MASK;
- return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
-}
/* VIRT <-> MACHINE conversion */
-#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* Only used in PV code. But ARM guests are always HVM. */
static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
- /* TODO: assuming it is mapped in the kernel 1:1 */
- return virt_to_machine(vaddr);
+ BUG();
}
/* TODO: this shouldn't be here but it is because the frontend drivers
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 6c09cc440a2b..c50c8d33f874 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info;
unsigned long xen_released_pages;
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-/* TODO: to be removed */
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
-
-int xen_platform_pci_unplug = XEN_UNPLUG_ALL;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
-
static __read_mostly unsigned int xen_events_irq;
static __initdata struct device_node *xen_node;
diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h
index 86553213c132..4318866d053c 100644
--- a/arch/arm64/include/asm/xen/events.h
+++ b/arch/arm64/include/asm/xen/events.h
@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return true;
+}
+
#endif /* _ASM_ARM64_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 608a79d5a466..e6911caf5bbf 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
/* No need for a barrier -- XCHG is a barrier on x86. */
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ca08a27b90b3..83aea8055119 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
return _hypercall1(int, tmem_op, op);
}
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+ return _hypercall2(int, xenpmu_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 3400dbaec3c3..62ca03ef5c65 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
*
* Guest OS interface to x86 Xen.
*
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
*/
#ifndef _ASM_X86_XEN_INTERFACE_H
#define _ASM_X86_XEN_INTERFACE_H
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
* start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
*/
#define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
* The privilege level specifies which modes may enter a trap via a software
* interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
* privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it
+ * has been set to invalid e.g. due to the p2m being too large for the
+ * 3-level p2m tree. In this case the linear mapper p2m list anchored
+ * at p2m_vaddr is to be used.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value
+ * different from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+ * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+ * of a L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All
+ * entries in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0
+ * and a value with the least significant bit set indicates that a
+ * mapping update is in progress. This allows guest external software
+ * (e.g. in Dom0) to verify that read mappings are consistent and
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an
+ * atomic write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
};
#endif /* !__ASSEMBLY__ */
@@ -137,13 +197,31 @@ struct arch_shared_info {
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
*/
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST (1<<1)
-#define VGCF_IN_KERNEL (1<<2)
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /*
+ * Padding for adding new registers to xen_pmu_regs in
+ * the future
+ */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include
+ * MSR banks that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+
#endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c44a5d53e464..a3804fbe1f36 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,9 +35,7 @@ typedef struct xpaddr {
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES \
- ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 484145368a24..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,6 +7,7 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
+ select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_LOCAL_APIC && X86_TSC
help
@@ -23,14 +24,18 @@ config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
-config XEN_MAX_DOMAIN_MEMORY
- int
- default 500 if X86_64
- default 64 if X86_32
- depends on XEN
- help
- This only affects the sizing of some bss arrays, the unused
- portions of which are freed.
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN && X86_64
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls the
+ default setting of the kernel to use only up to 512 GB or more.
+ It is always possible to change the default via specifying the
+ boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 4b6e29ac0968..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
- p2m.o apic.o
+ p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
+#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
+ if (reg == APIC_LVTPC) {
+ (void)pmu_apic_update(reg);
+ return;
+ }
+
/* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d9cfa452da9d..30d12afe52ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,6 +84,7 @@
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
+#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4)
{
- cr4 &= ~X86_CR4_PGE;
- cr4 &= ~X86_CR4_PSE;
+ cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
default:
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
- .read_pmc = native_read_pmc,
+ .read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
- xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+ xen_start_info->nr_pages);
+ xen_reserve_special_pages();
/*
* Modify the cache mode translation tables to match Xen's PAT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..2c50b445884e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush();
}
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+ void *vaddr = __va(paddr);
+ void *vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+ make_lowmem_page_readwrite(vaddr);
+
+ memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+ unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+ if (unpin)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+ ClearPagePinned(virt_to_page(__va(pa)));
+ xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+ unsigned long va = vaddr & PMD_MASK;
+ unsigned long pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned int i;
+ bool unpin;
+
+ unpin = (vaddr == 2 * PGDIR_SIZE);
+ set_pgd(pgd, __pgd(0));
+ do {
+ pud = pud_page + pud_index(va);
+ if (pud_none(*pud)) {
+ va += PUD_SIZE;
+ } else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd = pmd_offset(pud, va);
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte = pte_offset_kernel(pmd, va);
+ set_pmd(pmd, __pmd(0));
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ xen_cleanmfnmap_free_pgtbl(pte, unpin);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+ }
+
+ } while (pud_index(va) || pmd_index(va));
+ xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
- /* We should be in __ka space. */
- BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
- /* We roundup to the PMD, which means that if anybody at this stage is
- * using the __ka address of xen_start_info or xen_start_info->shared_info
- * they are in going to crash. Fortunatly we have already revectored