powerpc/kexec: Move kexec files into a dedicated subdir.

arch/powerpc/kernel/ contains 8 files dedicated to kexec. Move them into a dedicated subdirectory. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> [mpe: Move to a/p/kexec, drop the 'machine' naming and use 'core' instead] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/afbef97ec6a978574a5cf91a4441000e0a9da42a.1572351221.git.christophe.leroy@c-s.fr
author: Christophe Leroy <christophe.leroy@c-s.fr> 2019-10-29 12:13:58 +0000
committer: Michael Ellerman <mpe@ellerman.id.au> 2019-11-21 15:41:34 +1100
commit: 793b08e2efff3ec020c5c5861d00ed394fcdd488 (patch)
tree: 0a0ccae4f55def043a319f6b1654d558d0254a53 /arch/powerpc/kexec
parent: 9f7bd9201521b3ad11e96887550dd3e835ba01cb (diff)
9 files changed, 2263 insertions, 0 deletions
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
new file mode 100644
index 000000000000..16c1c5a19519
--- /dev/null
+++ b/arch/powerpc/kexec/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux kernel.
+#
+
+# Disable clang warning for using setjmp without setjmp.h header
+CFLAGS_crash.o		+= $(call cc-disable-warning, builtin-requires-header)
+
+obj-y				+= core.o crash.o core_$(BITS).o
+
+obj-$(CONFIG_PPC32)		+= relocate_32.o
+
+obj-$(CONFIG_KEXEC_FILE)	+= file_load.o elf_$(BITS).o
+
+ifdef CONFIG_HAVE_IMA_KEXEC
+ifdef CONFIG_IMA
+obj-y				+= ima.o
+endif
+endif
+
+
+# Disable GCOV, KCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_core_$(BITS).o := n
+KCOV_INSTRUMENT_core_$(BITS).o := n
+UBSAN_SANITIZE_core_$(BITS).o := n
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
new file mode 100644
index 000000000000..078fe3d76feb
--- /dev/null
+++ b/arch/powerpc/kexec/core.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Code to handle transition of Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/reboot.h>
+#include <linux/threads.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/irq.h>
+#include <linux/ftrace.h>
+
+#include <asm/kdump.h>
+#include <asm/machdep.h>
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/sections.h>
+
+void machine_kexec_mask_interrupts(void) {
+	unsigned int i;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(i, desc) {
+		struct irq_chip *chip;
+
+		chip = irq_desc_get_chip(desc);
+		if (!chip)
+			continue;
+
+		if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+			chip->irq_eoi(&desc->irq_data);
+
+		if (chip->irq_mask)
+			chip->irq_mask(&desc->irq_data);
+
+		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+			chip->irq_disable(&desc->irq_data);
+	}
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	default_machine_crash_shutdown(regs);
+}
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	if (ppc_md.machine_kexec_prepare)
+		return ppc_md.machine_kexec_prepare(image);
+	else
+		return default_machine_kexec_prepare(image);
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(node_data);
+	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+	VMCOREINFO_SYMBOL(vmemmap_list);
+	VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+	VMCOREINFO_SYMBOL(mmu_psize_defs);
+	VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+	VMCOREINFO_OFFSET(vmemmap_backing, list);
+	VMCOREINFO_OFFSET(vmemmap_backing, phys);
+	VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+	VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+	VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
+	vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+	int save_ftrace_enabled;
+
+	save_ftrace_enabled = __ftrace_enabled_save();
+	this_cpu_disable_ftrace();
+
+	if (ppc_md.machine_kexec)
+		ppc_md.machine_kexec(image);
+	else
+		default_machine_kexec(image);
+
+	this_cpu_enable_ftrace();
+	__ftrace_enabled_restore(save_ftrace_enabled);
+
+	/* Fall back to normal restart if we're still alive. */
+	machine_restart(NULL);
+	for(;;);
+}
+
+void __init reserve_crashkernel(void)
+{
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	/* use common parsing */
+	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		crashk_res.start = crash_base;
+		crashk_res.end = crash_base + crash_size - 1;
+	}
+
+	if (crashk_res.end == crashk_res.start) {
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+
+	/* We might have got these values via the command line or the
+	 * device tree, either way sanitise them now. */
+
+	crash_size = resource_size(&crashk_res);
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+	if (crashk_res.start != KDUMP_KERNELBASE)
+		printk("Crash kernel location must be 0x%x\n",
+				KDUMP_KERNELBASE);
+
+	crashk_res.start = KDUMP_KERNELBASE;
+#else
+	if (!crashk_res.start) {
+#ifdef CONFIG_PPC64
+		/*
+		 * On 64bit we split the RMO in half but cap it at half of
+		 * a small SLB (128MB) since the crash kernel needs to place
+		 * itself and some stacks to be in the first segment.
+		 */
+		crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+#else
+		crashk_res.start = KDUMP_KERNELBASE;
+#endif
+	}
+
+	crash_base = PAGE_ALIGN(crashk_res.start);
+	if (crash_base != crashk_res.start) {
+		printk("Crash kernel base must be aligned to 0x%lx\n",
+				PAGE_SIZE);
+		crashk_res.start = crash_base;
+	}
+
+#endif
+	crash_size = PAGE_ALIGN(crash_size);
+	crashk_res.end = crashk_res.start + crash_size - 1;
+
+	/* The crash region must not overlap the current kernel */
+	if (overlaps_crashkernel(__pa(_stext), _end - _stext)) {
+		printk(KERN_WARNING
+			"Crash kernel can not overlap current kernel\n");
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+
+	/* Crash kernel trumps memory limit */
+	if (memory_limit && memory_limit <= crashk_res.end) {
+		memory_limit = crashk_res.end + 1;
+		printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
+		       memory_limit);
+	}
+
+	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+			"for crashkernel (System RAM: %ldMB)\n",
+			(unsigned long)(crash_size >> 20),
+			(unsigned long)(crashk_res.start >> 20),
+			(unsigned long)(memblock_phys_mem_size() >> 20));
+
+	if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
+	    memblock_reserve(crashk_res.start, crash_size)) {
+		pr_err("Failed to reserve memory for crashkernel!\n");
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+}
+
+int overlaps_crashkernel(unsigned long start, unsigned long size)
+{
+	return (start + size) > crashk_res.start && start <= crashk_res.end;
+}
+
+/* Values we need to export to the second kernel via the device tree. */
+static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
+static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
+
+static struct property kernel_end_prop = {
+	.name = "linux,kernel-end",
+	.length = sizeof(phys_addr_t),
+	.value = &kernel_end,
+};
+
+static struct property crashk_base_prop = {
+	.name = "linux,crashkernel-base",
+	.length = sizeof(phys_addr_t),
+	.value = &crashk_base
+};
+
+static struct property crashk_size_prop = {
+	.name = "linux,crashkernel-size",
+	.length = sizeof(phys_addr_t),
+	.value = &crashk_size,
+};
+
+static struct property memory_limit_prop = {
+	.name = "linux,memory-limit",
+	.length = sizeof(unsigned long long),
+	.value = &mem_limit,
+};
+
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+
+static void __init export_crashk_values(struct device_node *node)
+{
+	/* There might be existing crash kernel properties, but we can't
+	 * be sure what's in them, so remove them. */
+	of_remove_property(node, of_find_property(node,
+				"linux,crashkernel-base", NULL));
+	of_remove_property(node, of_find_property(node,
+				"linux,crashkernel-size", NULL));
+
+	if (crashk_res.start != 0) {
+		crashk_base = cpu_to_be_ulong(crashk_res.start),
+		of_add_property(node, &crashk_base_prop);
+		crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
+		of_add_property(node, &crashk_size_prop);
+	}
+
+	/*
+	 * memory_limit is required by the kexec-tools to limit the
+	 * crash regions to the actual memory used.
+	 */
+	mem_limit = cpu_to_be_ulong(memory_limit);
+	of_update_property(node, &memory_limit_prop);
+}
+
+static int __init kexec_setup(void)
+{
+	struct device_node *node;
+
+	node = of_find_node_by_path("/chosen");
+	if (!node)
+		return -ENOENT;
+
+	/* remove any stale properties so ours can be found */
+	of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
+
+	/* information needed by userspace when using default_machine_kexec */
+	kernel_end = cpu_to_be_ulong(__pa(_end));
+	of_add_property(node, &kernel_end_prop);
+
+	export_crashk_values(node);
+
+	of_node_put(node);
+	return 0;
+}
+late_initcall(kexec_setup);
diff --git a/arch/powerpc/kexec/core_32.c b/arch/powerpc/kexec/core_32.c
new file mode 100644
index 000000000000..bf9f1f906d64
--- /dev/null
+++ b/arch/powerpc/kexec/core_32.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PPC32 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <asm/cacheflush.h>
+#include <asm/hw_irq.h>
+#include <asm/io.h>
+
+typedef void (*relocate_new_kernel_t)(
+				unsigned long indirection_page,
+				unsigned long reboot_code_buffer,
+				unsigned long start_address) __noreturn;
+
+/*
+ * This is a generic machine_kexec function suitable at least for
+ * non-OpenFirmware embedded platforms.
+ * It merely copies the image relocation code to the control page and
+ * jumps to it.
+ * A platform specific function may just call this one.
+ */
+void default_machine_kexec(struct kimage *image)
+{
+	extern const unsigned int relocate_new_kernel_size;
+	unsigned long page_list;
+	unsigned long reboot_code_buffer, reboot_code_buffer_phys;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* mask each interrupt so we are in a more sane state for the
+	 * kexec kernel */
+	machine_kexec_mask_interrupts();
+
+	page_list = image->head;
+
+	/* we need both effective and real address here */
+	reboot_code_buffer =
+			(unsigned long)page_address(image->control_code_page);
+	reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
+
+	/* copy our kernel relocation code to the control code page */
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
+
+	flush_icache_range(reboot_code_buffer,
+				reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
+	printk(KERN_INFO "Bye!\n");
+
+	if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x))
+		relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start);
+
+	/* now call it */
+	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+	(*rnk)(page_list, reboot_code_buffer_phys, image->start);
+}
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+	return 0;
+}
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
new file mode 100644
index 000000000000..04a7cba58eff
--- /dev/null
+++ b/arch/powerpc/kexec/core_64.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PPC64 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2004-2005, IBM Corp.
+ *
+ * Created by: Milton D Miller II
+ */
+
+
+#include <linux/kexec.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/init_task.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/machdep.h>
+#include <asm/cacheflush.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/mmu.h>
+#include <asm/sections.h>	/* _end */
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/asm-prototypes.h>
+#include <asm/svm.h>
+#include <asm/ultravisor.h>
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+	int i;
+	unsigned long begin, end;	/* limits of segment */
+	unsigned long low, high;	/* limits of blocked memory range */
+	struct device_node *node;
+	const unsigned long *basep;
+	const unsigned int *sizep;
+
+	/*
+	 * Since we use the kernel fault handlers and paging code to
+	 * handle the virtual mode, we must make sure no destination
+	 * overlaps kernel static data or bss.
+	 */
+	for (i = 0; i < image->nr_segments; i++)
+		if (image->segment[i].mem < __pa(_end))
+			return -ETXTBSY;
+
+	/* We also should not overwrite the tce tables */
+	for_each_node_by_type(node, "pci") {
+		basep = of_get_property(node, "linux,tce-base", NULL);
+		sizep = of_get_property(node, "linux,tce-size", NULL);
+		if (basep == NULL || sizep == NULL)
+			continue;
+
+		low = *basep;
+		high = low + (*sizep);
+
+		for (i = 0; i < image->nr_segments; i++) {
+			begin = image->segment[i].mem;
+			end = begin + image->segment[i].memsz;
+
+			if ((begin < high) && (end > low))
+				return -ETXTBSY;
+		}
+	}
+
+	return 0;
+}
+
+static void copy_segments(unsigned long ind)
+{
+	unsigned long entry;
+	unsigned long *ptr;
+	void *dest;
+	void *addr;
+
+	/*
+	 * We rely on kexec_load to create a lists that properly
+	 * initializes these pointers before they are used.
+	 * We will still crash if the list is wrong, but at least
+	 * the compiler will be quiet.
+	 */
+	ptr = NULL;
+	dest = NULL;
+
+	for (entry = ind; !(entry & IND_DONE); entry = *ptr++) {
+		addr = __va(entry & PAGE_MASK);
+
+		switch (entry & IND_FLAGS) {
+		case IND_DESTINATION:
+			dest = addr;
+			break;
+		case IND_INDIRECTION:
+			ptr = addr;
+			break;
+		case IND_SOURCE:
+			copy_page(dest, addr);
+			dest += PAGE_SIZE;
+		}
+	}
+}
+
+void kexec_copy_flush(struct kimage *image)
+{
+	long i, nr_segments = image->nr_segments;
+	struct  kexec_segment ranges[KEXEC_SEGMENT_MAX];
+
+	/* save the ranges on the stack to efficiently flush the icache */
+	memcpy(ranges, image->segment, sizeof(ranges));
+
+	/*
+	 * After this call we may not use anything allocated in dynamic
+	 * memory, including *image.
+	 *
+	 * Only globals and the stack are allowed.
+	 */
+	copy_segments(image->head);
+
+	/*
+	 * we need to clear the icache for all dest pages sometime,
+	 * including ones that were in place on the original copy
+	 */
+	for (i = 0; i < nr_segments; i++)
+		flush_icache_range((unsigned long)__va(ranges[i].mem),
+			(unsigned long)__va(ranges[i].mem + ranges[i].memsz));
+}
+
+#ifdef CONFIG_SMP
+
+static int kexec_all_irq_disabled = 0;
+
+static void kexec_smp_down(void *arg)
+{
+	local_irq_disable();
+	hard_irq_disable();
+
+	mb(); /* make sure our irqs are disabled before we say they are */
+	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+	while(kexec_all_irq_disabled == 0)
+		cpu_relax();
+	mb(); /* make sure all irqs are disabled before this */
+	hw_breakpoint_disable();
+	/*
+	 * Now every CPU has IRQs off, we can clear out any pending
+	 * IPIs and be sure that no more will come in after this.
+	 */
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 1);
+
+	kexec_smp_wait();
+	/* NOTREACHED */
+}
+
+static void kexec_prepare_cpus_wait(int wait_state)
+{
+	int my_cpu, i, notified=-1;
+
+	hw_breakpoint_disable();
+	my_cpu = get_cpu();
+	/* Make sure each CPU has at least made it to the state we need.
+	 *
+	 * FIXME: There is a (slim) chance of a problem if not all of the CPUs
+	 * are correctly onlined.  If somehow we start a CPU on boot with RTAS
+	 * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
+	 * time, the boot CPU will timeout.  If it does eventually execute
+	 * stuff, the secondary will start up (paca_ptrs[]->cpu_start was
+	 * written) and get into a peculiar state.
+	 * If the platform supports smp_ops->take_timebase(), the secondary CPU
+	 * will probably be spinning in there.  If not (i.e. pseries), the
+	 * secondary will continue on and try to online itself/idle/etc. If it
+	 * survives that, we need to find these
+	 * possible-but-not-online-but-should-be CPUs and chaperone them into
+	 * kexec_smp_wait().
+	 */
+	for_each_online_cpu(i) {
+		if (i == my_cpu)
+			continue;
+
+		while (paca_ptrs[i]->kexec_state < wait_state) {
+			barrier();
+			if (i != notified) {
+				printk(KERN_INFO "kexec: waiting for cpu %d "
+				       "(physical %d) to enter %i state\n",
+				       i, paca_ptrs[i]->hw_cpu_id, wait_state);
+				notified = i;
+			}
+		}
+	}
+	mb();
+}
+
+/*
+ * We need to make sure each present CPU is online.  The next kernel will scan
+ * the device tree and assume primary threads are online and query secondary
+ * threads via RTAS to online them if required.  If we don't online primary
+ * threads, they will be stuck.  However, we also online secondary threads as we
+ * may be using 'cede offline'.  In this case RTAS doesn't see the secondary
+ * threads as offline -- and again, these CPUs will be stuck.
+ *
+ * So, we online all CPUs that should be running, including secondary threads.
+ */
+static void wake_offline_cpus(void)
+{
+	int cpu = 0;
+
+	for_each_present_cpu(cpu) {
+		if (!cpu_online(cpu)) {
+			printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
+			       cpu);
+			WARN_ON(cpu_up(cpu));
+		}
+	}
+}
+
+static void kexec_prepare_cpus(void)
+{
+	wake_offline_cpus();
+	smp_call_function(kexec_smp_down, NULL, /* wait */0);
+	local_irq_disable();
+	hard_irq_disable();
+
+	mb(); /* make sure IRQs are disabled before we say they are */
+	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+
+	kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
+	/* we are sure every CPU has IRQs off at this point */
+	kexec_all_irq_disabled = 1;
+
+	/*
+	 * Before removing MMU mappings make sure all CPUs have entered real
+	 * mode:
+	 */
+	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
+
+	/* after we tell the others to go down */
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+
+	put_cpu();
+}
+
+#else /* ! SMP */
+
+static void kexec_prepare_cpus(void)
+{
+	/*
+	 * move the secondarys to us so that we can copy
+	 * the new kernel 0-0x100 safely
+	 *
+	 * do this if kexec in setup.c ?
+	 *
+	 * We need to release the cpus if we are ever going from an
+	 * UP to an SMP kernel.
+	 */
+	smp_release_cpus();
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+	local_irq_disable();
+	hard_irq_disable();
+}
+
+#endif /* SMP */
+
+/*
+ * kexec thread structure and stack.
+ *
+ * We need to make sure that this is 16384-byte aligned due to the
+ * way process stacks are handled.  It also must be statically allocated
+ * or allocated as part of the kimage, because everything else may be
+ * overwritten when we copy the kexec image.  We piggyback on the
+ * "init_task" linker section here to statically allocate a stack.
+ *
+ * We could use a smaller stack if we don't care about anything using
+ * current, but that audit has not been performed.
+ */
+static union thread_union kexec_stack __init_task_data =
+	{ };
+
+/*
+ * For similar reasons to the stack above, the kexecing CPU needs to be on a
+ * static PACA; we switch to kexec_paca.
+ */
+struct paca_struct kexec_paca;
+
+/* Our assembly helper, in misc_64.S */
+extern void kexec_sequence(void *newstack, unsigned long start,
+			   void *image, void *control,
+			   void (*clear_all)(void),
+			   bool copy_with_mmu_off) __noreturn;
+
+/* too late to fail here */
+void default_machine_kexec(struct kimage *image)
+{
+	bool copy_with_mmu_off;
+
+	/* prepare control code if any */
+
+	/*
+        * If the kexec boot is the normal one, need to shutdown other cpus
+        * into our wait loop and quiesce interrupts.
+        * Otherwise, in the case of crashed mode (crashing_cpu >= 0),
+        * stopping other CPUs and collecting their pt_regs is done before
+        * using debugger IPI.
+        */
+
+	if (!kdump_in_progress())
+		kexec_prepare_cpus();
+
+	printk("kexec: Starting switchover sequence.\n");
+
+	/* switch to a staticly allocated stack.  Based on irq stack code.
+	 * We setup preempt_count to avoid using VMX in memcpy.
+	 * XXX: the task struct will likely be invalid once we do the copy!
+	 */
+	current_thread_info()->flags = 0;
+	current_thread_info()->preempt_count = HARDIRQ_OFFSET;
+
+	/* We need a static PACA, too; copy this CPU's PACA over and switch to
+	 * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
+	 * non-static data.
+	 */
+	memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
+	kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
+#ifdef CONFIG_PPC_PSERIES
+	kexec_paca.lppaca_ptr = NULL;
+#endif
+
+	if (is_secure_guest() && !(image->preserve_context ||
+				   image->type == KEXEC_TYPE_CRASH)) {
+		uv_unshare_all_pages();
+		printk("kexec: Unshared all shared pages.\n");
+	}
+
+	paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
+
+	setup_paca(&kexec_paca);
+
+	/*
+	 * The lppaca should be unregistered at this point so the HV won't
+	 * touch it. In the case of a crash, none of the lppacas are
+	 * unregistered so there is not much we can do about it here.
+	 */
+
+	/*
+	 * On Book3S, the copy must happen with the MMU off if we are either
+	 * using Radix page tables or we are not in an LPAR since we can
+	 * overwrite the page tables while copying.
+	 *
+	 * In an LPAR, we keep the MMU on otherwise we can't access beyond
+	 * the RMA. On BookE there is no real MMU off mode, so we have to
+	 * keep it enabled as well (but then we have bolted TLB entries).
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	copy_with_mmu_off = false;
+#else
+	copy_with_mmu_off = radix_enabled() ||
+		!(firmware_has_feature(FW_FEATURE_LPAR) ||
+		  firmware_has_feature(FW_FEATURE_PS3_LV1));
+#endif
+
+	/* Some things are best done in assembly.  Finding globals with
+	 * a toc is easier in C, so pass in what we can.
+	 */
+	kexec_sequence(&kexec_stack, image->start, image,
+		       page_address(image->control_code_page),
+		       mmu_cleanup_all, copy_with_mmu_off);
+	/* NOTREACHED */
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Values we need to export to the second kernel via the device tree. */
+static unsigned long htab_base;
+static unsigned long htab_size;
+
+static struct property htab_base_prop = {
+	.name = "linux,htab-base",
+	.length = sizeof(unsigned long),
+	.value = &htab_base,
+};
+
+static struct property htab_size_prop = {
+	.name = "linux,htab-size",
+	.length = sizeof(unsigned long),
+	.value = &htab_size,
+};
+
+static int __init export_htab_values(void)
+{
+	struct device_node *node;
+
+	/* On machines with no htab htab_address is NULL */
+	if (!htab_address)
+		return -ENODEV;
+
+	node = of_find_node_by_path("/chosen");
+	if (!node)
+		return -ENODEV;
+
+	/* remove any stale propertys so ours can be found */
+	of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL));
+	of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL));
+
+	htab_base = cpu_to_be64(__pa(htab_address));
+	of_add_property(node, &htab_base_prop);
+	htab_size = cpu_to_be64(htab_size_bytes);
+	of_add_property(node, &htab_size_prop);
+
+	of_node_put(node);
+	return 0;
+}
+late_initcall(export_htab_values);
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
new file mode 100644
index 000000000000..d488311efab1
--- /dev/null
+++ b/arch/powerpc/kexec/crash.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Architecture specific (PPC64) functions for kexec based crash dumps.
+ *
+ * Copyright (C) 2005, IBM Corp.
+ *
+ * Created by: Haren Myneni
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/export.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/machdep.h>
+#include <asm/kexec.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/setjmp.h>
+#include <asm/debug.h>
+
+/*
+ * The primary CPU waits a while for all secondary CPUs to enter. This is to
+ * avoid sending an IPI if the secondary CPUs are entering
+ * crash_kexec_secondary on their own (eg via a system reset).
+ *
+ * The secondary timeout has to be longer than the primary. Both timeouts are
+ * in milliseconds.
+ */
+#define PRIMARY_TIMEOUT		500
+#define SECONDARY_TIMEOUT	1000
+
+#define IPI_TIMEOUT		10000
+#define REAL_MODE_TIMEOUT	10000
+
+static int time_to_dump;
+/*
+ * crash_wake_offline should be set to 1 by platforms that intend to wake
+ * up offline cpus prior to jumping to a kdump kernel. Currently powernv
+ * sets it to 1, since we want to avoid things from happening when an
+ * offline CPU wakes up due to something like an HMI (malfunction error),
+ * which propagates to all threads.
+ */
+int crash_wake_offline;
+
+#define CRASH_HANDLER_MAX 3
+/* List of shutdown handles */
+static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX];
+static DEFINE_SPINLOCK(crash_handlers_lock);
+
+static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
+static int crash_shutdown_cpu = -1;
+
+static int handle_fault(struct pt_regs *regs)
+{
+	if (crash_shutdown_cpu == smp_processor_id())
+		longjmp(crash_shutdown_buf, 1);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static atomic_t cpus_in_crash;
+void crash_ipi_callback(struct pt_regs *regs)
+{
+	static cpumask_t cpus_state_saved = CPU_MASK_NONE;
+
+	int cpu = smp_processor_id();
+
+	hard_irq_disable();
+	if (!cpumask_test_cpu(cpu, &cpus_state_saved)) {
+		crash_save_cpu(regs, cpu);
+		cpumask_set_cpu(cpu, &cpus_state_saved);
+	}
+
+	atomic_inc(&cpus_in_crash);
+	smp_mb__after_atomic();
+
+	/*
+	 * Starting the kdump boot.
+	 * This barrier is needed to make sure that all CPUs are stopped.
+	 */
+	while (!time_to_dump)
+		cpu_relax();
+
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(1, 1);
+
+#ifdef CONFIG_PPC64
+	kexec_smp_wait();
+#else
+	for (;;);	/* FIXME */
+#endif
+
+	/* NOTREACHED */
+}
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+	unsigned int msecs;
+	unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+	int tries = 0;
+	int (*old_handler)(struct pt_regs *regs);
+
+	printk(KERN_EMERG "Sending IPI to other CPUs\n");
+
+	if (crash_wake_offline)
+		ncpus = num_present_cpus() - 1;
+
+	crash_send_ipi(crash_ipi_callback);
+	smp_wmb();
+
+again:
+	/*
+	 * FIXME: Until we will have the way to stop other CPUs reliably,
+	 * the crash CPU will send an IPI and wait for other CPUs to
+	 * respond.
+	 */
+	msecs = IPI_TIMEOUT;
+	while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0))
+		mdelay(1);
+
+	/* Would it be better to replace the trap vector here? */
+
+	if (atomic_read(&cpus_in_crash) >= ncpus) {
+		printk(KERN_EMERG "IPI complete\n");
+		return;
+	}
+
+	printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n",
+		ncpus - atomic_read(&cpus_in_crash));
+
+	/*
+	 * If we have a panic timeout set then we can't wait indefinitely
+	 * for someone to activate system reset. We also give up on the
+	 * second time through if system reset fail to work.
+	 */
+	if ((panic_timeout > 0) || (tries > 0))
+		return;
+
+	/*
+	 * A system reset will cause all CPUs to take an 0x100 exception.
+	 * The primary CPU returns here via setjmp, and the secondary
+	 * CPUs reexecute the crash_kexec_secondary path.
+	 */
+	old_handler = __debugger;
+	__debugger = handle_fault;
+	crash_shutdown_cpu = smp_processor_id();
+
+	if (setjmp(crash_shutdown_buf) == 0) {
+		printk(KERN_EMERG "Activate system reset (dumprestart) "
+				  "to stop other cpu(s)\n");
+
+		/*
+		 * A system reset will force all CPUs to execute the
+		 * crash code again. We need to reset cpus_in_crash so we
+		 * wait for everyone to do this.
+		 */
+		atomic_set(&cpus_in_crash, 0);
+		smp_mb();
+
+		while (atomic_read(&cpus_in_crash) < ncpus)
+			cpu_relax();
+	}
+
+	crash_shutdown_cpu = -1;
+	__debugger = old_handler;
+
+	tries++;
+	goto again;
+}
+
+/*
+ * This function will be called by secondary cpus.
+ */
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int msecs = SECONDARY_TIMEOUT;
+
+	local_irq_save(flags);
+
+	/* Wait for the primary crash CPU to signal its progress */
+	while (crashing_cpu < 0) {
+		if (--msecs < 0) {
+			/* No response, kdump image may not have been loaded */
+			local_irq_restore(flags);
+			return;
+		}
+
+		mdelay(1);
+	}
+
+	crash_ipi_callback(regs);
+}
+
+#else	/* ! CONFIG_SMP */
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+	/*
+	 * move the secondaries to us so that we can copy
+	 * the new kernel 0-0x100 safely
+	 *
+	 * do this if kexec in setup.c ?
+	 */
+#ifdef CONFIG_PPC64
+	smp_release_cpus();
+#else
+	/* FIXME */
+#endif
+}
+
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+}
+#endif	/* CONFIG_SMP */
+
+/* wait for all the CPUs to hit real mode but timeout if they don't come in */
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
+static void __maybe_unused crash_kexec_wait_realmode(int cpu)
+{
+	unsigned int msecs;
+	int i;
+
+	msecs = REAL_MODE_TIMEOUT;
+	for (i=0; i < nr_cpu_ids && msecs > 0; i++) {
+		if (i == cpu)
+			continue;
+
+		while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) {
+			barrier();
+			if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
+				break;
+			msecs--;
+			mdelay(1);
+		}
+	}
+	mb();
+}
+#else
+static inline void crash_kexec_wait_realmode(int cpu) {}
+#endif	/* CONFIG_SMP && CONFIG_PPC64 */
+
+/*
+ * Register a function to be called on shutdown.  Only use this if you
+ * can't reset your device in the second kernel.
+ */
+int crash_shutdown_register(crash_shutdown_t handler)
+{
+	unsigned int i, rc;
+
+	spin_lock(&crash_handlers_lock);
+	for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+		if (!crash_shutdown_handles[i]) {
+			/* Insert handle at first empty entry */
+			crash_shutdown_handles[i] = handler;
+			rc = 0;
+			break;
+		}
+
+	if (i == CRASH_HANDLER_MAX) {
+		printk(KERN_ERR "Crash shutdown handles full, "
+		       "not registered.\n");
+		rc = 1;
+	}
+
+	spin_unlock(&crash_handlers_lock);
+	return rc;
+}
author	Christophe Leroy <christophe.leroy@c-s.fr>	2019-10-29 12:13:58 +0000
committer	Michael Ellerman <mpe@ellerman.id.au>	2019-11-21 15:41:34 +1100
commit	793b08e2efff3ec020c5c5861d00ed394fcdd488 (patch)
tree	0a0ccae4f55def043a319f6b1654d558d0254a53 /arch/powerpc/kexec
parent	9f7bd9201521b3ad11e96887550dd3e835ba01cb (diff)