summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kexec
diff options
context:
space:
mode:
authorChristophe Leroy <christophe.leroy@c-s.fr>2019-10-29 12:13:58 +0000
committerMichael Ellerman <mpe@ellerman.id.au>2019-11-21 15:41:34 +1100
commit793b08e2efff3ec020c5c5861d00ed394fcdd488 (patch)
tree0a0ccae4f55def043a319f6b1654d558d0254a53 /arch/powerpc/kexec
parent9f7bd9201521b3ad11e96887550dd3e835ba01cb (diff)
powerpc/kexec: Move kexec files into a dedicated subdir.
arch/powerpc/kernel/ contains 8 files dedicated to kexec. Move them into a dedicated subdirectory. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> [mpe: Move to a/p/kexec, drop the 'machine' naming and use 'core' instead] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/afbef97ec6a978574a5cf91a4441000e0a9da42a.1572351221.git.christophe.leroy@c-s.fr
Diffstat (limited to 'arch/powerpc/kexec')
-rw-r--r--arch/powerpc/kexec/Makefile25
-rw-r--r--arch/powerpc/kexec/core.c280
-rw-r--r--arch/powerpc/kexec/core_32.c69
-rw-r--r--arch/powerpc/kexec/core_64.c417
-rw-r--r--arch/powerpc/kexec/crash.c374
-rw-r--r--arch/powerpc/kexec/elf_64.c125
-rw-r--r--arch/powerpc/kexec/file_load.c254
-rw-r--r--arch/powerpc/kexec/ima.c219
-rw-r--r--arch/powerpc/kexec/relocate_32.S500
9 files changed, 2263 insertions, 0 deletions
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
new file mode 100644
index 000000000000..16c1c5a19519
--- /dev/null
+++ b/arch/powerpc/kexec/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux kernel.
+#
+
+# Disable clang warning for using setjmp without setjmp.h header
+CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header)
+
+obj-y += core.o crash.o core_$(BITS).o
+
+obj-$(CONFIG_PPC32) += relocate_32.o
+
+obj-$(CONFIG_KEXEC_FILE) += file_load.o elf_$(BITS).o
+
+ifdef CONFIG_HAVE_IMA_KEXEC
+ifdef CONFIG_IMA
+obj-y += ima.o
+endif
+endif
+
+
+# Disable GCOV, KCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_core_$(BITS).o := n
+KCOV_INSTRUMENT_core_$(BITS).o := n
+UBSAN_SANITIZE_core_$(BITS).o := n
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
new file mode 100644
index 000000000000..078fe3d76feb
--- /dev/null
+++ b/arch/powerpc/kexec/core.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Code to handle transition of Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/reboot.h>
+#include <linux/threads.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/irq.h>
+#include <linux/ftrace.h>
+
+#include <asm/kdump.h>
+#include <asm/machdep.h>
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/sections.h>
+
+void machine_kexec_mask_interrupts(void) {
+ unsigned int i;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(i, desc) {
+ struct irq_chip *chip;
+
+ chip = irq_desc_get_chip(desc);
+ if (!chip)
+ continue;
+
+ if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+ chip->irq_eoi(&desc->irq_data);
+
+ if (chip->irq_mask)
+ chip->irq_mask(&desc->irq_data);
+
+ if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+ chip->irq_disable(&desc->irq_data);
+ }
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ default_machine_crash_shutdown(regs);
+}
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ if (ppc_md.machine_kexec_prepare)
+ return ppc_md.machine_kexec_prepare(image);
+ else
+ return default_machine_kexec_prepare(image);
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+ VMCOREINFO_SYMBOL(vmemmap_list);
+ VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+ VMCOREINFO_SYMBOL(mmu_psize_defs);
+ VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+ VMCOREINFO_OFFSET(vmemmap_backing, list);
+ VMCOREINFO_OFFSET(vmemmap_backing, phys);
+ VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+ VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+ VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ int save_ftrace_enabled;
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+ this_cpu_disable_ftrace();
+
+ if (ppc_md.machine_kexec)
+ ppc_md.machine_kexec(image);
+ else
+ default_machine_kexec(image);
+
+ this_cpu_enable_ftrace();
+ __ftrace_enabled_restore(save_ftrace_enabled);
+
+ /* Fall back to normal restart if we're still alive. */
+ machine_restart(NULL);
+ for(;;);
+}
+
+void __init reserve_crashkernel(void)
+{
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ /* use common parsing */
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+ &crash_size, &crash_base);
+ if (ret == 0 && crash_size > 0) {
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ }
+
+ if (crashk_res.end == crashk_res.start) {
+ crashk_res.start = crashk_res.end = 0;
+ return;
+ }
+
+ /* We might have got these values via the command line or the
+ * device tree, either way sanitise them now. */
+
+ crash_size = resource_size(&crashk_res);
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+ if (crashk_res.start != KDUMP_KERNELBASE)
+ printk("Crash kernel location must be 0x%x\n",
+ KDUMP_KERNELBASE);
+
+ crashk_res.start = KDUMP_KERNELBASE;
+#else
+ if (!crashk_res.start) {
+#ifdef CONFIG_PPC64
+ /*
+ * On 64bit we split the RMO in half but cap it at half of
+ * a small SLB (128MB) since the crash kernel needs to place
+ * itself and some stacks to be in the first segment.
+ */
+ crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+#else
+ crashk_res.start = KDUMP_KERNELBASE;
+#endif
+ }
+
+ crash_base = PAGE_ALIGN(crashk_res.start);
+ if (crash_base != crashk_res.start) {
+ printk("Crash kernel base must be aligned to 0x%lx\n",
+ PAGE_SIZE);
+ crashk_res.start = crash_base;
+ }
+
+#endif
+ crash_size = PAGE_ALIGN(crash_size);
+ crashk_res.end = crashk_res.start + crash_size - 1;
+
+ /* The crash region must not overlap the current kernel */
+ if (overlaps_crashkernel(__pa(_stext), _end - _stext)) {
+ printk(KERN_WARNING
+ "Crash kernel can not overlap current kernel\n");
+ crashk_res.start = crashk_res.end = 0;
+ return;
+ }
+
+ /* Crash kernel trumps memory limit */
+ if (memory_limit && memory_limit <= crashk_res.end) {
+ memory_limit = crashk_res.end + 1;
+ printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
+ memory_limit);
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crashk_res.start >> 20),
+ (unsigned long)(memblock_phys_mem_size() >> 20));
+
+ if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
+ memblock_reserve(crashk_res.start, crash_size)) {
+ pr_err("Failed to reserve memory for crashkernel!\n");
+ crashk_res.start = crashk_res.end = 0;
+ return;
+ }
+}
+
+int overlaps_crashkernel(unsigned long start, unsigned long size)
+{
+ return (start + size) > crashk_res.start && start <= crashk_res.end;
+}
+
+/* Values we need to export to the second kernel via the device tree. */
+static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
+static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
+
+static struct property kernel_end_prop = {
+ .name = "linux,kernel-end",
+ .length = sizeof(phys_addr_t),
+ .value = &kernel_end,
+};
+
+static struct property crashk_base_prop = {
+ .name = "linux,crashkernel-base",
+ .length = sizeof(phys_addr_t),
+ .value = &crashk_base
+};
+
+static struct property crashk_size_prop = {
+ .name = "linux,crashkernel-size",
+ .length = sizeof(phys_addr_t),
+ .value = &crashk_size,
+};
+
+static struct property memory_limit_prop = {
+ .name = "linux,memory-limit",
+ .length = sizeof(unsigned long long),
+ .value = &mem_limit,
+};
+
+#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG)
+
+static void __init export_crashk_values(struct device_node *node)
+{
+ /* There might be existing crash kernel properties, but we can't
+ * be sure what's in them, so remove them. */
+ of_remove_property(node, of_find_property(node,
+ "linux,crashkernel-base", NULL));
+ of_remove_property(node, of_find_property(node,
+ "linux,crashkernel-size", NULL));
+
+ if (crashk_res.start != 0) {
+ crashk_base = cpu_to_be_ulong(crashk_res.start),
+ of_add_property(node, &crashk_base_prop);
+ crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
+ of_add_property(node, &crashk_size_prop);
+ }
+
+ /*
+ * memory_limit is required by the kexec-tools to limit the
+ * crash regions to the actual memory used.
+ */
+ mem_limit = cpu_to_be_ulong(memory_limit);
+ of_update_property(node, &memory_limit_prop);
+}
+
+static int __init kexec_setup(void)
+{
+ struct device_node *node;
+
+ node = of_find_node_by_path("/chosen");
+ if (!node)
+ return -ENOENT;
+
+ /* remove any stale properties so ours can be found */
+ of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
+
+ /* information needed by userspace when using default_machine_kexec */
+ kernel_end = cpu_to_be_ulong(__pa(_end));
+ of_add_property(node, &kernel_end_prop);
+
+ export_crashk_values(node);
+
+ of_node_put(node);
+ return 0;
+}
+late_initcall(kexec_setup);
diff --git a/arch/powerpc/kexec/core_32.c b/arch/powerpc/kexec/core_32.c
new file mode 100644
index 000000000000..bf9f1f906d64
--- /dev/null
+++ b/arch/powerpc/kexec/core_32.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PPC32 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <asm/cacheflush.h>
+#include <asm/hw_irq.h>
+#include <asm/io.h>
+
+typedef void (*relocate_new_kernel_t)(
+ unsigned long indirection_page,
+ unsigned long reboot_code_buffer,
+ unsigned long start_address) __noreturn;
+
+/*
+ * This is a generic machine_kexec function suitable at least for
+ * non-OpenFirmware embedded platforms.
+ * It merely copies the image relocation code to the control page and
+ * jumps to it.
+ * A platform specific function may just call this one.
+ */
+void default_machine_kexec(struct kimage *image)
+{
+ extern const unsigned int relocate_new_kernel_size;
+ unsigned long page_list;
+ unsigned long reboot_code_buffer, reboot_code_buffer_phys;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* mask each interrupt so we are in a more sane state for the
+ * kexec kernel */
+ machine_kexec_mask_interrupts();
+
+ page_list = image->head;
+
+ /* we need both effective and real address here */
+ reboot_code_buffer =
+ (unsigned long)page_address(image->control_code_page);
+ reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
+
+ /* copy our kernel relocation code to the control code page */
+ memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+ relocate_new_kernel_size);
+
+ flush_icache_range(reboot_code_buffer,
+ reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
+ printk(KERN_INFO "Bye!\n");
+
+ if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x))
+ relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start);
+
+ /* now call it */
+ rnk = (relocate_new_kernel_t) reboot_code_buffer;
+ (*rnk)(page_list, reboot_code_buffer_phys, image->start);
+}
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+ return 0;
+}
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
new file mode 100644
index 000000000000..04a7cba58eff
--- /dev/null
+++ b/arch/powerpc/kexec/core_64.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PPC64 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2004-2005, IBM Corp.
+ *
+ * Created by: Milton D Miller II
+ */
+
+
+#include <linux/kexec.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/init_task.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/machdep.h>
+#include <asm/cacheflush.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/mmu.h>
+#include <asm/sections.h> /* _end */
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/asm-prototypes.h>
+#include <asm/svm.h>
+#include <asm/ultravisor.h>
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+ int i;
+ unsigned long begin, end; /* limits of segment */
+ unsigned long low, high; /* limits of blocked memory range */
+ struct device_node *node;
+ const unsigned long *basep;
+ const unsigned int *sizep;
+
+ /*
+ * Since we use the kernel fault handlers and paging code to
+ * handle the virtual mode, we must make sure no destination
+ * overlaps kernel static data or bss.
+ */
+ for (i = 0; i < image->nr_segments; i++)
+ if (image->segment[i].mem < __pa(_end))
+ return -ETXTBSY;
+
+ /* We also should not overwrite the tce tables */
+ for_each_node_by_type(node, "pci") {
+ basep = of_get_property(node, "linux,tce-base", NULL);
+ sizep = of_get_property(node, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL)
+ continue;
+
+ low = *basep;
+ high = low + (*sizep);
+
+ for (i = 0; i < image->nr_segments; i++) {
+ begin = image->segment[i].mem;
+ end = begin + image->segment[i].memsz;
+
+ if ((begin < high) && (end > low))
+ return -ETXTBSY;
+ }
+ }
+
+ return 0;
+}
+
+static void copy_segments(unsigned long ind)
+{
+ unsigned long entry;
+ unsigned long *ptr;
+ void *dest;
+ void *addr;
+
+ /*
+ * We rely on kexec_load to create a lists that properly
+ * initializes these pointers before they are used.
+ * We will still crash if the list is wrong, but at least
+ * the compiler will be quiet.
+ */
+ ptr = NULL;
+ dest = NULL;
+
+ for (entry = ind; !(entry & IND_DONE); entry = *ptr++) {
+ addr = __va(entry & PAGE_MASK);
+
+ switch (entry & IND_FLAGS) {
+ case IND_DESTINATION:
+ dest = addr;
+ break;
+ case IND_INDIRECTION:
+ ptr = addr;
+ break;
+ case IND_SOURCE:
+ copy_page(dest, addr);
+ dest += PAGE_SIZE;
+ }
+ }
+}
+
+void kexec_copy_flush(struct kimage *image)
+{
+ long i, nr_segments = image->nr_segments;
+ struct kexec_segment ranges[KEXEC_SEGMENT_MAX];
+
+ /* save the ranges on the stack to efficiently flush the icache */
+ memcpy(ranges, image->segment, sizeof(ranges));
+
+ /*
+ * After this call we may not use anything allocated in dynamic
+ * memory, including *image.
+ *
+ * Only globals and the stack are allowed.
+ */
+ copy_segments(image->head);
+
+ /*
+ * we need to clear the icache for all dest pages sometime,
+ * including ones that were in place on the original copy
+ */
+ for (i = 0; i < nr_segments; i++)
+ flush_icache_range((unsigned long)__va(ranges[i].mem),
+ (unsigned long)__va(ranges[i].mem + ranges[i].memsz));
+}
+
+#ifdef CONFIG_SMP
+
+static int kexec_all_irq_disabled = 0;
+
+static void kexec_smp_down(void *arg)
+{
+ local_irq_disable();
+ hard_irq_disable();
+
+ mb(); /* make sure our irqs are disabled before we say they are */
+ get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+ while(kexec_all_irq_disabled == 0)
+ cpu_relax();
+ mb(); /* make sure all irqs are disabled before this */
+ hw_breakpoint_disable();
+ /*
+ * Now every CPU has IRQs off, we can clear out any pending
+ * IPIs and be sure that no more will come in after this.
+ */
+ if (ppc_md.kexec_cpu_down)
+ ppc_md.kexec_cpu_down(0, 1);
+
+ kexec_smp_wait();
+ /* NOTREACHED */
+}
+
+static void kexec_prepare_cpus_wait(int wait_state)
+{
+ int my_cpu, i, notified=-1;
+
+ hw_breakpoint_disable();
+ my_cpu = get_cpu();
+ /* Make sure each CPU has at least made it to the state we need.
+ *
+ * FIXME: There is a (slim) chance of a problem if not all of the CPUs
+ * are correctly onlined. If somehow we start a CPU on boot with RTAS
+ * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
+ * time, the boot CPU will timeout. If it does eventually execute
+ * stuff, the secondary will start up (paca_ptrs[]->cpu_start was
+ * written) and get into a peculiar state.
+ * If the platform supports smp_ops->take_timebase(), the secondary CPU
+ * will probably be spinning in there. If not (i.e. pseries), the
+ * secondary will continue on and try to online itself/idle/etc. If it
+ * survives that, we need to find these
+ * possible-but-not-online-but-should-be CPUs and chaperone them into
+ * kexec_smp_wait().
+ */
+ for_each_online_cpu(i) {
+ if (i == my_cpu)
+ continue;
+
+ while (paca_ptrs[i]->kexec_state < wait_state) {
+ barrier();
+ if (i != notified) {
+ printk(KERN_INFO "kexec: waiting for cpu %d "
+ "(physical %d) to enter %i state\n",
+ i, paca_ptrs[i]->hw_cpu_id, wait_state);
+ notified = i;
+ }
+ }
+ }
+ mb();
+}
+
+/*
+ * We need to make sure each present CPU is online. The next kernel will scan
+ * the device tree and assume primary threads are online and query secondary
+ * threads via RTAS to online them if required. If we don't online primary
+ * threads, they will be stuck. However, we also online secondary threads as we
+ * may be using 'cede offline'. In this case RTAS doesn't see the secondary
+ * threads as offline -- and again, these CPUs will be stuck.
+ *
+ * So, we online all CPUs that should be running, including secondary threads.
+ */
+static void wake_offline_cpus(void)
+{
+ int cpu = 0;
+
+ for_each_present_cpu(cpu) {
+ if (!cpu_online(cpu)) {
+ printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
+ cpu);
+ WARN_ON(cpu_up(cpu));
+ }
+ }
+}
+
+static void kexec_prepare_cpus(void)
+{
+ wake_offline_cpus();
+ smp_call_function(kexec_smp_down, NULL, /* wait */0);
+ local_irq_disable();
+ hard_irq_disable();
+
+ mb(); /* make sure IRQs are disabled before we say they are */
+ get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+
+ kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
+ /* we are sure every CPU has IRQs off at this point */
+ kexec_all_irq_disabled = 1;
+
+ /*
+ * Before removing MMU mappings make sure all CPUs have entered real
+ * mode:
+ */
+ kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
+
+ /* after we tell the others to go down */
+ if (ppc_md.kexec_cpu_down)
+ ppc_md.kexec_cpu_down(0, 0);
+
+ put_cpu();
+}
+
+#else /* ! SMP */
+
+static void kexec_prepare_cpus(void)
+{
+ /*
+ * move the secondarys to us so that we can copy
+ * the new kernel 0-0x100 safely
+ *
+ * do this if kexec in setup.c ?
+ *
+ * We need to release the cpus if we are ever going from an
+ * UP to an SMP kernel.
+ */
+ smp_release_cpus();
+ if (ppc_md.kexec_cpu_down)
+ ppc_md.kexec_cpu_down(0, 0);
+ local_irq_disable();
+ hard_irq_disable();
+}
+
+#endif /* SMP */
+
+/*
+ * kexec thread structure and stack.
+ *
+ * We need to make sure that this is 16384-byte aligned due to the
+ * way process stacks are handled. It also must be statically allocated
+ * or allocated as part of the kimage, because everything else may be
+ * overwritten when we copy the kexec image. We piggyback on the
+ * "init_task" linker section here to statically allocate a stack.
+ *
+ * We could use a smaller stack if we don't care about anything using
+ * current, but that audit has not been performed.
+ */
+static union thread_union kexec_stack __init_task_data =
+ { };
+
+/*
+ * For similar reasons to the stack above, the kexecing CPU needs to be on a
+ * static PACA; we switch to kexec_paca.
+ */
+struct paca_struct kexec_paca;
+
+/* Our assembly helper, in misc_64.S */
+extern void kexec_sequence(void *newstack, unsigned long start,
+ void *image, void *control,
+ void (*clear_all)(void),
+ bool copy_with_mmu_off) __noreturn;
+
+/* too late to fail here */
+void default_machine_kexec(struct kimage *image)
+{
+ bool copy_with_mmu_off;
+
+ /* prepare control code if any */
+
+ /*
+ * If the kexec boot is the normal one, need to shutdown other cpus
+ * into our wait loop and quiesce interrupts.
+ * Otherwise, in the case of crashed mode (crashing_cpu >= 0),
+ * stopping other CPUs and collecting their pt_regs is done before
+ * using debugger IPI.
+ */
+
+ if (!kdump_in_progress())
+ kexec_prepare_cpus();
+
+ printk("kexec: Starting switchover sequence.\n");
+
+ /* switch to a staticly allocated stack. Based on irq stack code.
+ * We setup preempt_count to avoid using VMX in memcpy.
+ * XXX: the task struct will likely be invalid once we do the copy!
+ */
+ current_thread_info()->flags = 0;
+ current_thread_info()->preempt_count = HARDIRQ_OFFSET;
+
+ /* We need a static PACA, too; copy this CPU's PACA over and switch to
+ * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
+ * non-static data.
+ */
+ memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
+ kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
+#ifdef CONFIG_PPC_PSERIES
+ kexec_paca.lppaca_ptr = NULL;
+#endif
+
+ if (is_secure_guest() && !(image->preserve_context ||
+ image->type == KEXEC_TYPE_CRASH)) {
+ uv_unshare_all_pages();
+ printk("kexec: Unshared all shared pages.\n");
+ }
+
+ paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
+
+ setup_paca(&kexec_paca);
+
+ /*
+ * The lppaca should be unregistered at this point so the HV won't
+ * touch it. In the case of a crash, none of the lppacas are
+ * unregistered so there is not much we can do about it here.
+ */
+
+ /*
+ * On Book3S, the copy must happen with the MMU off if we are either
+ * using Radix page tables or we are not in an LPAR since we can
+ * overwrite the page tables while copying.
+ *
+ * In an LPAR, we keep the MMU on otherwise we can't access beyond
+ * the RMA. On BookE there is no real MMU off mode, so we have to
+ * keep it enabled as well (but then we have bolted TLB entries).
+ */
+#ifdef CONFIG_PPC_BOOK3E
+ copy_with_mmu_off = false;
+#else
+ copy_with_mmu_off = radix_enabled() ||
+ !(firmware_has_feature(FW_FEATURE_LPAR) ||
+ firmware_has_feature(FW_FEATURE_PS3_LV1));
+#endif
+
+ /* Some things are best done in assembly. Finding globals with
+ * a toc is easier in C, so pass in what we can.
+ */
+ kexec_sequence(&kexec_stack, image->start, image,
+ page_address(image->control_code_page),
+ mmu_cleanup_all, copy_with_mmu_off);
+ /* NOTREACHED */
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Values we need to export to the second kernel via the device tree. */
+static unsigned long htab_base;
+static unsigned long htab_size;
+
+static struct property htab_base_prop = {
+ .name = "linux,htab-base",
+ .length = sizeof(unsigned long),
+ .value = &htab_base,
+};
+
+static struct property htab_size_prop = {
+ .name = "linux,htab-size",
+ .length = sizeof(unsigned long),
+ .value = &htab_size,
+};
+
+static int __init export_htab_values(void)
+{
+ struct device_node *node;
+
+ /* On machines with no htab htab_address is NULL */
+ if (!htab_address)
+ return -ENODEV;
+
+ node = of_find_node_by_path("/chosen");
+ if (!node)
+ return -ENODEV;
+
+ /* remove any stale propertys so ours can be found */
+ of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL));
+ of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL));
+
+ htab_base = cpu_to_be64(__pa(htab_address));
+ of_add_property(node, &htab_base_prop);
+ htab_size = cpu_to_be64(htab_size_bytes);
+ of_add_property(node, &htab_size_prop);
+
+ of_node_put(node);
+ return 0;
+}
+late_initcall(export_htab_values);
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
new file mode 100644
index 000000000000..d488311efab1
--- /dev/null
+++ b/arch/powerpc/kexec/crash.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Architecture specific (PPC64) functions for kexec based crash dumps.
+ *
+ * Copyright (C) 2005, IBM Corp.
+ *
+ * Created by: Haren Myneni
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/export.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/machdep.h>
+#include <asm/kexec.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/setjmp.h>
+#include <asm/debug.h>
+
+/*
+ * The primary CPU waits a while for all secondary CPUs to enter. This is to
+ * avoid sending an IPI if the secondary CPUs are entering
+ * crash_kexec_secondary on their own (eg via a system reset).
+ *
+ * The secondary timeout has to be longer than the primary. Both timeouts are
+ * in milliseconds.
+ */
+#define PRIMARY_TIMEOUT 500
+#define SECONDARY_TIMEOUT 1000
+
+#define IPI_TIMEOUT 10000
+#define REAL_MODE_TIMEOUT 10000
+
+static int time_to_dump;
+/*
+ * crash_wake_offline should be set to 1 by platforms that intend to wake
+ * up offline cpus prior to jumping to a kdump kernel. Currently powernv
+ * sets it to 1, since we want to avoid things from happening when an
+ * offline CPU wakes up due to something like an HMI (malfunction error),
+ * which propagates to all threads.
+ */
+int crash_wake_offline;
+
+#define CRASH_HANDLER_MAX 3
+/* List of shutdown handles */
+static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX];
+static DEFINE_SPINLOCK(crash_handlers_lock);
+
+static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
+static int crash_shutdown_cpu = -1;
+
+static int handle_fault(struct pt_regs *regs)
+{
+ if (crash_shutdown_cpu == smp_processor_id())
+ longjmp(crash_shutdown_buf, 1);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static atomic_t cpus_in_crash;
+void crash_ipi_callback(struct pt_regs *regs)
+{
+ static cpumask_t cpus_state_saved = CPU_MASK_NONE;
+
+ int cpu = smp_processor_id();
+
+ hard_irq_disable();
+ if (!cpumask_test_cpu(cpu, &cpus_state_saved)) {
+ crash_save_cpu(regs, cpu);
+ cpumask_set_cpu(cpu, &cpus_state_saved);
+ }
+
+ atomic_inc(&cpus_in_crash);
+ smp_mb__after_atomic();
+
+ /*
+ * Starting the kdump boot.
+ * This barrier is needed to make sure that all CPUs are stopped.
+ */
+ while (!time_to_dump)
+ cpu_relax();
+
+ if (ppc_md.kexec_cpu_down)
+ ppc_md.kexec_cpu_down(1, 1);
+
+#ifdef CONFIG_PPC64
+ kexec_smp_wait();
+#else
+ for (;;); /* FIXME */
+#endif
+
+ /* NOTREACHED */
+}
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+ unsigned int msecs;
+ unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+ int tries = 0;
+ int (*old_handler)(struct pt_regs *regs);
+
+ printk(KERN_EMERG "Sending IPI to other CPUs\n");
+
+ if (crash_wake_offline)
+ ncpus = num_present_cpus() - 1;
+
+ crash_send_ipi(crash_ipi_callback);
+ smp_wmb();
+
+again:
+ /*
+ * FIXME: Until we will have the way to stop other CPUs reliably,
+ * the crash CPU will send an IPI and wait for other CPUs to
+ * respond.
+ */
+ msecs = IPI_TIMEOUT;
+ while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0))
+ mdelay(1);
+
+ /* Would it be better to replace the trap vector here? */
+
+ if (atomic_read(&cpus_in_crash) >= ncpus) {
+ printk(KERN_EMERG "IPI complete\n");
+ return;
+ }
+
+ printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n",
+ ncpus - atomic_read(&cpus_in_crash));
+
+ /*
+ * If we have a panic timeout set then we can't wait indefinitely
+ * for someone to activate system reset. We also give up on the
+ * second time through if system reset fail to work.
+ */
+ if ((panic_timeout > 0) || (tries > 0))
+ return;
+
+ /*
+ * A system reset will cause all CPUs to take an 0x100 exception.
+ * The primary CPU returns here via setjmp, and the secondary
+ * CPUs reexecute the crash_kexec_secondary path.
+ */
+ old_handler = __debugger;
+ __debugger = handle_fault;
+ crash_shutdown_cpu = smp_processor_id();
+
+ if (setjmp(crash_shutdown_buf) == 0) {
+ printk(KERN_EMERG "Activate system reset (dumprestart) "
+ "to stop other cpu(s)\n");
+
+ /*
+ * A system reset will force all CPUs to execute the
+ * crash code again. We need to reset cpus_in_crash so we
+ * wait for everyone to do this.
+ */
+ atomic_set(&cpus_in_crash, 0);
+ smp_mb();
+
+ while (atomic_read(&cpus_in_crash) < ncpus)
+ cpu_relax();
+ }
+
+ crash_shutdown_cpu = -1;
+ __debugger = old_handler;
+
+ tries++;
+ goto again;
+}
+
+/*
+ * This function will be called by secondary cpus.
+ */
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+ unsigned long flags;
+ int msecs = SECONDARY_TIMEOUT;
+
+ local_irq_save(flags);
+
+ /* Wait for the primary crash CPU to signal its progress */
+ while (crashing_cpu < 0) {
+ if (--msecs < 0) {
+ /* No response, kdump image may not have been loaded */
+ local_irq_restore(flags);
+ return;
+ }
+
+ mdelay(1);
+ }
+
+ crash_ipi_callback(regs);
+}
+
+#else /* ! CONFIG_SMP */
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+ /*
+ * move the secondaries to us so that we can copy
+ * the new kernel 0-0x100 safely
+ *
+ * do this if kexec in setup.c ?
+ */
+#ifdef CONFIG_PPC64
+ smp_release_cpus();
+#else
+ /* FIXME */
+#endif
+}
+
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+}
+#endif /* CONFIG_SMP */
+
+/* wait for all the CPUs to hit real mode but timeout if they don't come in */
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
+static void __maybe_unused crash_kexec_wait_realmode(int cpu)
+{
+ unsigned int msecs;
+ int i;
+
+ msecs = REAL_MODE_TIMEOUT;
+ for (i=0; i < nr_cpu_ids && msecs > 0; i++) {
+ if (i == cpu)
+ continue;
+
+ while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) {
+ barrier();
+ if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
+ break;
+ msecs--;
+ mdelay(1);
+ }
+ }
+ mb();
+}
+#else
+static inline void crash_kexec_wait_realmode(int cpu) {}
+#endif /* CONFIG_SMP && CONFIG_PPC64 */
+
+/*
+ * Register a function to be called on shutdown. Only use this if you
+ * can't reset your device in the second kernel.
+ */
+int crash_shutdown_register(crash_shutdown_t handler)
+{
+ unsigned int i, rc;
+
+ spin_lock(&crash_handlers_lock);
+ for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+ if (!crash_shutdown_handles[i]) {
+ /* Insert handle at first empty entry */
+ crash_shutdown_handles[i] = handler;
+ rc = 0;
+ break;
+ }
+
+ if (i == CRASH_HANDLER_MAX) {
+ printk(KERN_ERR "Crash shutdown handles full, "
+ "not registered.\n");
+ rc = 1;
+ }
+
+ spin_unlock(&crash_handlers_lock);
+ return rc;
+}