diff options
author | Christophe Leroy <christophe.leroy@c-s.fr> | 2019-10-29 12:13:58 +0000 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2019-11-21 15:41:34 +1100 |
commit | 793b08e2efff3ec020c5c5861d00ed394fcdd488 (patch) | |
tree | 0a0ccae4f55def043a319f6b1654d558d0254a53 /arch/powerpc/kexec | |
parent | 9f7bd9201521b3ad11e96887550dd3e835ba01cb (diff) |
powerpc/kexec: Move kexec files into a dedicated subdir.
arch/powerpc/kernel/ contains 8 files dedicated to kexec.
Move them into a dedicated subdirectory.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
[mpe: Move to a/p/kexec, drop the 'machine' naming and use 'core' instead]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/afbef97ec6a978574a5cf91a4441000e0a9da42a.1572351221.git.christophe.leroy@c-s.fr
Diffstat (limited to 'arch/powerpc/kexec')
-rw-r--r-- | arch/powerpc/kexec/Makefile | 25 | ||||
-rw-r--r-- | arch/powerpc/kexec/core.c | 280 | ||||
-rw-r--r-- | arch/powerpc/kexec/core_32.c | 69 | ||||
-rw-r--r-- | arch/powerpc/kexec/core_64.c | 417 | ||||
-rw-r--r-- | arch/powerpc/kexec/crash.c | 374 | ||||
-rw-r--r-- | arch/powerpc/kexec/elf_64.c | 125 | ||||
-rw-r--r-- | arch/powerpc/kexec/file_load.c | 254 | ||||
-rw-r--r-- | arch/powerpc/kexec/ima.c | 219 | ||||
-rw-r--r-- | arch/powerpc/kexec/relocate_32.S | 500 |
9 files changed, 2263 insertions, 0 deletions
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile new file mode 100644 index 000000000000..16c1c5a19519 --- /dev/null +++ b/arch/powerpc/kexec/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +# Disable clang warning for using setjmp without setjmp.h header +CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) + +obj-y += core.o crash.o core_$(BITS).o + +obj-$(CONFIG_PPC32) += relocate_32.o + +obj-$(CONFIG_KEXEC_FILE) += file_load.o elf_$(BITS).o + +ifdef CONFIG_HAVE_IMA_KEXEC +ifdef CONFIG_IMA +obj-y += ima.o +endif +endif + + +# Disable GCOV, KCOV & sanitizers in odd or sensitive code +GCOV_PROFILE_core_$(BITS).o := n +KCOV_INSTRUMENT_core_$(BITS).o := n +UBSAN_SANITIZE_core_$(BITS).o := n diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c new file mode 100644 index 000000000000..078fe3d76feb --- /dev/null +++ b/arch/powerpc/kexec/core.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Code to handle transition of Linux booting another kernel. + * + * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> + * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz + * Copyright (C) 2005 IBM Corporation. + */ + +#include <linux/kexec.h> +#include <linux/reboot.h> +#include <linux/threads.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/irq.h> +#include <linux/ftrace.h> + +#include <asm/kdump.h> +#include <asm/machdep.h> +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/sections.h> + +void machine_kexec_mask_interrupts(void) { + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + + chip = irq_desc_get_chip(desc); + if (!chip) + continue; + + if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_mask) + chip->irq_mask(&desc->irq_data); + + if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) + chip->irq_disable(&desc->irq_data); + } +} + +void machine_crash_shutdown(struct pt_regs *regs) +{ + default_machine_crash_shutdown(regs); +} + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + */ +int machine_kexec_prepare(struct kimage *image) +{ + if (ppc_md.machine_kexec_prepare) + return ppc_md.machine_kexec_prepare(image); + else + return default_machine_kexec_prepare(image); +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + int save_ftrace_enabled; + + save_ftrace_enabled = __ftrace_enabled_save(); + this_cpu_disable_ftrace(); + + if (ppc_md.machine_kexec) + ppc_md.machine_kexec(image); + else + default_machine_kexec(image); + + this_cpu_enable_ftrace(); + __ftrace_enabled_restore(save_ftrace_enabled); + + /* Fall back to normal restart if we're still alive. */ + machine_restart(NULL); + for(;;); +} + +void __init reserve_crashkernel(void) +{ + unsigned long long crash_size, crash_base; + int ret; + + /* use common parsing */ + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + } + + if (crashk_res.end == crashk_res.start) { + crashk_res.start = crashk_res.end = 0; + return; + } + + /* We might have got these values via the command line or the + * device tree, either way sanitise them now. */ + + crash_size = resource_size(&crashk_res); + +#ifndef CONFIG_NONSTATIC_KERNEL + if (crashk_res.start != KDUMP_KERNELBASE) + printk("Crash kernel location must be 0x%x\n", + KDUMP_KERNELBASE); + + crashk_res.start = KDUMP_KERNELBASE; +#else + if (!crashk_res.start) { +#ifdef CONFIG_PPC64 + /* + * On 64bit we split the RMO in half but cap it at half of + * a small SLB (128MB) since the crash kernel needs to place + * itself and some stacks to be in the first segment. + */ + crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); +#else + crashk_res.start = KDUMP_KERNELBASE; +#endif + } + + crash_base = PAGE_ALIGN(crashk_res.start); + if (crash_base != crashk_res.start) { + printk("Crash kernel base must be aligned to 0x%lx\n", + PAGE_SIZE); + crashk_res.start = crash_base; + } + +#endif + crash_size = PAGE_ALIGN(crash_size); + crashk_res.end = crashk_res.start + crash_size - 1; + + /* The crash region must not overlap the current kernel */ + if (overlaps_crashkernel(__pa(_stext), _end - _stext)) { + printk(KERN_WARNING + "Crash kernel can not overlap current kernel\n"); + crashk_res.start = crashk_res.end = 0; + return; + } + + /* Crash kernel trumps memory limit */ + if (memory_limit && memory_limit <= crashk_res.end) { + memory_limit = crashk_res.end + 1; + printk("Adjusted memory limit for crashkernel, now 0x%llx\n", + memory_limit); + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crashk_res.start >> 20), + (unsigned long)(memblock_phys_mem_size() >> 20)); + + if (!memblock_is_region_memory(crashk_res.start, crash_size) || + memblock_reserve(crashk_res.start, crash_size)) { + pr_err("Failed to reserve memory for crashkernel!\n"); + crashk_res.start = crashk_res.end = 0; + return; + } +} + +int overlaps_crashkernel(unsigned long start, unsigned long size) +{ + return (start + size) > crashk_res.start && start <= crashk_res.end; +} + +/* Values we need to export to the second kernel via the device tree. */ +static phys_addr_t kernel_end; +static phys_addr_t crashk_base; +static phys_addr_t crashk_size; +static unsigned long long mem_limit; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(phys_addr_t), + .value = &kernel_end, +}; + +static struct property crashk_base_prop = { + .name = "linux,crashkernel-base", + .length = sizeof(phys_addr_t), + .value = &crashk_base +}; + +static struct property crashk_size_prop = { + .name = "linux,crashkernel-size", + .length = sizeof(phys_addr_t), + .value = &crashk_size, +}; + +static struct property memory_limit_prop = { + .name = "linux,memory-limit", + .length = sizeof(unsigned long long), + .value = &mem_limit, +}; + +#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) + +static void __init export_crashk_values(struct device_node *node) +{ + /* There might be existing crash kernel properties, but we can't + * be sure what's in them, so remove them. */ + of_remove_property(node, of_find_property(node, + "linux,crashkernel-base", NULL)); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-size", NULL)); + + if (crashk_res.start != 0) { + crashk_base = cpu_to_be_ulong(crashk_res.start), + of_add_property(node, &crashk_base_prop); + crashk_size = cpu_to_be_ulong(resource_size(&crashk_res)); + of_add_property(node, &crashk_size_prop); + } + + /* + * memory_limit is required by the kexec-tools to limit the + * crash regions to the actual memory used. + */ + mem_limit = cpu_to_be_ulong(memory_limit); + of_update_property(node, &memory_limit_prop); +} + +static int __init kexec_setup(void) +{ + struct device_node *node; + + node = of_find_node_by_path("/chosen"); + if (!node) + return -ENOENT; + + /* remove any stale properties so ours can be found */ + of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); + + /* information needed by userspace when using default_machine_kexec */ + kernel_end = cpu_to_be_ulong(__pa(_end)); + of_add_property(node, &kernel_end_prop); + + export_crashk_values(node); + + of_node_put(node); + return 0; +} +late_initcall(kexec_setup); diff --git a/arch/powerpc/kexec/core_32.c b/arch/powerpc/kexec/core_32.c new file mode 100644 index 000000000000..bf9f1f906d64 --- /dev/null +++ b/arch/powerpc/kexec/core_32.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PPC32 code to handle Linux booting another kernel. + * + * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> + * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz + * Copyright (C) 2005 IBM Corporation. + */ + +#include <linux/kexec.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <asm/cacheflush.h> +#include <asm/hw_irq.h> +#include <asm/io.h> + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address) __noreturn; + +/* + * This is a generic machine_kexec function suitable at least for + * non-OpenFirmware embedded platforms. + * It merely copies the image relocation code to the control page and + * jumps to it. + * A platform specific function may just call this one. + */ +void default_machine_kexec(struct kimage *image) +{ + extern const unsigned int relocate_new_kernel_size; + unsigned long page_list; + unsigned long reboot_code_buffer, reboot_code_buffer_phys; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* mask each interrupt so we are in a more sane state for the + * kexec kernel */ + machine_kexec_mask_interrupts(); + + page_list = image->head; + + /* we need both effective and real address here */ + reboot_code_buffer = + (unsigned long)page_address(image->control_code_page); + reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer); + + /* copy our kernel relocation code to the control code page */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, + relocate_new_kernel_size); + + flush_icache_range(reboot_code_buffer, + reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); + printk(KERN_INFO "Bye!\n"); + + if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x)) + relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(page_list, reboot_code_buffer_phys, image->start); +} + +int default_machine_kexec_prepare(struct kimage *image) +{ + return 0; +} diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c new file mode 100644 index 000000000000..04a7cba58eff --- /dev/null +++ b/arch/powerpc/kexec/core_64.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PPC64 code to handle Linux booting another kernel. + * + * Copyright (C) 2004-2005, IBM Corp. + * + * Created by: Milton D Miller II + */ + + +#include <linux/kexec.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/init_task.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/hardirq.h> + +#include <asm/page.h> +#include <asm/current.h> +#include <asm/machdep.h> +#include <asm/cacheflush.h> +#include <asm/firmware.h> +#include <asm/paca.h> +#include <asm/mmu.h> +#include <asm/sections.h> /* _end */ +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/hw_breakpoint.h> +#include <asm/asm-prototypes.h> +#include <asm/svm.h> +#include <asm/ultravisor.h> + +int default_machine_kexec_prepare(struct kimage *image) +{ + int i; + unsigned long begin, end; /* limits of segment */ + unsigned long low, high; /* limits of blocked memory range */ + struct device_node *node; + const unsigned long *basep; + const unsigned int *sizep; + + /* + * Since we use the kernel fault handlers and paging code to + * handle the virtual mode, we must make sure no destination + * overlaps kernel static data or bss. + */ + for (i = 0; i < image->nr_segments; i++) + if (image->segment[i].mem < __pa(_end)) + return -ETXTBSY; + + /* We also should not overwrite the tce tables */ + for_each_node_by_type(node, "pci") { + basep = of_get_property(node, "linux,tce-base", NULL); + sizep = of_get_property(node, "linux,tce-size", NULL); + if (basep == NULL || sizep == NULL) + continue; + + low = *basep; + high = low + (*sizep); + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + return 0; +} + +static void copy_segments(unsigned long ind) +{ + unsigned long entry; + unsigned long *ptr; + void *dest; + void *addr; + + /* + * We rely on kexec_load to create a lists that properly + * initializes these pointers before they are used. + * We will still crash if the list is wrong, but at least + * the compiler will be quiet. + */ + ptr = NULL; + dest = NULL; + + for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { + addr = __va(entry & PAGE_MASK); + + switch (entry & IND_FLAGS) { + case IND_DESTINATION: + dest = addr; + break; + case IND_INDIRECTION: + ptr = addr; + break; + case IND_SOURCE: + copy_page(dest, addr); + dest += PAGE_SIZE; + } + } +} + +void kexec_copy_flush(struct kimage *image) +{ + long i, nr_segments = image->nr_segments; + struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; + + /* save the ranges on the stack to efficiently flush the icache */ + memcpy(ranges, image->segment, sizeof(ranges)); + + /* + * After this call we may not use anything allocated in dynamic + * memory, including *image. + * + * Only globals and the stack are allowed. + */ + copy_segments(image->head); + + /* + * we need to clear the icache for all dest pages sometime, + * including ones that were in place on the original copy + */ + for (i = 0; i < nr_segments; i++) + flush_icache_range((unsigned long)__va(ranges[i].mem), + (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); +} + +#ifdef CONFIG_SMP + +static int kexec_all_irq_disabled = 0; + +static void kexec_smp_down(void *arg) +{ + local_irq_disable(); + hard_irq_disable(); + + mb(); /* make sure our irqs are disabled before we say they are */ + get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; + while(kexec_all_irq_disabled == 0) + cpu_relax(); + mb(); /* make sure all irqs are disabled before this */ + hw_breakpoint_disable(); + /* + * Now every CPU has IRQs off, we can clear out any pending + * IPIs and be sure that no more will come in after this. + */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 1); + + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void kexec_prepare_cpus_wait(int wait_state) +{ + int my_cpu, i, notified=-1; + + hw_breakpoint_disable(); + my_cpu = get_cpu(); + /* Make sure each CPU has at least made it to the state we need. + * + * FIXME: There is a (slim) chance of a problem if not all of the CPUs + * are correctly onlined. If somehow we start a CPU on boot with RTAS + * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in + * time, the boot CPU will timeout. If it does eventually execute + * stuff, the secondary will start up (paca_ptrs[]->cpu_start was + * written) and get into a peculiar state. + * If the platform supports smp_ops->take_timebase(), the secondary CPU + * will probably be spinning in there. If not (i.e. pseries), the + * secondary will continue on and try to online itself/idle/etc. If it + * survives that, we need to find these + * possible-but-not-online-but-should-be CPUs and chaperone them into + * kexec_smp_wait(). + */ + for_each_online_cpu(i) { + if (i == my_cpu) + continue; + + while (paca_ptrs[i]->kexec_state < wait_state) { + barrier(); + if (i != notified) { + printk(KERN_INFO "kexec: waiting for cpu %d " + "(physical %d) to enter %i state\n", + i, paca_ptrs[i]->hw_cpu_id, wait_state); + notified = i; + } + } + } + mb(); +} + +/* + * We need to make sure each present CPU is online. The next kernel will scan + * the device tree and assume primary threads are online and query secondary + * threads via RTAS to online them if required. If we don't online primary + * threads, they will be stuck. However, we also online secondary threads as we + * may be using 'cede offline'. In this case RTAS doesn't see the secondary + * threads as offline -- and again, these CPUs will be stuck. + * + * So, we online all CPUs that should be running, including secondary threads. + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + printk(KERN_INFO "kexec: Waking offline cpu %d.\n", + cpu); + WARN_ON(cpu_up(cpu)); + } + } +} + +static void kexec_prepare_cpus(void) +{ + wake_offline_cpus(); + smp_call_function(kexec_smp_down, NULL, /* wait */0); + local_irq_disable(); + hard_irq_disable(); + + mb(); /* make sure IRQs are disabled before we say they are */ + get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; + + kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF); + /* we are sure every CPU has IRQs off at this point */ + kexec_all_irq_disabled = 1; + + /* + * Before removing MMU mappings make sure all CPUs have entered real + * mode: + */ + kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE); + + /* after we tell the others to go down */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + + put_cpu(); +} + +#else /* ! SMP */ + +static void kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + * + * We need to release the cpus if we are ever going from an + * UP to an SMP kernel. + */ + smp_release_cpus(); + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + local_irq_disable(); + hard_irq_disable(); +} + +#endif /* SMP */ + +/* + * kexec thread structure and stack. + * + * We need to make sure that this is 16384-byte aligned due to the + * way process stacks are handled. It also must be statically allocated + * or allocated as part of the kimage, because everything else may be + * overwritten when we copy the kexec image. We piggyback on the + * "init_task" linker section here to statically allocate a stack. + * + * We could use a smaller stack if we don't care about anything using + * current, but that audit has not been performed. + */ +static union thread_union kexec_stack __init_task_data = + { }; + +/* + * For similar reasons to the stack above, the kexecing CPU needs to be on a + * static PACA; we switch to kexec_paca. + */ +struct paca_struct kexec_paca; + +/* Our assembly helper, in misc_64.S */ +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void), + bool copy_with_mmu_off) __noreturn; + +/* too late to fail here */ +void default_machine_kexec(struct kimage *image) +{ + bool copy_with_mmu_off; + + /* prepare control code if any */ + + /* + * If the kexec boot is the normal one, need to shutdown other cpus + * into our wait loop and quiesce interrupts. + * Otherwise, in the case of crashed mode (crashing_cpu >= 0), + * stopping other CPUs and collecting their pt_regs is done before + * using debugger IPI. + */ + + if (!kdump_in_progress()) + kexec_prepare_cpus(); + + printk("kexec: Starting switchover sequence.\n"); + + /* switch to a staticly allocated stack. Based on irq stack code. + * We setup preempt_count to avoid using VMX in memcpy. + * XXX: the task struct will likely be invalid once we do the copy! + */ + current_thread_info()->flags = 0; + current_thread_info()->preempt_count = HARDIRQ_OFFSET; + + /* We need a static PACA, too; copy this CPU's PACA over and switch to + * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using + * non-static data. + */ + memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct)); + kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL; +#ifdef CONFIG_PPC_PSERIES + kexec_paca.lppaca_ptr = NULL; +#endif + + if (is_secure_guest() && !(image->preserve_context || + image->type == KEXEC_TYPE_CRASH)) { + uv_unshare_all_pages(); + printk("kexec: Unshared all shared pages.\n"); + } + + paca_ptrs[kexec_paca.paca_index] = &kexec_paca; + + setup_paca(&kexec_paca); + + /* + * The lppaca should be unregistered at this point so the HV won't + * touch it. In the case of a crash, none of the lppacas are + * unregistered so there is not much we can do about it here. + */ + + /* + * On Book3S, the copy must happen with the MMU off if we are either + * using Radix page tables or we are not in an LPAR since we can + * overwrite the page tables while copying. + * + * In an LPAR, we keep the MMU on otherwise we can't access beyond + * the RMA. On BookE there is no real MMU off mode, so we have to + * keep it enabled as well (but then we have bolted TLB entries). + */ +#ifdef CONFIG_PPC_BOOK3E + copy_with_mmu_off = false; +#else + copy_with_mmu_off = radix_enabled() || + !(firmware_has_feature(FW_FEATURE_LPAR) || + firmware_has_feature(FW_FEATURE_PS3_LV1)); +#endif + + /* Some things are best done in assembly. Finding globals with + * a toc is easier in C, so pass in what we can. + */ + kexec_sequence(&kexec_stack, image->start, image, + page_address(image->control_code_page), + mmu_cleanup_all, copy_with_mmu_off); + /* NOTREACHED */ +} + +#ifdef CONFIG_PPC_BOOK3S_64 +/* Values we need to export to the second kernel via the device tree. */ +static unsigned long htab_base; +static unsigned long htab_size; + +static struct property htab_base_prop = { + .name = "linux,htab-base", + .length = sizeof(unsigned long), + .value = &htab_base, +}; + +static struct property htab_size_prop = { + .name = "linux,htab-size", + .length = sizeof(unsigned long), + .value = &htab_size, +}; + +static int __init export_htab_values(void) +{ + struct device_node *node; + + /* On machines with no htab htab_address is NULL */ + if (!htab_address) + return -ENODEV; + + node = of_find_node_by_path("/chosen"); + if (!node) + return -ENODEV; + + /* remove any stale propertys so ours can be found */ + of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); + of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); + + htab_base = cpu_to_be64(__pa(htab_address)); + of_add_property(node, &htab_base_prop); + htab_size = cpu_to_be64(htab_size_bytes); + of_add_property(node, &htab_size_prop); + + of_node_put(node); + return 0; +} +late_initcall(export_htab_values); +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c new file mode 100644 index 000000000000..d488311efab1 --- /dev/null +++ b/arch/powerpc/kexec/crash.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific (PPC64) functions for kexec based crash dumps. + * + * Copyright (C) 2005, IBM Corp. + * + * Created by: Haren Myneni + */ + +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/export.h> +#include <linux/crash_dump.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/machdep.h> +#include <asm/kexec.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/setjmp.h> +#include <asm/debug.h> + +/* + * The primary CPU waits a while for all secondary CPUs to enter. This is to + * avoid sending an IPI if the secondary CPUs are entering + * crash_kexec_secondary on their own (eg via a system reset). + * + * The secondary timeout has to be longer than the primary. Both timeouts are + * in milliseconds. + */ +#define PRIMARY_TIMEOUT 500 +#define SECONDARY_TIMEOUT 1000 + +#define IPI_TIMEOUT 10000 +#define REAL_MODE_TIMEOUT 10000 + +static int time_to_dump; +/* + * crash_wake_offline should be set to 1 by platforms that intend to wake + * up offline cpus prior to jumping to a kdump kernel. Currently powernv + * sets it to 1, since we want to avoid things from happening when an + * offline CPU wakes up due to something like an HMI (malfunction error), + * which propagates to all threads. + */ +int crash_wake_offline; + +#define CRASH_HANDLER_MAX 3 +/* List of shutdown handles */ +static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX]; +static DEFINE_SPINLOCK(crash_handlers_lock); + +static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; +static int crash_shutdown_cpu = -1; + +static int handle_fault(struct pt_regs *regs) +{ + if (crash_shutdown_cpu == smp_processor_id()) + longjmp(crash_shutdown_buf, 1); + return 0; +} + +#ifdef CONFIG_SMP + +static atomic_t cpus_in_crash; +void crash_ipi_callback(struct pt_regs *regs) +{ + static cpumask_t cpus_state_saved = CPU_MASK_NONE; + + int cpu = smp_processor_id(); + + hard_irq_disable(); + if (!cpumask_test_cpu(cpu, &cpus_state_saved)) { + crash_save_cpu(regs, cpu); + cpumask_set_cpu(cpu, &cpus_state_saved); + } + + atomic_inc(&cpus_in_crash); + smp_mb__after_atomic(); + + /* + * Starting the kdump boot. + * This barrier is needed to make sure that all CPUs are stopped. + */ + while (!time_to_dump) + cpu_relax(); + + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(1, 1); + +#ifdef CONFIG_PPC64 + kexec_smp_wait(); +#else + for (;;); /* FIXME */ +#endif + + /* NOTREACHED */ +} + +static void crash_kexec_prepare_cpus(int cpu) +{ + unsigned int msecs; + unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ + int tries = 0; + int (*old_handler)(struct pt_regs *regs); + + printk(KERN_EMERG "Sending IPI to other CPUs\n"); + + if (crash_wake_offline) + ncpus = num_present_cpus() - 1; + + crash_send_ipi(crash_ipi_callback); + smp_wmb(); + +again: + /* + * FIXME: Until we will have the way to stop other CPUs reliably, + * the crash CPU will send an IPI and wait for other CPUs to + * respond. + */ + msecs = IPI_TIMEOUT; + while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0)) + mdelay(1); + + /* Would it be better to replace the trap vector here? */ + + if (atomic_read(&cpus_in_crash) >= ncpus) { + printk(KERN_EMERG "IPI complete\n"); + return; + } + + printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", + ncpus - atomic_read(&cpus_in_crash)); + + /* + * If we have a panic timeout set then we can't wait indefinitely + * for someone to activate system reset. We also give up on the + * second time through if system reset fail to work. + */ + if ((panic_timeout > 0) || (tries > 0)) + return; + + /* + * A system reset will cause all CPUs to take an 0x100 exception. + * The primary CPU returns here via setjmp, and the secondary + * CPUs reexecute the crash_kexec_secondary path. + */ + old_handler = __debugger; + __debugger = handle_fault; + crash_shutdown_cpu = smp_processor_id(); + + if (setjmp(crash_shutdown_buf) == 0) { + printk(KERN_EMERG "Activate system reset (dumprestart) " + "to stop other cpu(s)\n"); + + /* + * A system reset will force all CPUs to execute the + * crash code again. We need to reset cpus_in_crash so we + * wait for everyone to do this. + */ + atomic_set(&cpus_in_crash, 0); + smp_mb(); + + while (atomic_read(&cpus_in_crash) < ncpus) + cpu_relax(); + } + + crash_shutdown_cpu = -1; + __debugger = old_handler; + + tries++; + goto again; +} + +/* + * This function will be called by secondary cpus. + */ +void crash_kexec_secondary(struct pt_regs *regs) +{ + unsigned long flags; + int msecs = SECONDARY_TIMEOUT; + + local_irq_save(flags); + + /* Wait for the primary crash CPU to signal its progress */ + while (crashing_cpu < 0) { + if (--msecs < 0) { + /* No response, kdump image may not have been loaded */ + local_irq_restore(flags); + return; + } + + mdelay(1); + } + + crash_ipi_callback(regs); +} + +#else /* ! CONFIG_SMP */ + +static void crash_kexec_prepare_cpus(int cpu) +{ + /* + * move the secondaries to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + */ +#ifdef CONFIG_PPC64 + smp_release_cpus(); +#else + /* FIXME */ +#endif +} + +void crash_kexec_secondary(struct pt_regs *regs) +{ +} +#endif /* CONFIG_SMP */ + +/* wait for all the CPUs to hit real mode but timeout if they don't come in */ +#if defined(CONFIG_SMP) && defined(CONFIG_PPC64) +static void __maybe_unused crash_kexec_wait_realmode(int cpu) +{ + unsigned int msecs; + int i; + + msecs = REAL_MODE_TIMEOUT; + for (i=0; i < nr_cpu_ids && msecs > 0; i++) { + if (i == cpu) + continue; + + while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) { + barrier(); + if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0)) + break; + msecs--; + mdelay(1); + } + } + mb(); +} +#else +static inline void crash_kexec_wait_realmode(int cpu) {} +#endif /* CONFIG_SMP && CONFIG_PPC64 */ + +/* + * Register a function to be called on shutdown. Only use this if you + * can't reset your device in the second kernel. + */ +int crash_shutdown_register(crash_shutdown_t handler) +{ + unsigned int i, rc; + + spin_lock(&crash_handlers_lock); + for (i = 0 ; i < CRASH_HANDLER_MAX; i++) + if (!crash_shutdown_handles[i]) { + /* Insert handle at first empty entry */ + crash_shutdown_handles[i] = handler; + rc = 0; + break; + } + + if (i == CRASH_HANDLER_MAX) { + printk(KERN_ERR "Crash shutdown handles full, " + "not registered.\n"); + rc = 1; + } + + spin_unlock(&crash_handlers_lock); + return rc; +} |