Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman: "Highlights include: - Larger virtual address space on 64-bit server CPUs. By default we use a 128TB virtual address space, but a process can request access to the full 512TB by passing a hint to mmap(). - Support for the new Power9 "XIVE" interrupt controller. - TLB flushing optimisations for the radix MMU on Power9. - Support for CAPI cards on Power9, using the "Coherent Accelerator Interface Architecture 2.0". - The ability to configure the mmap randomisation limits at build and runtime. - Several small fixes and cleanups to the kprobes code, as well as support for KPROBES_ON_FTRACE. - Major improvements to handling of system reset interrupts, correctly treating them as NMIs, giving them a dedicated stack and using a new hypervisor call to trigger them, all of which should aid debugging and robustness. - Many fixes and other minor enhancements. Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anshuman Khandual, Anton Blanchard, Balbir Singh, Ben Hutchings, Benjamin Herrenschmidt, Bhupesh Sharma, Chris Packham, Christian Zigotzky, Christophe Leroy, Christophe Lombard, Daniel Axtens, David Gibson, Gautham R. Shenoy, Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli, Hamish Martin, Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan, Mahesh J Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell Currey, Sukadev Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C. Harding, Tyrel Datwyler, Uma Krishnan, Vaibhav Jain, Vipin K Parashar, Yang Shi"
* tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (214 commits) powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it powerpc/powernv: Fix TCE kill on NVLink2 powerpc/mm/radix: Drop support for CPUs without lockless tlbie powerpc/book3s/mce: Move add_taint() later in virtual mode powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function body powerpc/smp: Document irq enable/disable after migrating IRQs powerpc/mpc52xx: Don't select user-visible RTAS_PROC powerpc/powernv: Document cxl dependency on special case in pnv_eeh_reset() powerpc/eeh: Clean up and document event handling functions powerpc/eeh: Avoid use after free in eeh_handle_special_event() cxl: Mask slice error interrupts after first occurrence cxl: Route eeh events to all drivers in cxl_pci_error_detected() cxl: Force context lock during EEH flow powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST powerpc/xmon: Teach xmon oops about radix vectors powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids powerpc/pseries: Enable VFIO powerpc/powernv: Fix iommu table size calculation hook for small tables powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc powerpc: Add arch/powerpc/tools directory ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-05 11:36:44 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-05 11:36:44 -0700
commit: 7246f60068840847bdcf595be5f0b5ca632736e0 (patch)
tree: fd9a963a03c2655f3ba9d1ced3c87a2775f5b166 /arch/powerpc/mm
parent: e579dde654fc2c6b0d3e4b77a9a4b2d2405c510e (diff)
parent: 700b7eadd5625d22b8235fb21259b3d7d564c000 (diff)
20 files changed, 604 insertions, 322 deletions
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index d979709a0239..c6b900f54c07 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st)
 	unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
 
 	for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
-			memblock_phys_mem_size(); addr += psize)
+			memblock_end_of_DRAM(); addr += psize)
 		hpte_find(st, addr, mmu_linear_psize);
 }
 
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..d659345a98d6 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -26,6 +26,10 @@
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 
+#ifdef CONFIG_PPC32
+#define KERN_VIRT_START	0
+#endif
+
 /*
  * To visualise what is happening,
  *
@@ -56,6 +60,8 @@ struct pg_state {
 	struct seq_file *seq;
 	const struct addr_marker *marker;
 	unsigned long start_address;
+	unsigned long start_pa;
+	unsigned long last_pa;
 	unsigned int level;
 	u64 current_flags;
 };
@@ -69,6 +75,7 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"Start of kernel VM" },
 	{ 0,	"vmalloc() Area" },
 	{ 0,	"vmalloc() End" },
+#ifdef CONFIG_PPC64
 	{ 0,	"isa I/O start" },
 	{ 0,	"isa I/O end" },
 	{ 0,	"phb I/O start" },
@@ -76,6 +83,20 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"I/O remap start" },
 	{ 0,	"I/O remap end" },
 	{ 0,	"vmemmap start" },
+#else
+	{ 0,	"Early I/O remap start" },
+	{ 0,	"Early I/O remap end" },
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	{ 0,	"Consistent mem start" },
+	{ 0,	"Consistent mem end" },
+#endif
+#ifdef CONFIG_HIGHMEM
+	{ 0,	"Highmem PTEs start" },
+	{ 0,	"Highmem PTEs end" },
+#endif
+	{ 0,	"Fixmap start" },
+	{ 0,	"Fixmap end" },
+#endif
 	{ -1,	NULL },
 };
 
@@ -100,8 +121,13 @@ static const struct flag_info flag_array[] = {
 		.set	= "user",
 		.clear	= "    ",
 	}, {
+#if _PAGE_RO == 0
 		.mask	= _PAGE_RW,
 		.val	= _PAGE_RW,
+#else
+		.mask	= _PAGE_RO,
+		.val	= 0,
+#endif
 		.set	= "rw",
 		.clear	= "ro",
 	}, {
@@ -154,11 +180,24 @@ static const struct flag_info flag_array[] = {
 		.clear	= "             ",
 	}, {
 #endif
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_NO_CACHE,
 		.val	= _PAGE_NO_CACHE,
 		.set	= "no cache",
 		.clear	= "        ",
 	}, {
+#else
+		.mask	= _PAGE_NON_IDEMPOTENT,
+		.val	= _PAGE_NON_IDEMPOTENT,
+		.set	= "non-idempotent",
+		.clear	= "              ",
+	}, {
+		.mask	= _PAGE_TOLERANT,
+		.val	= _PAGE_TOLERANT,
+		.set	= "tolerant",
+		.clear	= "        ",
+	}, {
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= H_PAGE_BUSY,
 		.val	= H_PAGE_BUSY,
@@ -188,6 +227,10 @@ static const struct flag_info flag_array[] = {
 		.mask	= _PAGE_SPECIAL,
 		.val	= _PAGE_SPECIAL,
 		.set	= "special",
+	}, {
+		.mask	= _PAGE_SHARED,
+		.val	= _PAGE_SHARED,
+		.set	= "shared",
 	}
 };
 
@@ -252,7 +295,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 	const char *unit = units;
 	unsigned long delta;
 
-	seq_printf(st->seq, "0x%016lx-0x%016lx   ", st->start_address, addr-1);
+#ifdef CONFIG_PPC64
+	seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+	seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#else
+	seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
+	seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#endif
+
 	delta = (addr - st->start_address) >> 10;
 	/* Work out what appropriate unit to use */
 	while (!(delta & 1023) && unit[1]) {
@@ -267,11 +317,15 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	       unsigned int level, u64 val)
 {
 	u64 flag = val & pg_level[level].mask;
+	u64 pa = val & PTE_RPN_MASK;
+
 	/* At first no level is set */
 	if (!st->level) {
 		st->level = level;
 		st->current_flags = flag;
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	/*
 	 * Dump the section of virtual memory when:
@@ -279,9 +333,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	 *   - we change levels in the tree.
 	 *   - the address is in a different section of memory and is thus
 	 *   used for a different purpose, regardless of the flags.
+	 *   - the pa of this page is not adjacent to the last inspected page
 	 */
 	} else if (flag != st->current_flags || level != st->level ||
-		   addr >= st->marker[1].start_address) {
+		   addr >= st->marker[1].start_address ||
+		   pa != st->last_pa + PAGE_SIZE) {
 
 		/* Check the PTE flags */
 		if (st->current_flags) {
@@ -305,8 +361,12 @@ static void note_page(struct pg_state *st, unsigned long addr,
 			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 		}
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		st->current_flags = flag;
 		st->level = level;
+	} else {
+		st->last_pa = pa;
 	}
 }
 
@@ -377,20 +437,38 @@ static void walk_pagetables(struct pg_state *st)
 
 static void populate_markers(void)
 {
-	address_markers[0].start_address = PAGE_OFFSET;
-	address_markers[1].start_address = VMALLOC_START;
-	address_markers[2].start_address = VMALLOC_END;
-	address_markers[3].start_address = ISA_IO_BASE;
-	address_markers[4].start_address = ISA_IO_END;
-	address_markers[5].start_address = PHB_IO_BASE;
-	address_markers[6].start_address = PHB_IO_END;
-	address_markers[7].start_address = IOREMAP_BASE;
-	address_markers[8].start_address = IOREMAP_END;
+	int i = 0;
+
+	address_markers[i++].start_address = PAGE_OFFSET;
+	address_markers[i++].start_address = VMALLOC_START;
+	address_markers[i++].start_address = VMALLOC_END;
+#ifdef CONFIG_PPC64
+	address_markers[i++].start_address = ISA_IO_BASE;
+	address_markers[i++].start_address = ISA_IO_END;
+	address_markers[i++].start_address = PHB_IO_BASE;
+	address_markers[i++].start_address = PHB_IO_END;
+	address_markers[i++].start_address = IOREMAP_BASE;
+	address_markers[i++].start_address = IOREMAP_END;
 #ifdef CONFIG_PPC_STD_MMU_64
-	address_markers[9].start_address =  H_VMEMMAP_BASE;
+	address_markers[i++].start_address =  H_VMEMMAP_BASE;
 #else
-	address_markers[9].start_address =  VMEMMAP_BASE;
+	address_markers[i++].start_address =  VMEMMAP_BASE;
+#endif
+#else /* !CONFIG_PPC64 */
+	address_markers[i++].start_address = ioremap_bot;
+	address_markers[i++].start_address = IOREMAP_TOP;
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	address_markers[i++].start_address = IOREMAP_TOP;
+	address_markers[i++].start_address = IOREMAP_TOP +
+					     CONFIG_CONSISTENT_SIZE;
+#endif
+#ifdef CONFIG_HIGHMEM
+	address_markers[i++].start_address = PKMAP_BASE;
+	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
 #endif
+	address_markers[i++].start_address = FIXADDR_START;
+	address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
@@ -435,7 +513,7 @@ static int ptdump_init(void)
 
 	populate_markers();
 	build_pgtable_complete_mask();
-	debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL,
+	debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
 			NULL, &ptdump_fops);
 	return debugfs_file ? 0 : -ENOMEM;
 }
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51def8a515be..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 	siginfo_t info;
 	unsigned int lsb = 0;
 
-	up_read(&current->mm->mmap_sem);
-
 	if (!user_mode(regs))
 		return MM_FAULT_ERR(SIGBUS);
 
@@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 	 * continue the pagefault.
 	 */
 	if (fatal_signal_pending(current)) {
-		/*
-		 * If we have retry set, the mmap semaphore will have
-		 * alrady been released in __lock_page_or_retry(). Else
-		 * we release it now.
-		 */
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
 		/* Coming from kernel, we need to deal with uaccess fixups */
 		if (user_mode(regs))
 			return MM_FAULT_RETURN;
@@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 
 	/* Out of memory */
 	if (fault & VM_FAULT_OOM) {
-		up_read(&current->mm->mmap_sem);
-
 		/*
 		 * We ran out of memory, or some other thing happened to us that
 		 * made us unable to handle the page fault gracefully.
@@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (user_mode(regs))
+	if (!is_exec && user_mode(regs))
 		store_update_sp = store_updates_sp(regs);
 
 	if (user_mode(regs))
@@ -458,9 +447,30 @@ good_area:
 	 * the fault.
 	 */
 	fault = handle_mm_fault(vma, address, flags);
+
+	/*
+	 * Handle the retry right now, the mmap_sem has been released in that
+	 * case.
+	 */
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		/* We retry only once */
+		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+			/*
+			 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation.
+			 */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			if (!fatal_signal_pending(current))
+				goto retry;
+		}
+		/* We will enter mm_fault_error() below */
+	} else
+		up_read(&current->mm->mmap_sem);
+
 	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area;
+			goto bad_area_nosemaphore;
 		rc = mm_fault_error(regs, address, fault);
 		if (rc >= MM_FAULT_RETURN)
 			goto bail;
@@ -469,41 +479,29 @@ good_area:
 	}
 
 	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
+	 * Major/minor page fault accounting.
 	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			current->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
+	if (fault & VM_FAULT_MAJOR) {
+		current->maj_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+			      regs, address);
 #ifdef CONFIG_PPC_SMLPAR
-			if (firmware_has_feature(FW_FEATURE_CMO)) {
-				u32 page_ins;
-
-				preempt_disable();
-				page_ins = be32_to_cpu(get_lppaca()->page_ins);
-				page_ins += 1 << PAGE_FACTOR;
-				get_lppaca()->page_ins = cpu_to_be32(page_ins);
-				preempt_enable();
-			}
-#endif /* CONFIG_PPC_SMLPAR */
-		} else {
-			current->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			u32 page_ins;
+
+			preempt_disable();
+			page_ins = be32_to_cpu(get_lppaca()->page_ins);
+			page_ins += 1 << PAGE_FACTOR;
+			get_lppaca()->page_ins = cpu_to_be32(page_ins);
+			preempt_enable();
 		}
+#endif /* CONFIG_PPC_SMLPAR */
+	} else {
+		current->min_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+			      regs, address);
 	}
 
-	up_read(&mm->mmap_sem);
 	goto bail;
 
 bad_area:
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 09cc50c8dace..6f962e5cb5e1 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -31,10 +31,8 @@
 #ifdef CONFIG_SMP
 	.section .bss
 	.align	2
-	.globl mmu_hash_lock
 mmu_hash_lock:
 	.space	4
-EXPORT_SYMBOL(mmu_hash_lock)
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c554768b1fa2..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
 #include <linux/memblock.h>
 #include <linux/context_tracking.h>
 #include <linux/libfdt.h>
-#include <linux/debugfs.h>
 
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -927,11 +926,6 @@ static void __init htab_initialize(void)
 	}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-	/* On U3 based machines, we need to reserve the DART area and
-	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
-	 * cacheable later on
-	 */
-
 	/* create bolted the linear mapping in the hash table */
 	for_each_memblock(memory, reg) {
 		base = (unsigned long)__va(reg->base);
@@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void)
 
 void __init hash__early_init_mmu(void)
 {
+	/*
+	 * We have code in __hash_page_64K() and elsewhere, which assumes it can
+	 * do the following:
+	 *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+	 *
+	 * Where the slot number is between 0-15, and values of 8-15 indicate
+	 * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+	 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+	 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+	 * with a BUILD_BUG_ON().
+	 */
+	BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
+
 	htab_init_page_sizes();
 
 	/*
@@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 	copro_flush_all_slbs(mm);
 	if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
 
-		copy_mm_to_paca(&mm->context);
+		copy_mm_to_paca(mm);
 		slb_flush_and_rebolt();
 	}
 }
@@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
 {
 	if (user_region) {
 		if (psize != get_paca_psize(ea)) {
-			copy_mm_to_paca(&mm->context);
+			copy_mm_to_paca(mm);
 			slb_flush_and_rebolt();
 		}
 	} else if (get_paca()->vmalloc_sllp !=
@@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void)
 	return 0;
 }
 machine_device_initcall(pseries, hash64_debugfs);
-
 #endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 83a8be791e06..bfe4e8526b2d 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 
 	mm = vma->vm_mm;
 
-#ifdef CONFIG_PPC_MM_SLICES
-	psize = get_slice_psize(mm, ea);
-	tsize = mmu_get_tsize(psize);
-	shift = mmu_psize_defs[psize].shift;
-#else
 	psize = vma_mmu_pagesize(vma);
 	shift = __ilog2(psize);
 	tsize = shift - 10;
-#endif
-
 	/*
 	 * We can't be interrupted while we're setting up the MAS
 	 * regusters or after we've confirmed that no tlb exists.
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 35254a678456..6575b9aabef4 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct hstate *h = hstate_file(file);
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > TASK_SIZE)
+	if (len > mm->task_size)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
@@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mm->task_size - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	info.high_limit = current->mm->mmap_base;
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8c3389cbcd12..a4f33de4008e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size)
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * We need to make sure that for different page sizes reported by
+	 * firmware we only add hugetlb support for page sizes that can be
+	 * supported by linux page table layout.
+	 * For now we have
+	 * Radix: 2M
+	 * Hash: 16M and 16G
+	 */
+	if (radix_enabled()) {
+		if (mmu_psize != MMU_PAGE_2M)
+			return -EINVAL;
+	} else {
+		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
+			return -EINVAL;
+	}
+#endif
+
 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
 	/* Return if huge page size has already been setup */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c22f207aa656..ec84b31c6c86 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -71,10 +71,6 @@
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-
-#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
 phys_addr_t memstart_addr = ~0;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index a5d9ef59debe..9dbd2a733d6b 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void)
 
 unsigned long arch_mmap_rnd(void)
 {
-	unsigned long rnd;
+	unsigned long shift, rnd;
 
-	/* 8MB for 32bit, 1GB for 64bit */
+	shift = mmap_rnd_bits;
+#ifdef CONFIG_COMPAT
 	if (is_32bit_task())
-		rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
-	else
-		rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
+		shift = mmap_rnd_compat_bits;
+#endif
+	rnd = get_random_long() % (1ul << shift);
 
 	return rnd << PAGE_SHIFT;
 }
@@ -79,7 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+	return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
@@ -97,7 +98,11 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -106,7 +111,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -114,8 +119,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = TASK_SIZE;
 	info.align_mask = 0;
+
+	if (unlikely(addr > DEFAULT_MAP_WINDOW))
+		info.high_limit = mm->context.addr_limit;
+	else
+		info.high_limit = DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
 
@@ -131,8 +141,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -142,7 +156,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -152,7 +166,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	addr = vm_unmapped_area(&info);
+	if (!(addr & ~PAGE_MASK))
+		return addr;
+	VM_BUG_ON(addr != -ENOMEM);
 
 	/*
 	 * A failed mmap() very likely causes application failure,
@@ -160,15 +181,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
+	return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 }
 
 static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 73bf6e14c3aa..c6dca2ae78ef 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,17 +30,16 @@
 static DEFINE_SPINLOCK(mmu_context_lock);
 static DEFINE_IDA(mmu_context_ida);
 
-int __init_new_context(void)
+static int alloc_context_id(int min_id, int max_id)
 {
-	int index;
-	int err;
+	int index, err;
 
 again:
 	if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
 		return -ENOMEM;
 
 	spin_lock(&mmu_context_lock);
-	err = ida_get_new_above(&mmu_context_ida, 1, &index);
+	err = ida_get_new_above(&mmu_context_ida, min_id, &index);
 	spin_unlock(&mmu_context_lock);
 
 	if (err == -EAGAIN)
@@ -48,7 +47,7 @@ again:
 	else if (err)
 		return err;
 
-	if (index > MAX_USER_CONTEXT) {
+	if (index > max_id) {
 		spin_lock(&mmu_context_lock);
 		ida_remove(&mmu_context_ida, index);
 		spin_unlock(&mmu_context_lock);
@@ -57,48 +56,105 @@ again:
 
 	return index;
 }
-EXPORT_SYMBOL_GPL(__init_new_context);
-static int radix__init_new_context(struct mm_struct *mm, int index)
+
+void hash__reserve_context_id(int id)
+{
+	int rc, result = 0;
+
+	do {
+		if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+			break;
+
+		spin_lock(&mmu_context_lock);
+		rc = ida_get_new_above(&mmu_context_ida, id, &result);
+		spin_unlock(&mmu_context_lock);
+	} while (rc == -EAGAIN);
+
+	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+	unsigned long max;
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		max = MAX_USER_CONTEXT;
+	else
+		max = MAX_USER_CONTEXT_65BIT_VA;
+
+	return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+	int index;
+
+	index = hash__alloc_context_id();
+	if (index < 0)
+		return index;
+
+	/*
+	 * We do switch_slb() early in fork, even before we setup the
+	 * mm->context.addr_limit. Default to max task size so that we copy the
+	 * default values to paca which will help us to handle slb miss early.
+	 */
+	mm->context.addr_limit = TASK_SIZE_128TB;
+
+	/*
+	 * The old code would re-promote on fork, we don't do that when using
+	 * slices as it could cause problem promoting slices that have been
+	 * forced down to 4K.
+	 *
+	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+	 * explicitly against context.id == 0. This ensures that we properly
+	 * initialize context slice details for newly allocated mm's (which will
+	 * have id == 0) and don't alter context slice inherited via fork (which
+	 * will have id != 0).
+	 *
+	 * We should not be calling init_new_context() on init_mm. Hence a
+	 * check against 0 is OK.
+	 */
+	if (mm->context.id == 0)
+		slice_set_user_psize(mm, mmu_virtual_psize);
+
+	subpage_prot_init_new_context(mm);
+
+	return index;
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
+	int index;
+
+	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	if (index < 0)
+		return index;
 
 	/*
 	 * set the process table entry,
 	 */
 	rts_field = radix__get_tree_size();
 	process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
-	return 0;
+
+	mm->context.npu_context = NULL;
+
+	return index;
 }
 
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int index;
 
-	index = __init_new_context();
+	if (radix_enabled())
+		index = radix__init_new_context(mm);
+	else
+		index = hash__init_new_context(mm);
+
 	if (index < 0)
 		return index;
 
-	if (radix_enabled()) {
-		radix__init_new_context(mm, index);
-	} else {
-
-		/* The old code would re-promote on fork, we don't do that
-		 * when using slices as it could cause problem promoting slices
-		 * that have been forced down to 4K
-		 *
-		 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
-		 * explicitly against context.id == 0. This ensures that we
-		 * properly initialize context slice details for newly allocated
-		 * mm's (which will have id == 0) and don't alter context slice
-		 * inherited via fork (which will have id != 0).
-		 *
-		 * We should not be calling init_new_context() on init_mm. Hence a
-		 * check against 0 is ok.
-		 */
-		if (mm->context.id == 0)
-			slice_set_user_psize(mm, mmu_virtual_psize);
-		subpage_prot_init_new_context(mm);
-	}
 	mm->context.id = index;
 #ifdef CONFIG_PPC_ICSWX
 	mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..e0a2d8e806ed 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-05 11:36:44 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-05 11:36:44 -0700
commit	7246f60068840847bdcf595be5f0b5ca632736e0 (patch)
tree	fd9a963a03c2655f3ba9d1ced3c87a2775f5b166 /arch/powerpc/mm
parent	e579dde654fc2c6b0d3e4b77a9a4b2d2405c510e (diff)
parent	700b7eadd5625d22b8235fb21259b3d7d564c000 (diff)