author     Linus Torvalds <torvalds@linux-foundation.org>  2019-07-12 11:40:28 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-07-12 11:40:28 -0700
commit     ef8f3d48afd6a17a0dae8c277c2f539c2f19fd16 (patch)
tree       5c28f9f287a552ad1a655b9e29e5330966652e89 /mm
parent     d7d170a8e357bd9926cc6bfea5c2385c2eac65b2 (diff)
parent     2c207985f354dfb549e5a543102a3e084eea81f6 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
 "Am experimenting with splitting MM up into identifiable subsystems perhaps
  with a view to gitifying it in complex ways. Also with more verbose
  "incoming" emails.

  Most of MM is here and a few other trees.

  Subsystems affected by this patch series:
   - hotfixes
   - iommu
   - scripts
   - arch/sh
   - ocfs2
   - mm:slab-generic
   - mm:slub
   - mm:kmemleak
   - mm:kasan
   - mm:cleanups
   - mm:debug
   - mm:pagecache
   - mm:swap
   - mm:memcg
   - mm:gup
   - mm:pagemap
   - mm:infrastructure
   - mm:vmalloc
   - mm:initialization
   - mm:pagealloc
   - mm:vmscan
   - mm:tools
   - mm:proc
   - mm:ras
   - mm:oom-kill

  hotfixes:
   mm: vmscan: scan anonymous pages on file refaults
   mm/nvdimm: add is_ioremap_addr and use that to check ioremap address
   mm/memcontrol: fix wrong statistics in memory.stat
   mm/z3fold.c: lock z3fold page before __SetPageMovable()
   nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header
   MAINTAINERS: nilfs2: update email address

  iommu:
   include/linux/dmar.h: replace single-char identifiers in macros

  scripts:
   scripts/decode_stacktrace: match basepath using shell prefix operator, not regex
   scripts/decode_stacktrace: look for modules with .ko.debug extension
   scripts/spelling.txt: drop "sepc" from the misspelling list
   scripts/spelling.txt: add spelling fix for prohibited
   scripts/decode_stacktrace: Accept dash/underscore in modules
   scripts/spelling.txt: add more spellings to spelling.txt

  arch/sh:
   arch/sh/configs/sdk7786_defconfig: remove CONFIG_LOGFS
   sh: config: remove left-over BACKLIGHT_LCD_SUPPORT
   sh: prevent warnings when using iounmap

  ocfs2:
   fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat"
   ocfs2/dlm: use struct_size() helper
   ocfs2: add last unlock times in locking_state
   ocfs2: add locking filter debugfs file
   ocfs2: add first lock wait time in locking_state
   ocfs: no need to check return value of debugfs_create functions
   fs/ocfs2/dlmglue.c: unneeded variable: "status"
   ocfs2: use kmemdup rather than duplicating its implementation

  mm:slab-generic:
   Patch series "mm/slab: Improved sanity checking":
   mm/slab: validate cache membership under freelist hardening
   mm/slab: sanity-check page type when looking up cache
   lkdtm/heap: add tests for freelist hardening

  mm:slub:
   mm/slub.c: avoid double string traverse in kmem_cache_flags()
   slub: don't panic for memcg kmem cache creation failure

  mm:kmemleak:
   mm/kmemleak.c: fix check for softirq context
   mm/kmemleak.c: change error at _write when kmemleak is disabled
   docs: kmemleak: add more documentation details

  mm:kasan:
   mm/kasan: print frame description for stack bugs
   Patch series "Bitops instrumentation for KASAN", v5:
   lib/test_kasan: add bitops tests
   x86: use static_cpu_has in uaccess region to avoid instrumentation
   asm-generic, x86: add bitops instrumentation for KASAN
   Patch series "mm/kasan: Add object validation in ksize()", v3:
   mm/kasan: introduce __kasan_check_{read,write}
   mm/kasan: change kasan_check_{read,write} to return boolean
   lib/test_kasan: Add test for double-kzfree detection
   mm/slab: refactor common ksize KASAN logic into slab_common.c
   mm/kasan: add object validation in ksize()

  mm:cleanups:
   include/linux/pfn_t.h: remove pfn_t_to_virt()
   Patch series "remove ARCH_SELECT_MEMORY_MODEL where it has no effect":
   arm: remove ARCH_SELECT_MEMORY_MODEL
   s390: remove ARCH_SELECT_MEMORY_MODEL
   sparc: remove ARCH_SELECT_MEMORY_MODEL
   mm/gup.c: make follow_page_mask() static
   mm/memory.c: trivial clean up in insert_page()
   mm: make !CONFIG_HUGE_PAGE wrappers into static inlines
   include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info
   mm: remove the account_page_dirtied export
   mm/page_isolation.c: change the prototype of undo_isolate_page_range()
   include/linux/vmpressure.h: use spinlock_t instead of struct spinlock
   mm: remove the exporting of totalram_pages
   include/linux/pagemap.h: document trylock_page() return value

  mm:debug:
   mm/failslab.c: by default, do not fail allocations with direct reclaim only
   Patch series "debug_pagealloc improvements":
   mm, debug_pagelloc: use static keys to enable debugging
   mm, page_alloc: more extensive free page checking with debug_pagealloc
   mm, debug_pagealloc: use a page type instead of page_ext flag

  mm:pagecache:
   Patch series "fix filler_t callback type mismatches", v2:
   mm/filemap.c: fix an overly long line in read_cache_page
   mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page
   jffs2: pass the correct prototype to read_cache_page
   9p: pass the correct prototype to read_cache_page
   mm/filemap.c: correct the comment about VM_FAULT_RETRY

  mm:swap:
   mm, swap: fix race between swapoff and some swap operations
   mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device()
   mm, swap: use rbtree for swap_extent
   mm/mincore.c: fix race between swapoff and mincore

  mm:memcg:
   memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL
   memcg, fsnotify: no oom-kill for remote memcg charging
   mm, memcg: introduce memory.events.local
   mm: memcontrol: dump memory.stat during cgroup OOM
   Patch series "mm: reparent slab memory on cgroup removal", v7:
   mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache()
   mm: memcg/slab: rename slab delayed deactivation functions and fields
   mm: memcg/slab: generalize postponed non-root kmem_cache deactivation
   mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg()
   mm: memcg/slab: unify SLAB and SLUB page accounting
   mm: memcg/slab: don't check the dying flag on kmem_cache creation
   mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock
   mm: memcg/slab: rework non-root kmem_cache lifecycle management
   mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages
   mm: memcg/slab: reparent memcg kmem_caches on cgroup removal
   mm, memcg: add a memcg_slabinfo debugfs file

  mm:gup:
   Patch series "switch the remaining architectures to use generic GUP", v4:
   mm: use untagged_addr() for get_user_pages_fast addresses
   mm: simplify gup_fast_permitted
   mm: lift the x86_32 PAE version of gup_get_pte to common code
   MIPS: use the generic get_user_pages_fast code
   sh: add the missing pud_page definition
   sh: use the generic get_user_pages_fast code
   sparc64: add the missing pgd_page definition
   sparc64: define untagged_addr()
   sparc64: use the generic get_user_pages_fast code
   mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP
   mm: reorder code blocks in gup.c
   mm: consolidate the get_user_pages* implementations
   mm: validate get_user_pages_fast flags
   mm: move the powerpc hugepd code to mm/gup.c
   mm: switch gup_hugepte to use try_get_compound_head
   mm: mark the page referenced in gup_hugepte
   mm/gup: speed up check_and_migrate_cma_pages() on huge page
   mm/gup.c: remove some BUG_ONs from get_gate_page()
   mm/gup.c: mark undo_dev_pagemap as __maybe_unused

  mm:pagemap:
   asm-generic, x86: introduce generic pte_{alloc,free}_one[_kernel]
   alpha: switch to generic version of pte allocation
   arm: switch to generic version of pte allocation
   arm64: switch to generic version of pte allocation
   csky: switch to generic version of pte allocation
   m68k: sun3: switch to generic version of pte allocation
   mips: switch to generic version of pte allocation
   nds32: switch to generic version of pte allocation
   nios2: switch to generic version of pte allocation
   parisc: switch to generic version of pte allocation
   riscv: switch to generic version of pte allocation
   um: switch to generic version of pte allocation
   unicore32: switch to generic version of pte allocation
   mm/pgtable: drop pgtable_t variable from pte_fn_t functions
   mm/memory.c: fail when offset == num in first check of __vm_map_pages()

  mm:infrastructure:
   mm/mmu_notifier: use hlist_add_head_rcu()

  mm:vmalloc:
   Patch series "Some cleanups for the KVA/vmalloc", v5:
   mm/vmalloc.c: remove "node" argument
   mm/vmalloc.c: preload a CPU with one object for split purpose
   mm/vmalloc.c: get rid of one single unlink_va() when merge
   mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va()
   mm/vmalloc.c: spelling> s/informaion/information/

  mm:initialization:
   mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist
   mm/large system hash: clear hashdist when only one node with memory is booted

  mm:pagealloc:
   arm64: move jump_label_init() before parse_early_param()
   Patch series "add init_on_alloc/init_on_free boot options", v10:
   mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options
   mm: init: report memory auto-initialization features at boot time

  mm:vmscan:
   mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
   mm: vmscan: correct some vmscan counters for THP swapout

  mm:tools:
   tools/vm/slabinfo: order command line options
   tools/vm/slabinfo: add partial slab listing to -X
   tools/vm/slabinfo: add option to sort by partial slabs
   tools/vm/slabinfo: add sorting info to help menu

  mm:proc:
   proc: use down_read_killable mmap_sem for /proc/pid/maps
   proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
   proc: use down_read_killable mmap_sem for /proc/pid/pagemap
   proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
   proc: use down_read_killable mmap_sem for /proc/pid/map_files
   mm: use down_read_killable for locking mmap_sem in access_remote_vm
   mm: smaps: split PSS into components
   mm: vmalloc: show number of vmalloc pages in /proc/meminfo

  mm:ras:
   mm/memory-failure.c: clarify error message

  mm:oom-kill:
   mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
   mm, oom: refactor dump_tasks for memcg OOMs
   mm, oom: remove redundant task_in_mem_cgroup() check
   oom: decouple mems_allowed from oom_unkillable_task
   mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()"

* akpm: (147 commits)
  mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()
  oom: decouple mems_allowed from oom_unkillable_task
  mm, oom: remove redundant task_in_mem_cgroup() check
  mm, oom: refactor dump_tasks for memcg OOMs
  mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
  mm/memory-failure.c: clarify error message
  mm: vmalloc: show number of vmalloc pages in /proc/meminfo
  mm: smaps: split PSS into components
  mm: use down_read_killable for locking mmap_sem in access_remote_vm
  proc: use down_read_killable mmap_sem for /proc/pid/map_files
  proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
  proc: use down_read_killable mmap_sem for /proc/pid/pagemap
  proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
  proc: use down_read_killable mmap_sem for /proc/pid/maps
  tools/vm/slabinfo: add sorting info to help menu
  tools/vm/slabinfo: add option to sort by partial slabs
  tools/vm/slabinfo: add partial slab listing to -X
  tools/vm/slabinfo: order command line options
  mm: vmscan: correct some vmscan counters for THP swapout
  mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
  ...
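The mm:proc entries and "mm: use down_read_killable for locking mmap_sem in access_remote_vm" above all apply one pattern: take mmap_sem with the killable primitive so a reader stuck behind a writer can be terminated by a fatal signal. A minimal sketch of that pattern; example_walk_vmas() and its loop body are illustrative, only down_read_killable()/up_read() come from the kernel API.

#include <linux/mm_types.h>
#include <linux/rwsem.h>
#include <linux/errno.h>

static int example_walk_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	if (down_read_killable(&mm->mmap_sem))
		return -EINTR;		/* caller sees an error instead of hanging */

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		/* ... emit one record per VMA, as the /proc files do ... */
	}

	up_read(&mm->mmap_sem);
	return 0;
}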
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  16
-rw-r--r--  mm/Kconfig.debug  14
-rw-r--r--  mm/Makefile  4
-rw-r--r--  mm/dmapool.c  4
-rw-r--r--  mm/failslab.c  3
-rw-r--r--  mm/filemap.c  19
-rw-r--r--  mm/gup.c  674
-rw-r--r--  mm/kasan/common.c  14
-rw-r--r--  mm/kasan/generic.c  13
-rw-r--r--  mm/kasan/kasan.h  15
-rw-r--r--  mm/kasan/report.c  165
-rw-r--r--  mm/kasan/tags.c  12
-rw-r--r--  mm/kmemleak.c  4
-rw-r--r--  mm/list_lru.c  3
-rw-r--r--  mm/memcontrol.c  461
-rw-r--r--  mm/memory-failure.c  2
-rw-r--r--  mm/memory.c  15
-rw-r--r--  mm/mincore.c  12
-rw-r--r--  mm/mmu_notifier.c  2
-rw-r--r--  mm/nommu.c  91
-rw-r--r--  mm/oom_kill.c  131
-rw-r--r--  mm/page-writeback.c  1
-rw-r--r--  mm/page_alloc.c  234
-rw-r--r--  mm/page_ext.c  3
-rw-r--r--  mm/page_io.c  2
-rw-r--r--  mm/page_isolation.c  3
-rw-r--r--  mm/slab.c  79
-rw-r--r--  mm/slab.h  199
-rw-r--r--  mm/slab_common.c  269
-rw-r--r--  mm/slob.c  4
-rw-r--r--  mm/slub.c  86
-rw-r--r--  mm/swap_state.c  49
-rw-r--r--  mm/swapfile.c  291
-rw-r--r--  mm/util.c  47
-rw-r--r--  mm/vmalloc.c  108
-rw-r--r--  mm/vmscan.c  74
-rw-r--r--  mm/z3fold.c  12
37 files changed, 1978 insertions, 1157 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ef6efedc5921..0b4352557dd5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,7 +132,8 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_GUP
+config HAVE_FAST_GUP
+ depends on MMU
bool
config ARCH_KEEP_MEMBLOCK
@@ -762,7 +763,20 @@ config GUP_BENCHMARK
See tools/testing/selftests/vm/gup_benchmark.c
+config GUP_GET_PTE_LOW_HIGH
+ bool
+
config ARCH_HAS_PTE_SPECIAL
bool
+#
+# Some architectures require a special hugepage directory format to
+# support multiple hugepage sizes. For example, a4fe3ce76
+# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
+# introduced it on powerpc, allowing for more flexible hugepage
+# pagetable layouts.
+#
+config ARCH_HAS_HUGEPD
+ bool
+
endmenu
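For context on the two new symbols: GUP_GET_PTE_LOW_HIGH is selected by 32-bit configurations whose 64-bit PTEs cannot be loaded atomically, so the generic fast-GUP path ("mm: lift the x86_32 PAE version of gup_get_pte to common code" above) reads the two halves and retries if the low word changed underneath. A hedged sketch of that idea, not copied from the patch; the pte_low/pte_high field names assume an x86-PAE-style two-word pte_t.

static inline pte_t example_gup_get_pte(pte_t *ptep)
{
	pte_t pte;

	do {
		pte.pte_low = ptep->pte_low;	/* read the low word first */
		smp_rmb();
		pte.pte_high = ptep->pte_high;	/* then the high word */
		smp_rmb();
		/* retry if the low word changed while we were reading */
	} while (unlikely(pte.pte_low != ptep->pte_low));

	return pte;
}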
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index fa6d79281368..82b6a20898bd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -12,19 +12,23 @@ config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
- select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
Depending on runtime enablement, this results in a small or large
slowdown, but helps to find certain types of memory corruption.
+ Also, the state of page tracking structures is checked more often as
+ pages are being allocated and freed, as unexpected state changes
+ often happen for the same reasons as memory corruption (e.g. double free,
+ use-after-free).
+
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
- the patterns before alloc_pages(). Additionally,
- this option cannot be enabled in combination with hibernation as
- that would result in incorrect warnings of memory corruption after
- a resume because free pages are not saved to the suspend image.
+ the patterns before alloc_pages(). Additionally, this option cannot
+ be enabled in combination with hibernation as that would result in
+ incorrect warnings of memory corruption after a resume because free
+ pages are not saved to the suspend image.
By default this option will have a small overhead, e.g. by not
allowing the kernel mapping to be backed by large pages on some
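The "small or large slowdown" depending on runtime enablement comes from the static-keys rework ("mm, debug_pagelloc: use static keys to enable debugging"): the checks sit behind a jump label that stays patched out unless the feature is turned on at boot. A rough sketch of that pattern; the key, helper, and parameter names here are illustrative, not the ones in the patch.

#include <linux/jump_label.h>
#include <linux/init.h>

DEFINE_STATIC_KEY_FALSE(example_debug_pagealloc_key);

static inline bool example_debug_pagealloc_enabled(void)
{
	/* compiles to a patched-out branch while the key is disabled */
	return static_branch_unlikely(&example_debug_pagealloc_key);
}

static int __init example_setup(char *str)
{
	/* flipped once during early boot from a command-line option */
	static_branch_enable(&example_debug_pagealloc_key);
	return 0;
}
early_param("example_debug_pagealloc", example_setup);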
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..dc0746ca1109 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,7 @@ KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
pgtable-generic.o rmap.o vmalloc.o
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o gup.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 8c94c89a6f7e..fe5d33060415 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -378,7 +378,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
#endif
spin_unlock_irqrestore(&pool->lock, flags);
- if (mem_flags & __GFP_ZERO)
+ if (want_init_on_alloc(mem_flags))
memset(retval, 0, pool->size);
return retval;
@@ -428,6 +428,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
offset = vaddr - page->vaddr;
+ if (want_init_on_free())
+ memset(vaddr, 0, pool->size);
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);
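This dmapool hunk is one consumer of the new init_on_alloc/init_on_free infrastructure from the mm:pagealloc series: zero on allocation when either the boot option or __GFP_ZERO asks for it, and scrub on free when init_on_free is enabled. A sketch of roughly what the helpers check, with stand-in static keys; the real helpers also coordinate with page poisoning, which is omitted here.

#include <linux/jump_label.h>
#include <linux/gfp.h>

DEFINE_STATIC_KEY_FALSE(example_init_on_alloc);	/* stands in for init_on_alloc */
DEFINE_STATIC_KEY_FALSE(example_init_on_free);	/* stands in for init_on_free */

static inline bool example_want_init_on_alloc(gfp_t flags)
{
	if (static_branch_unlikely(&example_init_on_alloc))
		return true;
	/* explicit __GFP_ZERO requests are still honoured */
	return flags & __GFP_ZERO;
}

static inline bool example_want_init_on_free(void)
{
	return static_branch_unlikely(&example_init_on_free);
}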
diff --git a/mm/failslab.c b/mm/failslab.c
index ec5aad211c5b..f92fed91ac23 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -23,7 +23,8 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (gfpflags & __GFP_NOFAIL)
return false;
- if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
+ if (failslab.ignore_gfp_reclaim &&
+ (gfpflags & __GFP_DIRECT_RECLAIM))
return false;
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
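The failslab change narrows the filter from __GFP_RECLAIM to __GFP_DIRECT_RECLAIM: GFP_NOWAIT and GFP_ATOMIC carry the kswapd-reclaim bit, so they matched the old test, yet they cannot enter direct reclaim and the injector should leave them alone. A small sketch of the distinction, with an illustrative helper name.

#include <linux/types.h>
#include <linux/gfp.h>

/* true only for allocations that may enter direct reclaim (e.g. GFP_KERNEL);
 * false for GFP_NOWAIT/GFP_ATOMIC, which only allow waking kswapd */
static bool example_can_direct_reclaim(gfp_t gfpflags)
{
	return !!(gfpflags & __GFP_DIRECT_RECLAIM);
}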
diff --git a/mm/filemap.c b/mm/filemap.c
index f1aa20ab8434..d0cf700bf201 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2504,10 +2504,8 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
*
* vma->vm_mm->mmap_sem must be held on entry.
*
- * If our return value has VM_FAULT_RETRY set, it's because
- * lock_page_or_retry() returned 0.
- * The mmap_sem has usually been released in this case.
- * See __lock_page_or_retry() for the exception.
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_sem
* has not been released.
@@ -2825,7 +2823,11 @@ repeat:
}
filler:
- err = filler(data, page);
+ if (filler)
+ err = filler(data, page);
+ else
+ err = mapping->a_ops->readpage(data, page);
+
if (err < 0) {
put_page(page);
return ERR_PTR(err);
@@ -2915,7 +2917,8 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *, struct page *),
void *data)
{
- return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+ return do_read_cache_page(mapping, index, filler, data,
+ mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
@@ -2936,9 +2939,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
gfp_t gfp)
{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
-
- return do_read_cache_page(mapping, index, filler, NULL, gfp);
+ return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
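With the filler_t fixes above, read_cache_page_gfp() no longer casts ->readpage and do_read_cache_page() falls back to ->readpage itself when no filler is given; filesystems that do pass a filler now use the proper int (*)(void *, struct page *) prototype. A hedged caller-side sketch; example_readpage_filler() and example_read_one() are illustrative, not from the patch.

#include <linux/fs.h>
#include <linux/pagemap.h>

static int example_readpage_filler(void *data, struct page *page)
{
	struct file *file = data;

	/* same work ->readpage would do; the prototype now matches filler_t */
	return file->f_mapping->a_ops->readpage(file, page);
}

static struct page *example_read_one(struct file *file, pgoff_t index)
{
	/* returns an uptodate page (release with put_page()) or ERR_PTR() */
	return read_cache_page(file->f_mapping, index,
			       example_readpage_filler, file);
}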
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..43b7d875de37 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -134,6 +134,7 @@ void put_user_pages(struct page **pages, unsigned long npages)
}
EXPORT_SYMBOL(put_user_pages);
+#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -515,7 +516,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
-struct page *follow_page_mask(struct vm_area_struct *vma,
+static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
struct follow_page_context *ctx)
{
@@ -585,11 +586,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pgd = pgd_offset_k(address);
else
pgd = pgd_offset_gate(mm, address);
- BUG_ON(pgd_none(*pgd));
+ if (pgd_none(*pgd))
+ return -EFAULT;
p4d = p4d_offset(pgd, address);
- BUG_ON(p4d_none(*p4d));
+ if (p4d_none(*p4d))
+ return -EFAULT;
pud = pud_offset(p4d, address);
- BUG_ON(pud_none(*pud));
+ if (pud_none(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
@@ -1101,86 +1105,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
}
/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- * get_user_pages_locked() is suitable to replace the form:
- *
- * down_read(&mm->mmap_sem);
- * do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
- *
- * to:
- *
- * int locked = 1;
- * down_read(&mm->mmap_sem);
- * do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
- * if (locked)
- * up_read(&mm->mmap_sem);
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
-/*
- * get_user_pages_unlocked() is suitable to replace the form:
- *
- * down_read(&mm->mmap_sem);
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
- *
- * with:
- *
- * get_user_pages_unlocked(tsk, mm, ..., pages);
- *
- * It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead if specific gup_flags
- * (e.g. FOLL_FORCE) are not required.
- */
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
-{
- struct mm_struct *mm = current->mm;
- int locked = 1;
- long ret;
-
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- down_read(&mm->mmap_sem);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
- if (locked)
- up_read(&mm->mmap_sem);
- return ret;
-}
-EXPORT_SYMBOL(get_user_pages_unlocked);
-
-/*
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1256,6 +1180,198 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages_remote);
+/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking: if non-NULL, *@nonblocking is cleared when mmap_sem is released
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must be held for read only and may be
+ * released. If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ gup_flags &= ~FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ if (__get_user_pages(current, current->mm, addr, 1,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
+ return NULL;
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ return page;
+}
+#endif /* CONFIG_ELF_CORE */
+#else /* CONFIG_MMU */
+static long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ struct vm_area_struct **vmas, int *locked,
+ unsigned int foll_flags)
+{
+ struct vm_area_struct *vma;
+ unsigned long vm_flags;
+ int i;
+
+ /* calculate required read or write permissions.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
+ */
+ vm_flags = (foll_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ for (i = 0; i < nr_pages; i++) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ goto finish_or_fault;
+
+ /* protect what we can, including chardevs */
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+ goto finish_or_fault;
+
+ if (pages) {
+ pages[i] = virt_to_page(start);
+ if (pages[i])
+ get_page(pages[i]);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ }
+
+ return i;
+
+finish_or_fault:
+ return i ? : -EFAULT;
+}
+#endif /* !CONFIG_MMU */
+
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
@@ -1336,25 +1452,31 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
struct vm_area_struct **vmas,
unsigned int gup_flags)
{
- long i;
+ unsigned long i;
+ unsigned long step;
bool drain_allow = true;
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
check_again:
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_pages;) {
+
+ struct page *head = compound_head(pages[i]);
+
+ /*
+ * gup may start from a tail page. Advance step by the left
+ * part.
+ */
+ step = (1 << compound_order(head)) - (pages[i] - head);
/*
* If we get a page from the CMA zone, since we are going to
* be pinning these entries, we might as well move them out
* of the CMA zone if possible.
*/
- if (is_migrate_cma_page(pages[i])) {
-
- struct page *head = compound_head(pages[i]);
-
- if (PageHuge(head)) {
+ if (is_migrate_cma_page(head)) {
+ if (PageHuge(head))
isolate_huge_page(head, &cma_page_list);
- } else {
+ else {
if (!PageLRU(head) && drain_allow) {
lru_add_drain_all();
drain_allow = false;
@@ -1369,6 +1491,8 @@ check_again:
}
}
}
+
+ i += step;
}
if (!list_empty(&cma_page_list)) {
@@ -1417,7 +1541,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
{
return nr_pages;
}
-#endif
+#endif /* CONFIG_CMA */
/*
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1503,155 +1627,88 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/**
- * populate_vma_page_range() - populate a range of pages in the vma.
- * @vma: target vma
- * @start: start address
- * @end: end address
- * @nonblocking:
- *
- * This takes care of mlocking the pages too if VM_LOCKED is set.
+/*
+ * We can leverage the VM_FAULT_