author	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-15 19:42:40 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-15 19:42:40 -0800
commit	7c225c69f86c934e3be9be63ecde754e286838d7 (patch)
tree	ff2df419b0c4886b37407235f7d21215e4cf45e4 /mm
parent	6363b3f3ac5be096d08c8c504128befa0c033529 (diff)
parent	1b7176aea0a924ac59c6a283129d3e8eb00aa915 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - a few misc bits
 - ocfs2 updates
 - almost all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (131 commits)
  memory hotplug: fix comments when adding section
  mm: make alloc_node_mem_map a void call if we don't have CONFIG_FLAT_NODE_MEM_MAP
  mm: simplify nodemask printing
  mm,oom_reaper: remove pointless kthread_run() error check
  mm/page_ext.c: check if page_ext is not prepared
  writeback: remove unused function parameter
  mm: do not rely on preempt_count in print_vma_addr
  mm, sparse: do not swamp log with huge vmemmap allocation failures
  mm/hmm: remove redundant variable align_end
  mm/list_lru.c: mark expected switch fall-through
  mm/shmem.c: mark expected switch fall-through
  mm/page_alloc.c: broken deferred calculation
  mm: don't warn about allocations which stall for too long
  fs: fuse: account fuse_inode slab memory as reclaimable
  mm, page_alloc: fix potential false positive in __zone_watermark_ok
  mm: mlock: remove lru_add_drain_all()
  mm, sysctl: make NUMA stats configurable
  shmem: convert shmem_init_inodecache() to void
  Unify migrate_pages and move_pages access checks
  mm, pagevec: rename pagevec drained field
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug       |   1
-rw-r--r--  mm/Makefile            |   2
-rw-r--r--  mm/cma.c               |   2
-rw-r--r--  mm/debug.c             |   5
-rw-r--r--  mm/filemap.c           | 221
-rw-r--r--  mm/hmm.c               |   3
-rw-r--r--  mm/huge_memory.c       |  78
-rw-r--r--  mm/hugetlb.c           |  16
-rw-r--r--  mm/kasan/kasan.c       |   2
-rw-r--r--  mm/khugepaged.c        |   2
-rw-r--r--  mm/kmemcheck.c         | 125
-rw-r--r--  mm/kmemleak.c          |  11
-rw-r--r--  mm/ksm.c               |  15
-rw-r--r--  mm/list_lru.c          |   1
-rw-r--r--  mm/memblock.c          |  68
-rw-r--r--  mm/memcontrol.c        |   2
-rw-r--r--  mm/memory-failure.c    |   2
-rw-r--r--  mm/memory.c            |  90
-rw-r--r--  mm/memory_hotplug.c    |  50
-rw-r--r--  mm/mempolicy.c         |  16
-rw-r--r--  mm/mempool.c           |   2
-rw-r--r--  mm/migrate.c           |  15
-rw-r--r--  mm/mlock.c             |   9
-rw-r--r--  mm/mmu_notifier.c      |  11
-rw-r--r--  mm/oom_kill.c          |  60
-rw-r--r--  mm/page-writeback.c    |  47
-rw-r--r--  mm/page_alloc.c        | 465
-rw-r--r--  mm/page_ext.c          |   4
-rw-r--r--  mm/page_io.c           |   6
-rw-r--r--  mm/page_isolation.c    |  10
-rw-r--r--  mm/page_owner.c        |   4
-rw-r--r--  mm/percpu-vm.c         |   2
-rw-r--r--  mm/rmap.c              |  65
-rw-r--r--  mm/shmem.c             |  17
-rw-r--r--  mm/slab.c              |  45
-rw-r--r--  mm/slab.h              |  41
-rw-r--r--  mm/slab_common.c       |  59
-rw-r--r--  mm/slob.c              |   4
-rw-r--r--  mm/slub.c              |  67
-rw-r--r--  mm/sparse-vmemmap.c    |  34
-rw-r--r--  mm/sparse.c            |   6
-rw-r--r--  mm/swap.c              |  35
-rw-r--r--  mm/swap_slots.c        |  11
-rw-r--r--  mm/swap_state.c        |  11
-rw-r--r--  mm/swapfile.c          |  21
-rw-r--r--  mm/truncate.c          | 149
-rw-r--r--  mm/vmscan.c            |   8
-rw-r--r--  mm/vmstat.c            |  77
-rw-r--r--  mm/workingset.c        |  10
-rw-r--r--  mm/zsmalloc.c          |   2
50 files changed, 1196 insertions, 813 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
- depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
KCOV_INSTRUMENT_page_alloc.o := n
KCOV_INSTRUMENT_debug-pagealloc.o := n
KCOV_INSTRUMENT_kmemleak.o := n
-KCOV_INSTRUMENT_kmemcheck.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/cma.c b/mm/cma.c
index 022e52bd8370..0607729abf3b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
if (ret && !(gfp_mask & __GFP_NOWARN)) {
- pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n",
+ pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
cma_debug_show_areas(cma);
}
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm)
"get_unmapped_area %p\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
- "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+ "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm)
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
- atomic_long_read((atomic_long_t *)&mm->nr_ptes),
- mm_nr_pmds((struct mm_struct *)mm),
+ mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
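
The dump_mm() hunk above, together with the mm/huge_memory.c hunks further down, moves from the open-coded nr_ptes/nr_pmds atomics to the consolidated page-table accounting helpers. A minimal sketch of how the new helpers pair up, using only the APIs visible in this diff; the function name and the pr_debug line are hypothetical, not part of the patch:

#include <linux/mm.h>
#include <linux/printk.h>

/* Illustrative only, not part of the patch. */
static void example_account_pte_table(struct mm_struct *mm, bool allocated)
{
	if (allocated)
		mm_inc_nr_ptes(mm);	/* was: atomic_long_inc(&mm->nr_ptes) */
	else
		mm_dec_nr_ptes(mm);	/* was: atomic_long_dec(&mm->nr_ptes) */

	/* The consolidated counter is reported in bytes, as dump_mm() now prints. */
	pr_debug("pgtables_bytes=%lu\n", mm_pgtables_bytes(mm));
}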
diff --git a/mm/filemap.c b/mm/filemap.c
index 594d73fef8b4..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
*shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
mapping->nrpages++;
return 0;
}
@@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
radix_tree_clear_tags(&mapping->page_tree, node, slot);
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
}
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
+
if (shadow) {
mapping->nrexceptional += nr;
/*
@@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
mapping->nrpages -= nr;
}
-/*
- * Delete a page from the page cache and free it. Caller has to make
- * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
- */
-void __delete_from_page_cache(struct page *page, void *shadow)
+static void unaccount_page_cache_page(struct address_space *mapping,
+ struct page *page)
{
- struct address_space *mapping = page->mapping;
- int nr = hpage_nr_pages(page);
+ int nr;
- trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
@@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
}
- page_cache_tree_delete(mapping, page, shadow);
-
- page->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
-
/* hugetlb pages do not participate in page cache accounting. */
if (PageHuge(page))
return;
+ nr = hpage_nr_pages(page);
+
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
@@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
/*
- * At this point page must be either written or cleaned by truncate.
- * Dirty page here signals a bug and loss of unwritten data.
+ * At this point page must be either written or cleaned by
+ * truncate. Dirty page here signals a bug and loss of
+ * unwritten data.
*
- * This fixes dirty accounting after removing the page entirely but
- * leaves PageDirty set: it has no effect for truncated page and
- * anyway will be cleared before returning page into buddy allocator.
+ * This fixes dirty accounting after removing the page entirely
+ * but leaves PageDirty set: it has no effect for truncated
+ * page and anyway will be cleared before returning page into
+ * buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe. The caller must hold the mapping's tree_lock.
+ */
+void __delete_from_page_cache(struct page *page, void *shadow)
+{
+ struct address_space *mapping = page->mapping;
+
+ trace_mm_filemap_delete_from_page_cache(page);
+
+ unaccount_page_cache_page(mapping, page);
+ page_cache_tree_delete(mapping, page, shadow);
+}
+
+static void page_cache_free_page(struct address_space *mapping,
+ struct page *page)
+{
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+ if (freepage)
+ freepage(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
+}
+
/**
* delete_from_page_cache - delete page from page cache
* @page: the page which the kernel is trying to remove from page cache
@@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
-
- freepage = mapping->a_ops->freepage;
-
spin_lock_irqsave(&mapping->tree_lock, flags);
__delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (freepage)
- freepage(page);
+ page_cache_free_page(mapping, page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
+/*
+ * page_cache_tree_delete_batch - delete several pages from page cache
+ * @mapping: the mapping to which pages belong
+ * @pvec: pagevec with pages to delete
+ *
+ * The function walks over mapping->page_tree and removes pages passed in @pvec
+ * from the radix tree. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * modified). The function expects only THP head pages to be present in the
+ * @pvec and takes care to delete all corresponding tail pages from the radix
+ * tree as well.
+ *
+ * The function expects mapping->tree_lock to be held.
+ */
+static void
+page_cache_tree_delete_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ int total_pages = 0;
+ int i = 0, tail_pages = 0;
+ struct page *page;
+ pgoff_t start;
+
+ start = pvec->pages[0]->index;
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (i >= pagevec_count(pvec) && !tail_pages)
+ break;
+ page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (radix_tree_exceptional_entry(page))
+ continue;
+ if (!tail_pages) {
+ /*
+ * Some page got inserted in our range? Skip it. We
+ * have our pages locked so they are protected from
+ * being removed.
+ */
+ if (page != pvec->pages[i])
+ continue;
+ WARN_ON_ONCE(!PageLocked(page));
+ if (PageTransHuge(page) && !PageHuge(page))
+ tail_pages = HPAGE_PMD_NR - 1;
+ page->mapping = NULL;
+ /*
+ * Leave page->index set: truncation lookup relies
+ * upon it
+ */
+ i++;
+ } else {
+ tail_pages--;
+ }
+ radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
+ __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
+ workingset_lookup_update(mapping));
+ total_pages++;
}
+ mapping->nrpages -= total_pages;
+}
+
+void delete_from_page_cache_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ int i;
+ unsigned long flags;
+
+ if (!pagevec_count(pvec))
+ return;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+
+ unaccount_page_cache_page(mapping, pvec->pages[i]);
+ }
+ page_cache_tree_delete_batch(mapping, pvec);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ for (i = 0; i < pagevec_count(pvec); i++)
+ page_cache_free_page(mapping, pvec->pages[i]);
}
-EXPORT_SYMBOL(delete_from_page_cache);
int filemap_check_errors(struct address_space *mapping)
{
@@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
if (end_byte < start_byte)
return;
- pagevec_init(&pvec, 0);
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ pagevec_init(&pvec);
+ while (index <= end) {
unsigned i;
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
wait_on_page_writeback(page);
ClearPageError(page);
}
@@ -1754,9 +1852,10 @@ repeat:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_tag - find and return pages that match @tag
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
* @mapping: the address_space to search
* @index: the starting page index
+ * @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1819,18 +1922,28 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *index = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when we got at @end. We take care to not overflow the
+ * index @index as it confuses some of the callers. This breaks the
+ * iteration when there is page at index -1 but that is already broken
+ * anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
rcu_read_unlock();
- if (ret)
- *index = pages[ret - 1]->index + 1;
-
return ret;
}
-EXPORT_SYMBOL(find_get_pages_tag);
+EXPORT_SYMBOL(find_get_pages_range_tag);
/**
* find_get_entries_tag - find and return entries that match @tag
@@ -2159,7 +2272,7 @@ no_cached_page:
* Ok, it wasn't cached, so we need to create a new
* page..
*/
- page = page_cache_alloc_cold(mapping);
+ page = page_cache_alloc(mapping);
if (!page) {
error = -ENOMEM;
goto out;
@@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
int ret;
do {
- page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return -ENOMEM;
@@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = __page_cache_alloc(gfp | __GFP_COLD);
+ page = __page_cache_alloc(gfp);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
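
For readers of the find_get_pages_range_tag() conversion above, a minimal caller-side sketch of the new loop shape, mirroring __filemap_fdatawait_range(); 'mapping', 'start' and the inclusive 'end' are assumed to come from the caller, and the release/resched lines are not shown in the hunk above:

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

/* Illustrative only, not part of the patch. */
static void example_wait_range_writeback(struct address_space *mapping,
					 pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;

	pagevec_init(&pvec);			/* the 'cold' argument is gone */
	while (index <= end) {
		unsigned i, nr_pages;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)			/* lookup already stops at 'end' */
			break;

		for (i = 0; i < nr_pages; i++)
			wait_on_page_writeback(pvec.pages[i]);

		pagevec_release(&pvec);
		cond_resched();
	}
}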
diff --git a/mm/hmm.c b/mm/hmm.c
index a88a847bccba..ea19742a5d60 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
static void hmm_devmem_radix_release(struct resource *resource)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t key, align_start, align_size;
align_start = resource->start & ~(PA_SECTION_SIZE - 1);
align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
- align_end = align_start + align_size - 1;
mutex_lock(&hmm_devmem_lock);
for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 003f7bcd0952..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&vma->vm_mm->nr_ptes);
+ mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- atomic_long_inc(&mm->nr_ptes);
+ mm_inc_nr_ptes(mm);
return true;
}
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
- atomic_long_inc(&mm->nr_ptes);
+ mm_inc_nr_ptes(mm);
}
set_pmd_at(mm, addr, pmd, entry);
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&dst_mm->nr_ptes);
+ mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&dst_mm->nr_ptes);
+ mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /*
+ * Leave pmd empty until pte is filled note we must notify here as
+ * concurrent CPU thread might write to new page before the call to
+ * mmu_notifier_invalidate_range_end() happens which can lead to a
+ * device seeing memory write in different order than CPU.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- /* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
page_remove_rmap(page, true);
spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+ mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1365,7 +1377,12 @@ alloc:
}
spin_unlock(vmf->ptl);
out_mn:
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+ mmun_end);
out:
return ret;
out_unlock:
@@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pte_free(mm, pgtable);
- atomic_long_dec(&mm->nr_ptes);
+ mm_dec_nr_ptes(mm);
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
out:
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pudp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+ HPAGE_PUD_SIZE);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- /* leave pmd empty until pte is filled */
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ /*
+ * Leave pmd empty until pte is filled note that it is fine to delay
+ * notification until mmu_notifier_invalidate_range_end() as we are
+ * replacing a zero pmd write protected page with a zero pte write
+ * protected page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
+ pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
+ /*
+ * FIXME: Do we want to invalidate secondary mmu by calling
+ * mmu_notifier_invalidate_range() see comments below inside
+ * __split_huge_pmd() ?
+ *
+ * We are going from a zero huge page write protected to zero
+ * small page also write protected so it does not seems useful
+ * to invalidate secondary mmu at this time.
+ */
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
@@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback.
+ * They are 3 cases to consider inside __split_huge_pmd_locked():
+ * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
+ * 2) __split_huge_zero_page_pmd() read only zero page and any write
+ * fault will trigger a flush_notify before pointing to a new page
+ * (it is fine if the secondary mmu keeps pointing to the old zero
+ * page in the meantime)
+ * 3) Split a huge pmd into pte pointing to the same page. No need
+ * to invalidate secondary tlb entry they are all still valid.
+ * any further changes to individual pte will notify. So no need
+ * to call mmu_notifier->invalidate_range()
+ */
+ mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+ HPAGE_PMD_SIZE);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
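
The mmu_notifier_invalidate_range_only_end() conversions above all follow the same shape. A minimal sketch of that pattern, assuming the mmu_notifier calling convention shown in this diff; the function name is hypothetical and locking is elided:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/huge_mm.h>

/* Illustrative only, not part of the patch. */
static void example_clear_huge_pmd(struct vm_area_struct *vma,
				   unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = haddr;
	unsigned long end = haddr + HPAGE_PMD_SIZE;

	mmu_notifier_invalidate_range_start(mm, start, end);

	/* The _notify helper already calls mmu_notifier_invalidate_range(). */
	pmdp_huge_clear_flush_notify(vma, haddr, pmd);

	/* ... repopulate the page table under the page table lock ... */

	/* So skip the second invalidation when ending the range. */
	mmu_notifier_invalidate_range_only_end(mm, start, end);
}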
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d2ff5e8bf2b..681b300185c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
if (cow) {
+ /*
+ * No need to notify as we are downgrading page
+ * table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
huge_ptep_set_wrprotect(src, addr, src_pte);
- mmu