diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-15 19:42:40 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-15 19:42:40 -0800 |
commit | 7c225c69f86c934e3be9be63ecde754e286838d7 (patch) | |
tree | ff2df419b0c4886b37407235f7d21215e4cf45e4 /mm | |
parent | 6363b3f3ac5be096d08c8c504128befa0c033529 (diff) | |
parent | 1b7176aea0a924ac59c6a283129d3e8eb00aa915 (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- a few misc bits
- ocfs2 updates
- almost all of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (131 commits)
memory hotplug: fix comments when adding section
mm: make alloc_node_mem_map a void call if we don't have CONFIG_FLAT_NODE_MEM_MAP
mm: simplify nodemask printing
mm,oom_reaper: remove pointless kthread_run() error check
mm/page_ext.c: check if page_ext is not prepared
writeback: remove unused function parameter
mm: do not rely on preempt_count in print_vma_addr
mm, sparse: do not swamp log with huge vmemmap allocation failures
mm/hmm: remove redundant variable align_end
mm/list_lru.c: mark expected switch fall-through
mm/shmem.c: mark expected switch fall-through
mm/page_alloc.c: broken deferred calculation
mm: don't warn about allocations which stall for too long
fs: fuse: account fuse_inode slab memory as reclaimable
mm, page_alloc: fix potential false positive in __zone_watermark_ok
mm: mlock: remove lru_add_drain_all()
mm, sysctl: make NUMA stats configurable
shmem: convert shmem_init_inodecache() to void
Unify migrate_pages and move_pages access checks
mm, pagevec: rename pagevec drained field
...
Diffstat (limited to 'mm')
50 files changed, 1196 insertions, 813 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5b0adf1435de..e5e606ee5f71 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - depends on !KMEMCHECK select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 4659b93cba43..e7ebd176fb93 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n KCOV_INSTRUMENT_kmemleak.o := n -KCOV_INSTRUMENT_kmemcheck.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n @@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o @@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); if (ret && !(gfp_mask & __GFP_NOWARN)) { - pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); } diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..d947f3e03b0d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/filemap.c b/mm/filemap.c index 594d73fef8b4..923fc2ebd74a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" @@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping, *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); mapping->nrpages++; return 0; } @@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping, radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); } + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ + if (shadow) { mapping->nrexceptional += nr; /* @@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping, mapping->nrpages -= nr; } -/* - * Delete a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. - */ -void __delete_from_page_cache(struct page *page, void *shadow) +static void unaccount_page_cache_page(struct address_space *mapping, + struct page *page) { - struct address_space *mapping = page->mapping; - int nr = hpage_nr_pages(page); + int nr; - trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow) } } - page_cache_tree_delete(mapping, page, shadow); - - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ - /* hugetlb pages do not participate in page cache accounting. */ if (PageHuge(page)) return; + nr = hpage_nr_pages(page); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); @@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow) } /* - * At this point page must be either written or cleaned by truncate. - * Dirty page here signals a bug and loss of unwritten data. + * At this point page must be either written or cleaned by + * truncate. Dirty page here signals a bug and loss of + * unwritten data. * - * This fixes dirty accounting after removing the page entirely but - * leaves PageDirty set: it has no effect for truncated page and - * anyway will be cleared before returning page into buddy allocator. + * This fixes dirty accounting after removing the page entirely + * but leaves PageDirty set: it has no effect for truncated + * page and anyway will be cleared before returning page into + * buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } +/* + * Delete a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold the mapping's tree_lock. + */ +void __delete_from_page_cache(struct page *page, void *shadow) +{ + struct address_space *mapping = page->mapping; + + trace_mm_filemap_delete_from_page_cache(page); + + unaccount_page_cache_page(mapping, page); + page_cache_tree_delete(mapping, page, shadow); +} + +static void page_cache_free_page(struct address_space *mapping, + struct page *page) +{ + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + if (freepage) + freepage(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } +} + /** * delete_from_page_cache - delete page from page cache * @page: the page which the kernel is trying to remove from page cache @@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); - - freepage = mapping->a_ops->freepage; - spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - if (freepage) - freepage(page); + page_cache_free_page(mapping, page); +} +EXPORT_SYMBOL(delete_from_page_cache); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); - } else { - put_page(page); +/* + * page_cache_tree_delete_batch - delete several pages from page cache + * @mapping: the mapping to which pages belong + * @pvec: pagevec with pages to delete + * + * The function walks over mapping->page_tree and removes pages passed in @pvec + * from the radix tree. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (radix tree entries at those indices are not + * modified). The function expects only THP head pages to be present in the + * @pvec and takes care to delete all corresponding tail pages from the radix + * tree as well. + * + * The function expects mapping->tree_lock to be held. + */ +static void +page_cache_tree_delete_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + struct radix_tree_iter iter; + void **slot; + int total_pages = 0; + int i = 0, tail_pages = 0; + struct page *page; + pgoff_t start; + + start = pvec->pages[0]->index; + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (i >= pagevec_count(pvec) && !tail_pages) + break; + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page)) + continue; + if (!tail_pages) { + /* + * Some page got inserted in our range? Skip it. We + * have our pages locked so they are protected from + * being removed. + */ + if (page != pvec->pages[i]) + continue; + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; + page->mapping = NULL; + /* + * Leave page->index set: truncation lookup relies + * upon it + */ + i++; + } else { + tail_pages--; + } + radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); + __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + workingset_lookup_update(mapping)); + total_pages++; } + mapping->nrpages -= total_pages; +} + +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + int i; + unsigned long flags; + + if (!pagevec_count(pvec)) + return; + + spin_lock_irqsave(&mapping->tree_lock, flags); + for (i = 0; i < pagevec_count(pvec); i++) { + trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + + unaccount_page_cache_page(mapping, pvec->pages[i]); + } + page_cache_tree_delete_batch(mapping, pvec); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + for (i = 0; i < pagevec_count(pvec); i++) + page_cache_free_page(mapping, pvec->pages[i]); } -EXPORT_SYMBOL(delete_from_page_cache); int filemap_check_errors(struct address_space *mapping) { @@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping, if (end_byte < start_byte) return; - pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + pagevec_init(&pvec); + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); ClearPageError(page); } @@ -1754,9 +1852,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1819,18 +1922,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag @@ -2159,7 +2272,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc(mapping); if (!page) { error = -ENOMEM; goto out; @@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) int ret; do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); + page = __page_cache_alloc(gfp_mask); if (!page) return -ENOMEM; @@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); @@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t key, align_start, align_size; align_start = resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); - align_end = align_start + align_size - 1; mutex_lock(&hmm_devmem_lock); for (key = resource->start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 003f7bcd0952..86fe697e8bfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. + * + * See Documentation/vm/mmu_notifier.txt + */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); @@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, page_remove_rmap(page, true); spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1365,7 +1377,12 @@ alloc: } spin_unlock(vmf->ptl); out_mn: - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); out: return ret; out_unlock: @@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PUD_SIZE); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - /* leave pmd empty until pte is filled */ - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. + * + * See Documentation/vm/mmu_notifier.txt + */ + pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } @@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PMD_SIZE); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2d2ff5e8bf2b..681b300185c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ huge_ptep_set_wrprotect(src, addr, src_pte); - mmu |