diff options
Diffstat (limited to 'mm')
52 files changed, 1057 insertions, 918 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 03ff7703d322..c782e8fb7235 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. -# For architectures that support deferred memory initialisation -config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - bool - config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on NO_BOOTMEM && MEMORY_HOTPLUG + depends on NO_BOOTMEM depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 84b2dc76f140..b5f940ce0143 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -882,13 +882,10 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (IS_ERR(dev)) return PTR_ERR(dev); - if (bdi_debug_register(bdi, dev_name(dev))) { - device_destroy(bdi_class, dev->devt); - return -ENOMEM; - } cgwb_bdi_register(bdi); bdi->dev = dev; + bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); diff --git a/mm/compaction.c b/mm/compaction.c index 10cd757f1006..2c8999d027ab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation - * @mode: The migration mode for async, sync light, or sync migration + * @prio: Determines how hard direct compaction should try to succeed * * This is the main entry point for direct page compaction. */ diff --git a/mm/debug.c b/mm/debug.c index d947f3e03b0d..56e2d9125ea5 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason) */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", + pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) @@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason) #ifdef CONFIG_MEMCG if (page->mem_cgroup) - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); + pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } @@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %p start %p end %p\n" - "next %p prev %p mm %p\n" - "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n" + pr_emerg("vma %px start %px end %px\n" + "next %px prev %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, @@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n" #ifdef CONFIG_MMU - "get_unmapped_area %p\n" + "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %p flags %lx core_state %p\n" + "binfmt %px flags %lx core_state %px\n" #ifdef CONFIG_AIO - "ioctx_table %p\n" + "ioctx_table %px\n" #endif #ifdef CONFIG_MEMCG - "owner %p " + "owner %px " #endif - "exe_file %p\n" + "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %p\n" + "mmu_notifier_mm %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index d04ac1ec0559..1826f191e72c 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -111,7 +111,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) enum fixed_addresses idx; int i, slot; - WARN_ON(system_state != SYSTEM_BOOTING); + WARN_ON(system_state >= SYSTEM_RUNNING); slot = -1; for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { diff --git a/mm/fadvise.c b/mm/fadvise.c index ec70d6e4b86d..767887f5f3bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); - if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* + * The page at end_index will be inclusively discarded according + * by invalidate_mapping_pages(), so subtracting 1 from + * end_index means we will skip the last page. But if endbyte + * is page aligned or is at the end of file, we should not skip + * that page - discarding the last page is safe enough. + */ + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && + endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole diff --git a/mm/filemap.c b/mm/filemap.c index ee83baaf855d..693f62212a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,6 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/cpuset.h> -#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 297c7238f7d4..c64dca6e27c2 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -62,8 +62,10 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, * get_user_pages_longterm() and disallow it for filesystem-dax * mappings. */ - if (vma_is_fsdax(vma)) - return -EOPNOTSUPP; + if (vma_is_fsdax(vma)) { + ret = -EOPNOTSUPP; + goto out; + } if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_access_permitted(pte, WRITE) || + return pte_write(pte) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } @@ -848,7 +848,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, - int *locked, bool notify_drop, + int *locked, unsigned int flags) { long ret, pages_done; @@ -922,7 +922,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages++; start += PAGE_SIZE; } - if (notify_drop && lock_dropped && *locked) { + if (lock_dropped && *locked) { /* * We must let the caller know we temporarily dropped the lock * and so the critical section protected by it was lost. @@ -959,36 +959,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, int *locked) { return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, true, + pages, NULL, locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for - * tsk, mm to be specified. - * - * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). "FOLL_GET" - * is set implicitly if "pages" is non-NULL. - */ -static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, - unsigned long nr_pages, struct page **pages, - unsigned int gup_flags) -{ - long ret; - int locked = 1; - - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL, - &locked, false, gup_flags); - if (locked) - up_read(&mm->mmap_sem); - return ret; -} - -/* * get_user_pages_unlocked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1006,8 +982,16 @@ static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, gup_flags | FOLL_TOUCH); + struct mm_struct *mm = current->mm; + int locked = 1; + long ret; + + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); + if (locked) + up_read(&mm->mmap_sem); + return ret; } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -1073,7 +1057,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - locked, true, + locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -1090,7 +1074,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas) { return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, false, + pages, vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (!pmd_access_permitted(pmd, write_fault)) + if (write_fault && !pmd_write(pmd)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -418,7 +418,7 @@ again: } if (!pte_present(pte)) { - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { if (hmm_vma_walk->fault) @@ -426,8 +426,6 @@ again: continue; } - entry = pte_to_swp_entry(pte); - /* * This is a special swap entry, ignore migration, use * device and report anything else as error. @@ -456,11 +454,11 @@ again: continue; } - if (!pte_access_permitted(pte, write_fault)) + if (write_fault && !pte_write(pte)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2f5e774902..87ab9b8f56b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) @@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pud_lockptr(mm, pud)); - if (!pud_access_permitted(*pud, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1386,7 +1386,7 @@ out_unlock: */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_access_permitted(pmd, WRITE) || + return pmd_write(pmd) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } @@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = *pmd; - pmdp_invalidate(vma, addr, pmd); - - /* - * Recover dirty/young flags. It relies on pmdp_invalidate to not - * corrupt them. - */ - if (pmd_dirty(*pmd)) - entry = pmd_mkdirty(entry); - if (pmd_young(*pmd)) - entry = pmd_mkyoung(entry); + entry = pmdp_invalidate(vma, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) @@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t _pmd; - bool young, write, dirty, soft_dirty, pmd_migration = false; + pmd_t old_pmd, _pmd; + bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - pmd_migration = is_pmd_migration_entry(*pmd); + pmd_migration = is_pmd_migration_entry(old_pmd); if (pmd_migration) { swp_entry_t entry; - entry = pmd_to_swp_entry(*pmd); + entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); } else #endif - page = pmd_page(*pmd); + page = pmd_page(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - write = pmd_write(*pmd); - young = pmd_young(*pmd); - dirty = pmd_dirty(*pmd); - soft_dirty = pmd_soft_dirty(*pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); - pmdp_huge_split_prepare(vma, haddr, pmd); + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_mksoft_dirty(entry); } - if (dirty) - SetPageDirty(page + i); pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); @@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * and pmd_trans_splitting must remain set at all times on the pmd - * until the split is complete for this pmd), then we flush the SMP TLB - * and finally we write the non-huge version of the pmd entry with - * pmd_populate. - */ - pmdp_invalidate(vma, haddr, pmd); pmd_populate(mm, pmd, pgtable); if (freeze) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a334f5fb730..7c204e3d132b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,10 +34,9 @@ #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/userfaultfd_k.h> +#include <linux/page_owner.h> #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +925,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, struct hstate *h) +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; @@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - gfp_t gfp_mask; - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { spin_lock_irqsave(&zone->lock, flags); pfn = ALIGN(zone->zone_start_pfn, nr_pages); @@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) -{ - struct page *page; - - page = alloc_gigantic_page(nid, h); - if (page) { - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, nid); - } - - return page; -} - -static int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) -{ - struct page *page = NULL; - int nr_nodes, node; - - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_gigantic_page_node(h, node); - if (page) - return 1; - } - - return 0; -} - #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } -static inline int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) { return 0; } #endif static void update_and_free_page(struct hstate *h, struct page *page) @@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page) ClearPagePrivate(&page[1]); } +/* + * Internal hugetlb specific page flag. Do not use outside of the hugetlb + * code |