diff options
Diffstat (limited to 'mm')
53 files changed, 3261 insertions, 1884 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a5dae9a7eb51..ab80933be65f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -29,7 +29,7 @@ config FLATMEM_MANUAL For systems that have holes in their physical address spaces and for features like NUMA and memory hotplug, - choose "Sparse Memory" + choose "Sparse Memory". If unsure, choose this option (Flat Memory) over any other. @@ -122,9 +122,9 @@ config SPARSEMEM_VMEMMAP depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE default y help - SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise - pfn_to_page and page_to_pfn operations. This is the most - efficient option when sufficient kernel resources are available. + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. config HAVE_MEMBLOCK_NODE_MAP bool @@ -160,9 +160,9 @@ config MEMORY_HOTPLUG_SPARSE depends on SPARSEMEM && MEMORY_HOTPLUG config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG - help + bool "Online the newly added memory blocks by default" + depends on MEMORY_HOTPLUG + help This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. Policy setting @@ -227,14 +227,14 @@ config COMPACTION select MIGRATION depends on MMU help - Compaction is the only memory management component to form - high order (larger physically contiguous) memory blocks - reliably. The page allocator relies on compaction heavily and - the lack of the feature can lead to unexpected OOM killer - invocations for high order memory requests. You shouldn't - disable this option unless there really is a strong reason for - it and then we would be really interested to hear about that at - linux-mm@kvack.org. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration @@ -258,7 +258,7 @@ config ARCH_ENABLE_THP_MIGRATION bool config CONTIG_ALLOC - def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA config PHYS_ADDR_T_64BIT def_bool 64BIT @@ -284,6 +284,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool select SRCU + select INTERVAL_TREE config KSM bool "Enable KSM for page merging" @@ -301,10 +302,10 @@ config KSM root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" + int "Low address space to protect from user allocation" depends on MMU - default 4096 - help + default 4096 + help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages can help reduce the impact of kernel NULL pointer bugs. @@ -407,7 +408,7 @@ choice endchoice config ARCH_WANTS_THP_SWAP - def_bool n + def_bool n config THP_SWAP def_bool y @@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS config HMM_MIRROR bool depends on MMU - depends on MMU_NOTIFIER config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" @@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL config ARCH_HAS_HUGEPD bool +config MAPPING_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index d996846697ef..1937cc251883 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -107,3 +107,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o @@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static int __init cma_activate_area(struct cma *cma) { - int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; unsigned i = cma->count >> pageblock_order; struct zone *zone; - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - + cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) { cma->count = 0; return -ENOMEM; @@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma) not_in_zone: pr_err("CMA area %s could not be activated\n", cma->name); - kfree(cma->bitmap); + bitmap_free(cma->bitmap); cma->count = 0; return -EINVAL; } diff --git a/mm/cma_debug.c b/mm/cma_debug.c index a7dd9e8e10d5..4e6cbe2f586e 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -29,7 +29,7 @@ static int cma_debugfs_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); static int cma_used_get(void *data, u64 *val) { @@ -44,7 +44,7 @@ static int cma_used_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); static int cma_maxchunk_get(void *data, u64 *val) { @@ -66,7 +66,7 @@ static int cma_maxchunk_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) { @@ -126,7 +126,7 @@ static int cma_free_write(void *data, u64 val) return cma_free_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); static int cma_alloc_mem(struct cma *cma, int count) { @@ -158,7 +158,7 @@ static int cma_alloc_write(void *data, u64 val) return cma_alloc_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { diff --git a/mm/debug.c b/mm/debug.c index 8345bb6e4769..0461df1207cb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,28 +67,31 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", - page, page_ref_count(page), mapcount, - page->mapping, page_to_pgoff(page)); if (PageCompound(page)) - pr_cont(" compound_mapcount: %d", compound_mapcount(page)); - pr_cont("\n"); - if (PageAnon(page)) - pr_warn("anon "); - else if (PageKsm(page)) - pr_warn("ksm "); + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px " + "index:%#lx compound_mapcount: %d\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page), + compound_mapcount(page)); + else + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); + if (PageKsm(page)) + pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags); + else if (PageAnon(page)) + pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags); else if (mapping) { - pr_warn("%ps ", mapping->a_ops); if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("name:\"%pd\" ", dentry); - } + pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); + } else + pr_warn("%ps\n", mapping->a_ops); + pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); - hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/filemap.c b/mm/filemap.c index 85b7d087eb45..bf6aa30be58d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, - struct file *fpin) -{ - int flags = vmf->flags; - - if (fpin) - return fpin; - - /* - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or - * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. - */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { - fpin = get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); - } - return fpin; -} - /* * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem * @vmf - the vm_fault for this fault. @@ -3161,6 +3140,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(pagecache_write_end); +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { @@ -3218,11 +3218,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break - * them by removing it completely + * them by removing it completely. + * + * Noticeable example is a blkdev_direct_IO(). + * + * Skip invalidation for async writes or if mapping has no pages. */ - if (mapping->nrpages) - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (written > 0 && mapping->nrpages && + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) + dio_warn_stale_pagecache(file); if (written > 0) { pos += written; @@ -734,11 +734,17 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held. It may be released. See below. * @@ -1107,11 +1113,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held for read or write. * @@ -1443,6 +1455,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); + long ret = nr_pages; check_again: for (i = 0; i < nr_pages;) { @@ -1504,17 +1517,18 @@ check_again: * again migrating any new CMA pages which we failed to isolate * earlier. */ - nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, NULL, gup_flags); - if ((nr_pages > 0) && migrate_allow) { + if ((ret > 0) && migrate_allow) { + nr_pages = ret; drain_allow = true; goto check_again; } } - return nr_pages; + return ret; } #else static long check_and_migrate_cma_pages(struct task_struct *tsk, @@ -26,193 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) -{ - struct hmm *hmm; - - hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); - if (!hmm) - return ERR_PTR(-ENOMEM); - - init_waitqueue_head(&hmm->wq); - INIT_LIST_HEAD(&hmm->mirrors); - init_rwsem(&hmm->mirrors_sem); - INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->ranges_lock); - hmm->notifiers = 0; - return &hmm->mmu_notifier; -} - -static void hmm_free_notifier(struct mmu_notifier *mn) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - WARN_ON(!list_empty(&hmm->ranges)); - WARN_ON(!list_empty(&hmm->mirrors)); - kfree(hmm); -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - - /* - * Since hmm_range_register() holds the mmget() lock hmm_release() is - * prevented as long as a range exists. - */ - WARN_ON(!list_empty_careful(&hmm->ranges)); - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - /* - * Note: The driver is not allowed to trigger - * hmm_mirror_unregister() from this thread. - */ - if (mirror->ops->release) - mirror->ops->release(mirror); - } - up_read(&hmm->mirrors_sem); -} - -static void notifiers_decrement(struct hmm *hmm) -{ - unsigned long flags; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); -} - -static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - struct hmm_range *range; - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers++; - list_for_each_entry(range, &hmm->ranges, list) { - if (nrange->end < range->start || nrange->start >= range->end) - continue; - - range->valid = false; - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - if (mmu_notifier_range_blockable(nrange)) - down_read(&hmm->mirrors_sem); - else if (!down_read_trylock(&hmm->mirrors_sem)) { - ret = -EAGAIN; - goto out; - } - - list_for_each_entry(mirror, &hmm->mirrors, list) { - int rc; - - rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); - if (rc) { - if (WARN_ON(mmu_notifier_range_blockable(nrange) || - rc != -EAGAIN)) - continue; - ret = -EAGAIN; - break; - } - } - up_read(&hmm->mirrors_sem); - -out: - if (ret) - notifiers_decrement(hmm); - return ret; -} - -static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - notifiers_decrement(hmm); -} - -static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { - .release = hmm_release, - .invalidate_range_start = hmm_invalidate_range_start, - .invalidate_range_end = hmm_invalidate_range_end, - .alloc_notifier = hmm_alloc_notifier, - .free_notifier = hmm_free_notifier, -}; - -/* - * hmm_mirror_register() - register a mirror against an mm - * - * @mirror: new mirror struct to register - * @mm: mm to register against - * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments - * - * To start mirroring a process address space, the device driver must register - * an HMM mirror struct. - * - * The caller cannot unregister the hmm_mirror while any ranges are - * registered. - * - * Callers using this function must put a call to mmu_notifier_synchronize() - * in their module exit functions. - */ -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) -{ - struct mmu_notifier *mn; - - lockdep_assert_held_write(&mm->mmap_sem); - - /* Sanity check */ - if (!mm || !mirror || !mirror->ops) - return -EINVAL; - - mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); - if (IS_ERR(mn)) - return PTR_ERR(mn); - mirror->hmm = container_of(mn, struct hmm, mmu_notifier); - - down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - - return 0; -} -EXPORT_SYMBOL(hmm_mirror_register); - -/* - * hmm_mirror_unregister() - unregister a mirror - * - * @mirror: mirror struct to unregister - * - * Stop mirroring a process address space, and cleanup. - */ -void hmm_mirror_unregister(struct hmm_mirror *mirror) -{ - struct hmm *hmm = mirror->hmm; - - down_write(&hmm->mirrors_sem); - list_del(&mirror->list); - up_write(&hmm->mirrors_sem); - mmu_notifier_put(&hmm->mmu_notifier); -} -EXPORT_SYMBOL(hmm_mirror_unregister); - struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; @@ -252,18 +65,15 @@ err: return -EFAULT; } -static int hmm_pfns_bad(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_pfns_fill(unsigned long addr, unsigned long end, + struct hmm_range *range, enum hmm_pfn_value_e value) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[HMM_PFN_ERROR]; + pfns[i] = range->values[value]; return 0; } @@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (unl |