diff options
Diffstat (limited to 'mm')
58 files changed, 6397 insertions, 1896 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af447fa7..9c4bdddd80c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config MIGRATION config ARCH_ENABLE_HUGEPAGE_MIGRATION bool +config ARCH_ENABLE_THP_MIGRATION + bool + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -673,11 +676,12 @@ config ARCH_HAS_ZONE_DEVICE bool config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" + bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE + select RADIX_TREE_MULTIORDER help Device memory hotplug support allows for establishing pmem, @@ -688,6 +692,55 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config MIGRATE_VMA_HELPER + bool + +config HMM + bool + select MIGRATE_VMA_HELPER + +config HMM_MIRROR + bool "HMM mirror CPU page table into a device page table" + depends on ARCH_HAS_HMM + select MMU_NOTIFIER + select HMM + help + Select HMM_MIRROR if you want to mirror range of the CPU page table of a + process into a device page table. Here, mirror means "keep synchronized". + Prerequisites: the device must provide the ability to write-protect its + page tables (at PAGE_SIZE granularity), and must be able to recover from + the resulting potential page faults. + +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..e3ac3aeb533b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_HMM) += hmm.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f028a9a472fd..e19606bb41a0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -569,8 +569,10 @@ static int cgwb_create(struct backing_dev_info *bdi, /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); - if (!wb) - return -ENOMEM; + if (!wb) { + ret = -ENOMEM; + goto out_put; + } ret = wb_init(wb, bdi, blkcg_css->id, gfp); if (ret) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b06d9fe23a28..68d28924ba79 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping, { struct balloon_dev_info *balloon = balloon_page_device(page); + /* + * We can not easily support the no copy case here so ignore it as it + * is unlikely to be use with ballon pages. See include/linux/hmm.h for + * user of the MIGRATE_SYNC_NO_COPY mode. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197..b1dd4a948fc0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup); static int after_paging_init __initdata; +pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + return prot; +} + void __init __weak early_ioremap_shutdown(void) { } @@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) void __init * early_memremap(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, - FIXMAP_PAGE_NORMAL); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_NORMAL); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #ifdef FIXMAP_PAGE_RO void __init * early_memremap_ro(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_RO); + + return (__force void *)__early_ioremap(phys_addr, size, prot); +} +#endif + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); } #endif diff --git a/mm/fadvise.c b/mm/fadvise.c index a43013112581..702f239cd6db 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -52,7 +52,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (IS_DAX(inode)) { + bdi = inode_to_bdi(mapping->host); + + if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: @@ -75,8 +77,6 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = inode_to_bdi(mapping->host); - switch (advice) { case POSIX_FADV_NORMAL: f.file->f_ra.ra_pages = bdi->ra_pages; diff --git a/mm/filemap.c b/mm/filemap.c index 92d4e0a6c012..870971e20967 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, return -EEXIST; mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } + if (shadowp) + *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, workingset_update_node, mapping); @@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - bool ret; + struct page *page; if (end_byte < start_byte) return false; @@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping, if (mapping->nrpages == 0) return false; - pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, index, 1)) + if (!find_get_pages_range(mapping, &index, end, 1, &page)) return false; - ret = (pvec.pages[0]->index <= end); - pagevec_release(&pvec); - return ret; + put_page(page); + return true; } EXPORT_SYMBOL(filemap_range_has_page); @@ -476,6 +464,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, EXPORT_SYMBOL(filemap_fdatawait_range); /** + * file_fdatawait_range - wait for writeback to complete + * @file: file pointing to address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the address space that file + * refers to, in the given range and wait for all of them. Check error + * status of the address space vs. the file->f_wb_err cursor and return it. + * + * Since the error status of the file is advanced by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) +{ + struct address_space *mapping = file->f_mapping; + + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return file_check_and_advance_wb_err(file); +} +EXPORT_SYMBOL(file_fdatawait_range); + +/** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * @@ -489,45 +500,22 @@ EXPORT_SYMBOL(filemap_fdatawait_range); */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - __filemap_fdatawait_range(mapping, 0, i_size - 1); + __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. Check error status of the address space - * and return it. - * - * Since the error status of the address space is cleared by this function, - * callers are responsible for checking the return value and handling and/or - * reporting the error. - */ -int filemap_fdatawait(struct address_space *mapping) +static bool mapping_needs_writeback(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return filemap_fdatawait_range(mapping, 0, i_size - 1); + return (!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional); } -EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -566,8 +554,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -589,7 +576,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { - errseq_t eseq = __errseq_set(&mapping->wb_err, err); + errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } @@ -656,8 +643,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -923,13 +909,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr) wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; unsigned long flags; + wait_queue_entry_t bookmark; key.page = page; key.bit_nr = bit_nr; key.page_match = 0; + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key(q, TASK_NORMAL, &key); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + /* + * Take a breather from holding the lock, + * allow pages that finish wake up asynchronously + * to acquire the lock and remove themselves + * from wait queue + */ + spin_unlock_irqrestore(&q->lock, flags); + cpu_relax(); + spin_lock_irqsave(&q->lock, flags); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + } + /* * It is possible for other pages to have collided on the waitqueue * hash, so in that case check for a page match. That prevents a long- @@ -1041,7 +1047,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, waiter); + __add_wait_queue_entry_tail(q, waiter); SetPageWaiters(page); spin_unlock_irqrestore(&q->lock, flags); } @@ -1566,23 +1572,29 @@ export: } /** - * find_get_pages - gang pagecache lookup + * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index (inclusive) * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. + * find_get_pages_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting at index @start and up to index @end + * (inclusive). The pages are placed at @pages. find_get_pages_range() takes + * a reference against the returned pages. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. + * We also update @start to index the next page for the traversal. * - * find_get_pages() returns the number of pages which were found. + * find_get_pages_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1592,8 +1604,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1629,11 +1644,25 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *start = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: rcu_read_unlock(); + return ret; } @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -438,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). + */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: @@ -1352,7 +1377,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ -#ifdef __HAVE_ARCH_PTE_DEVMAP +#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1534,7 +1559,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..a88a847bccba --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,1257 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse <jglisse@redhat.com> + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include <linux/mm.h> +#include <linux/hmm.h> +#include <linux/init.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> |