summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/cma.c7
-rw-r--r--mm/fadvise.c11
-rw-r--r--mm/filemap.c30
-rw-r--r--mm/hugetlb.c42
-rw-r--r--mm/kasan/kasan.c4
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/memcontrol.c46
-rw-r--r--mm/memory.c40
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c39
-rw-r--r--mm/page-writeback.c21
-rw-r--r--mm/page_alloc.c39
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/page_owner.c26
-rw-r--r--mm/page_poison.c8
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/swap.c20
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/truncate.c62
-rw-r--r--mm/vmalloc.c9
-rw-r--r--mm/vmstat.c2
-rw-r--r--mm/z3fold.c24
-rw-r--r--mm/zsmalloc.c35
27 files changed, 333 insertions, 176 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2664c118b5d2..3e2daef3c946 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,8 @@ config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- depends on MEMORY_HOTPLUG
+ depends on NO_BOOTMEM && MEMORY_HOTPLUG
+ depends on !FLATMEM
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
diff --git a/mm/cma.c b/mm/cma.c
index ea506eb18cd6..bd0e1412475e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
return -EINVAL;
/* ensure minimal alignment required by mm core */
- alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+ alignment = PAGE_SIZE <<
+ max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
/* alignment should be aligned with order_per_bit */
if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
* migratetype page by page allocator's buddy algorithm. In the case,
* you couldn't get a contiguous memory, which is not what we want.
*/
- alignment = max(alignment,
- (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+ alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
+ max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b8024fa7101d..6c707bfe02fd 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -126,6 +126,17 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
*/
start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
end_index = (endbyte >> PAGE_SHIFT);
+ if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+ /* First page is tricky as 0 - 1 = -1, but pgoff_t
+ * is unsigned, so the end_index >= start_index
+ * check below would be true and we'll discard the whole
+ * file cache which is not what was asked.
+ */
+ if (end_index == 0)
+ break;
+
+ end_index--;
+ }
if (end_index >= start_index) {
unsigned long count = invalidate_mapping_pages(mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index 9665b1d4f318..00ae878b2a38 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
return;
/*
- * Track node that only contains shadow entries.
+ * Track node that only contains shadow entries. DAX mappings contain
+ * no shadow entries and may contain other exceptional entries so skip
+ * those.
*
* Avoid acquiring the list_lru lock if already tracked. The
* list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
- if (!workingset_node_pages(node) &&
+ if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
list_empty(&node->private_list)) {
node->private_data = mapping;
list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
- if (WARN_ON(dax_mapping(mapping)))
- return -EINVAL;
-
- if (shadowp)
- *shadowp = p;
mapping->nrexceptional--;
- if (node)
- workingset_node_shadows_dec(node);
+ if (!dax_mapping(mapping)) {
+ if (shadowp)
+ *shadowp = p;
+ if (node)
+ workingset_node_shadows_dec(node);
+ } else {
+ /* DAX can replace empty locked entry with a hole */
+ WARN_ON_ONCE(p !=
+ (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+ RADIX_DAX_ENTRY_LOCK));
+ /* DAX accounts exceptional entries as normal pages */
+ if (node)
+ workingset_node_pages_dec(node);
+ /* Wakeup waiters for exceptional entry lock */
+ dax_wake_mapping_entry_waiter(mapping, page->index,
+ false);
+ }
}
radix_tree_replace_slot(slot, page);
mapping->nrpages++;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d26162e81fea..388c2bb9b55c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -832,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
* Only the process that called mmap() has reserves for
* private mappings.
*/
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return true;
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Like the shared case above, a hole punch or truncate
+ * could have been performed on the private mapping.
+ * Examine the value of chg to determine if reserves
+ * actually exist or were previously consumed.
+ * Very Subtle - The value of chg comes from a previous
+ * call to vma_needs_reserves(). The reserve map for
+ * private mappings has different (opposite) semantics
+ * than that of shared mappings. vma_needs_reserves()
+ * has already taken this difference in semantics into
+ * account. Therefore, the meaning of chg is the same
+ * as in the shared case above. Code could easily be
+ * combined, but keeping it separate draws attention to
+ * subtle differences.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
return false;
}
@@ -1816,6 +1835,25 @@ static long __vma_reservation_common(struct hstate *h,
if (vma->vm_flags & VM_MAYSHARE)
return ret;
+ else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+ /*
+ * In most cases, reserves always exist for private mappings.
+ * However, a file associated with mapping could have been
+ * hole punched or truncated after reserves were consumed.
+ * As subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret)
+ return 0;
+ else
+ return 1;
+ }
else
return ret < 0 ? ret : 0;
}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 18b6a2b8d183..28439acda6ec 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -763,8 +763,8 @@ static int kasan_mem_notifier(struct notifier_block *nb,
static int __init kasan_memhotplug_init(void)
{
- pr_err("WARNING: KASAN doesn't support memory hot-add\n");
- pr_err("Memory hot-add will be disabled\n");
+ pr_info("WARNING: KASAN doesn't support memory hot-add\n");
+ pr_info("Memory hot-add will be disabled\n");
hotplug_memory_notifier(kasan_mem_notifier, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 7f7ac51d7faf..fb87923552ef 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -77,7 +77,6 @@ struct kasan_alloc_meta {
struct kasan_track track;
u32 state : 2; /* enum kasan_state */
u32 alloc_size : 30;
- u32 reserved;
};
struct qlist_node {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf428d7b9a03..75e74408cc8f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
margin = min(margin, limit - count);
+ else
+ margin = 0;
}
return margin;
@@ -1302,6 +1304,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
+ /* Set a dummy value to return "true". */
+ chosen = (void *) 1;
goto unlock;
case OOM_SCAN_OK:
break;
@@ -1604,7 +1608,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom || current->memcg_in_oom)
+ if (!current->memcg_may_oom)
return;
/*
* We are in the middle of the charge context here, so we
@@ -2892,6 +2896,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
* ordering is imposed by list_lru_node->lock taken by
* memcg_drain_all_list_lrus().
*/
+ rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
css_for_each_descendant_pre(css, &memcg->css) {
child = mem_cgroup_from_css(css);
BUG_ON(child->kmemcg_id != kmemcg_id);
@@ -2899,6 +2904,8 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!memcg->use_hierarchy)
break;
}
+ rcu_read_unlock();
+
memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
memcg_free_cache_id(kmemcg_id);
@@ -4305,24 +4312,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
return 0;
}
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
- * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- * move charge. if @target is not NULL, the page is stored in target->page
- * with extra refcnt got(Callers should handle it).
- * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- * target for charge migration. if @target is not NULL, the entry is stored
- * in target->ent.
- *
- * Called with pte lock held.
- */
union mc_target {
struct page *page;
swp_entry_t ent;
@@ -4511,6 +4500,25 @@ out:
return ret;
}
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ * move charge. if @target is not NULL, the page is stored in target->page
+ * with extra refcnt got(Callers should handle it).
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
diff --git a/mm/memory.c b/mm/memory.c
index a1b93d9e4449..15322b73636b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -63,6 +63,7 @@
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -2492,8 +2493,6 @@ void unmap_mapping_range(struct address_space *mapping,
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
-
- /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2825,7 +2824,8 @@ oom:
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags,
- struct page *cow_page, struct page **page)
+ struct page *cow_page, struct page **page,
+ void **entry)
{
struct vm_fault vmf;
int ret;
@@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- if (!vmf.page)
- goto out;
+ if (ret & VM_FAULT_DAX_LOCKED) {
+ *entry = vmf.entry;
+ return ret;
+ }
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
@@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
- out:
*page = vmf.page;
return ret;
}
@@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page, *new_page;
+ void *fault_entry;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
@@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
- ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
+ &fault_entry);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
- if (fault_page)
+ if (!(ret & VM_FAULT_DAX_LOCKED))
copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
- if (fault_page) {
+ if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
} else {
- /*
- * The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
- */
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ dax_unlock_mapping_entry(vma->vm_file->f_mapping,
+ pgoff);
}
goto uncharge_out;
}
@@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
- if (fault_page) {
+ if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
} else {
- /*
- * The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
- */
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
}
return ret;
uncharge_out:
@@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
- ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index caf2a14c37ad..e3cbdcaff2a5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-void register_page_bootmem_info_node(struct pglist_data *pgdat)
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
unsigned long i, pfn, end_pfn, nr_pages;
int node = pgdat->node_id;
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
* multiple nodes we check that this pfn does not already
* reside in some other nodes.
*/
- if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
register_page_bootmem_info_section(pfn);
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index d3d9a94ca031..de2c1769cc68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}
-static unsigned long do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
@@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;
/* Ok, looks good - let it rip. */
- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+ if (do_brk(oldbrk, newbrk-oldbrk) < 0)
goto out;
set_brk:
@@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-static unsigned long do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
len = PAGE_ALIGN(len);
if (!len)
- return addr;
+ return 0;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
@@ -2703,13 +2703,13 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
- return addr;
+ return 0;
}
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
- unsigned long ret;
+ int ret;
bool populate;
if (down_write_killable(&mm->mmap_sem))
@@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
ret = do_brk(addr, len);
populate = ((mm->def_flags & VM_LOCKED) != 0);
up_write(&mm->mmap_sem);
- if (populate)
+ if (populate && !ret)
mm_populate(addr, len);
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..c2e58880207f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm)
}
}
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5bb2f7698ad7..acbc432d1a52 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,13 +443,29 @@ static bool __oom_reap_task(struct task_struct *tsk)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- struct mm_struct *mm;
+ struct mm_struct *mm = NULL;
struct task_struct *p;
struct zap_details details = {.check_swap_entries = true,
.ignore_dirty = true};
bool ret = true;
/*
+ * We have to make sure to not race with the victim exit path
+ * and cause premature new oom victim selection:
+ * __oom_reap_task exit_mm
+ * atomic_inc_not_zero
+ * mmput
+ * atomic_dec_and_test
+ * exit_oom_victim
+ * [...]
+ * out_of_memory
+ * select_bad_process
+ * # no TIF_MEMDIE task selects new victim
+ * unmap_page_range # frees some memory
+ */
+ mutex_lock(&oom_lock);
+
+ /*
* Make sure we find the associated mm_struct even when the particular
* thread has already terminated and cleared its mm.
* We might have race with exit path so consider our work done if there
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
*/
p = find_lock_task_mm(tsk);
if (!p)
- return true;
+ goto unlock_oom;
mm = p->mm;
if (!atomic_inc_not_zero(&mm->mm_users)) {
task_unlock(p);
- return true;
+ goto unlock_oom;
}
task_unlock(p);
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
- goto out;
+ goto unlock_oom;
}
tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
* to release its memory.
*/
set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+ mutex_unlock(&oom_lock);
/*
* Drop our reference but make sure the mmput slow path is called from a
* different context because we shouldn't risk we get stuck there and
* put the oom_reaper out of the way.
*/
- mmput_async(mm);
+ if (mm)
+ mmput_async(mm);
return ret;
}
@@ -607,12 +625,8 @@ void try_oom_reaper(struct task_struct *tsk)
if (atomic_read(&mm->mm_users) > 1) {
rcu_read_lock();
for_each_process(p) {
- bool exiting;
-
if (!process_shares_mm(p, mm))
continue;
- if (same_thread_group(p, tsk))
- continue;
if (fatal_signal_pending(p))
continue;
@@ -620,10 +634,7 @@ void try_oom_reaper(struct task_struct *tsk)
* If the task is exiting make sure the whole thread group
* is exiting and cannot acces mm anymore.
*/
- spin_lock_irq(&p->sighand->siglock);
- exiting = signal_group_exit(p->signal);
- spin_unlock_irq(&p->sighand->siglock);
- if (exiting)
+ if (signal_group_exit(p->signal))
continue;
/* Give up */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b9956fdee8f5..e2481949494c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -373,8 +373,9 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
unsigned long bytes = vm_dirty_bytes;
unsigned long bg_bytes = dirty_background_bytes;
- unsigned long ratio = vm_dirty_ratio;
- unsigned long bg_ratio = dirty_background_ratio;
+ /* convert ratios to per-PAGE_SIZE for higher precision */
+ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
+ unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
unsigned long thresh;
unsigned long bg_thresh;
struct task_struct *tsk;
@@ -386,26 +387,28 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
- * globally available memory.
+ * globally available memory. As the ratios are in
+ * per-PAGE_SIZE, they can be obtained by dividing bytes by
+ * number of pages.
*/
if (bytes)
- ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
- global_avail, 100UL);
+ ratio = min(DIV_ROUND_UP(bytes, global_avail),
+ PAGE_SIZE);
if (bg_bytes)
- bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
- global_avail, 100UL);
+ bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
+ PAGE_SIZE);
bytes = bg_bytes = 0;
}
if (bytes)
thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
- thresh = (ratio * available_memory) / 100;
+ thresh = (ratio * available_memory) / PAGE_SIZE;
if (bg_bytes)
bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
- bg_thresh = (bg_ratio * available_memory) / 100;
+ bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8f3bfc435ee..6903b695ebae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
return;
page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
INIT_LIST_HEAD(&page->lru);
@@ -673,6 +676,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
return;
page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+
__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
set_page_private(page, 0);
@@ -2609,11 +2615,12 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
page = list_last_entry(list, struct page, lru);
else
page = list_first_entry(list, struct page, lru);
- } while (page && check_new_pcp(page));
- __dec_zone_state(zone, NR_ALLOC_BATCH);
- list_del(&page->lru);
- pcp->count--;
+ __dec_zone_state(zone, NR_ALLOC_BATCH);
+ list_del(&page->lru);
+ pcp->count--;
+
+ } while (check_new_pcp(page));
} else {
/*
* We most definitely don't want callers attempting to
@@ -3023,6 +3030,7 @@ reset_fair:
apply_fair = false;
fair_skipped = false;
reset_alloc_batches(ac->preferred_zoneref->zone);
+ z = ac->preferred_zoneref;
goto zonelist_scan;
}
@@ -3596,6 +3604,17 @@ retry:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * Reset the zonelist iterators if memory policies can be ignored.
+ * These allocations are high priority and system rather than user
+ * orientated.
+ */
+ if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+ }
+
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3604,12 +3623,6 @@ retry:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
- /*
- * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
- * the allocation is high priority and these type of
- * allocations are system rather than user orientated
- */
- ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
page = get_page_from_freelist(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
if (page)
@@ -3808,7 +3821,11 @@ retry_cpuset:
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
- /* The preferred zone is used for statistics later */
+ /*
+ * The preferred zone is used for statistics but crucially it is
+ * also used as the starting point for the zonelist iterator. It
+ * may get reset for allocations that ignore memory policies.
+ */
ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
ac.high_zoneidx, ac.nodemask);
if (!ac.preferred_zoneref) {
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e64f7fe..44a4c029c8e7 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
+ *
+ * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (pfn_to_nid(pfn) != nid)
+ if (early_pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 792b56da13d8..c6cda3e36212 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -55,6 +55,8 @@ void __reset_page_owner(struct page *page, unsigned int order)
for (i = 0; i < (1 << order); i++) {
page_ext = lookup_page_ext(page + i);
+ if (unlikely(!page_ext))
+ continue;
__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
}
@@ -62,6 +64,7 @@ void __reset_page_owner(struct page *page, unsigned int order)
void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
{
struct page_ext *page_ext = lookup_page_ext(page);
+
struct stack_trace trace = {
.nr_entries = 0,
.max_entries = ARRAY_SIZE(page_ext->trace_entries),
@@ -69,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
.skip = 3,
};
+ if (unlikely(!page_ext))
+ return;
+
save_stack_trace(&trace);
page_ext->order = order;
@@ -82,6 +88,8 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
page_ext->last_migrate_reason = reason;
}
@@ -89,6 +97,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ /*
+ * The caller just returns 0 if no valid gfp