summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/backing-dev.c5
-rw-r--r--mm/bootmem.c39
-rw-r--r--mm/huge_memory.c30
-rw-r--r--mm/hugetlb.c240
-rw-r--r--mm/memcontrol.c97
-rw-r--r--mm/memory-failure.c22
-rw-r--r--mm/memory.c23
-rw-r--r--mm/memory_hotplug.c129
-rw-r--r--mm/mm_init.c47
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nobootmem.c35
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/page_alloc.c294
-rw-r--r--mm/page_io.c50
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c7
-rw-r--r--mm/shmem.c45
-rw-r--r--mm/slab_common.c4
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap.c106
-rw-r--r--mm/swapfile.c55
-rw-r--r--mm/truncate.c117
-rw-r--r--mm/vmalloc.c103
-rw-r--r--mm/vmscan.c585
27 files changed, 1410 insertions, 657 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06285b7..7e28ecfa8aa4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -173,7 +173,7 @@ config HAVE_BOOTMEM_INFO_NODE
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
@@ -477,3 +477,15 @@ config FRONTSWAP
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
+
+config MEM_SOFT_DIRTY
+ bool "Track memory changes"
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+ select PROC_PAGE_MONITOR
+ help
+ This option enables memory changes tracking by introducing a
+ soft-dirty bit on pte-s. This bit it set when someone writes
+ into a page just as regular dirty bit, but unlike the latter
+ it can be cleared by hands.
+
+ See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
unsigned int cap)
{
- char tmp[32];
int err;
bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
if (err)
return err;
- sprintf(tmp, "%.28s%s", name, "-%d");
- err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+ err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+ atomic_long_inc_return(&bdi_seq));
if (err) {
bdi_destroy(bdi);
return err;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- /*
- * In free_area_init_core(), highmem zone's managed_pages is set to
- * present_pages, and bootmem allocator doesn't allocate from highmem
- * zones. So there's no need to recalculate managed_pages because all
- * highmem pages will be managed by the buddy system. Here highmem
- * zone also includes highmem movable zone.
- */
+ if (reset_managed_pages_done)
+ return;
+
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- if (!is_highmem(z))
- z->managed_pages = 0;
+ z->managed_pages = 0;
}
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
{
- register_page_bootmem_info_node(pgdat);
- reset_node_lowmem_managed_pages(pgdat);
- return free_all_bootmem_core(pgdat->bdata);
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_managed_pages(pgdat);
+ reset_managed_pages_done = 1;
}
/**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
- struct pglist_data *pgdat;
- for_each_online_pgdat(pgdat)
- reset_node_lowmem_managed_pages(pgdat);
+ reset_all_zones_managed_pages();
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
+ totalram_pages += total_pages;
+
return total_pages;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pmd_t entry;
entry = mk_huge_pmd(page, vma);
page_add_new_anon_rmap(page, vma, haddr);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
mm->nr_ptes++;
return true;
}
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- pgtable_trans_huge_deposit(dst_mm, pgtable);
dst_mm->nr_ptes++;
ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
spin_unlock(&mm->page_table_lock);
}
out:
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- pgtable_trans_huge_deposit(mm, pgtable);
spin_unlock(&mm->page_table_lock);
*hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf73a551..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
hstate = hstate_vma(vma);
- return 1UL << (hstate->order + PAGE_SHIFT);
+ return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -1246,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
* side-effects, like CommitLimit going negative.
*/
if (h->order > (MAX_ORDER - 1))
- totalram_pages += 1 << h->order;
+ adjust_managed_page_count(page, 1 << h->order);
}
}
@@ -2931,15 +2948,6 @@ out_mutex:
return ret;
}
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -3169,6 +3177,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&mm->page_table_lock);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
+ }
+ return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..2e851f453814 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,6 +1148,58 @@ skip_node:
return NULL;
}
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+ /*
+ * When a group in the hierarchy below root is destroyed, the
+ * hierarchy iterator can no longer be trusted since it might
+ * have pointed to the destroyed group. Invalidate it.
+ */
+ atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *root,
+ int *sequence)
+{
+ struct mem_cgroup *position = NULL;
+ /*
+ * A cgroup destruction happens in two stages: offlining and
+ * release. They are separated by a RCU grace period.
+ *
+ * If the iterator is valid, we may still race with an
+ * offlining. The RCU lock ensures the object won't be
+ * released, tryget will fail if we lost the race.
+ */
+ *sequence = atomic_read(&root->dead_count);
+ if (iter->last_dead_count == *sequence) {
+ smp_rmb();
+ position = iter->last_visited;
+ if (position && !css_tryget(&position->css))
+ position = NULL;
+ }
+ return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *last_visited,
+ struct mem_cgroup *new_position,
+ int sequence)
+{
+ if (last_visited)
+ css_put(&last_visited->css);
+ /*
+ * We store the sequence count from the time @last_visited was
+ * loaded successfully instead of rereading it here so that we
+ * don't lose destruction events in between. We could have
+ * raced with the destruction of @new_position after all.
+ */
+ iter->last_visited = new_position;
+ smp_wmb();
+ iter->last_dead_count = sequence;
+}
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
{
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
- unsigned long uninitialized_var(dead_count);
if (mem_cgroup_disabled())
return NULL;
@@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ int uninitialized_var(seq);
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
}
- /*
- * If the dead_count mismatches, a destruction
- * has happened or is happening concurrently.
- * If the dead_count matches, a destruction
- * might still happen concurrently, but since
- * we checked under RCU, that destruction
- * won't free the object until we release the
- * RCU reader lock. Thus, the dead_count
- * check verifies the pointer is still valid,
- * css_tryget() verifies the cgroup pointed to
- * is alive.
- */
- dead_count = atomic_read(&root->dead_count);
- if (dead_count == iter->last_dead_count) {
- smp_rmb();
- last_visited = iter->last_visited;
- if (last_visited &&
- !css_tryget(&last_visited->css))
- last_visited = NULL;
- }
+ last_visited = mem_cgroup_iter_load(iter, root, &seq);
}
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- if (last_visited)
- css_put(&last_visited->css);
-
- iter->last_visited = memcg;
- smp_wmb();
- iter->last_dead_count = dead_count;
+ mem_cgroup_iter_update(iter, last_visited, memcg, seq);
if (!memcg)
iter->generation++;
@@ -1448,11 +1476,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return ret;
}
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+ const struct mem_cgroup *memcg)
{
- int ret;
struct mem_cgroup *curr = NULL;
struct task_struct *p;
+ bool ret;
p = find_lock_task_mm(task);
if (p) {
@@ -1464,14 +1493,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
- task_lock(task);
+ rcu_read_lock();
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
- task_unlock(task);
+ rcu_read_unlock();
}
if (!curr)
- return 0;
+ return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -6317,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
struct mem_cgroup *parent = memcg;
while ((parent = parent_mem_cgroup(parent)))
- atomic_inc(&parent->dead_count);
+ mem_cgroup_iter_invalidate(parent);
/*
* if the root memcg is not hierarchical we have to check it
* explicitely.
*/
if (!root_mem_cgroup->use_hierarchy)
- atomic_inc(&root_mem_cgroup->dead_count);
+ mem_cgroup_iter_invalidate(root_mem_cgroup);
}
static void mem_cgroup_css_offline(struct cgroup *cont)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/*
* Isolate the page, so that it doesn't get reallocated if it
- * was free.
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
*/
set_migratetype_isolate(p, true);
/*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p, MIGRATE_MOVABLE);
unlock_memory_hotplug();
return ret;
}
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
atomic_long_add(1 << compound_trans_order(hpage),
&num_poisoned_pages);
}
- /* keep elevated page count for bad page */
return ret;
}
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
- /* keep elevated page count for bad page */
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in pagevec and actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page should be freed back to buddy before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
atomic_long_inc(&num_poisoned_pages);
}
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 61a262b08e53..b68812d682b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
-unsigned long num_physpages;
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
*/
void * high_memory;
-EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
/*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
+ unsigned long range_start = addr;
again:
init_rss_vec(rss);
@@ -1206,12 +1205,14 @@ again:
force_flush = 0;
#ifdef HAVE_GENERIC_MMU_GATHER
- tlb->start = addr;
- tlb->end = end;
+ tlb->start = range_start;
+ tlb->end = addr;
#endif
tlb_flush_mmu(tlb);
- if (addr != end)
+ if (addr != end) {
+ range_start = addr;
goto again;
+ }
}
return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
- vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+ vea = vba + vma_pages(vma) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
@@ -4201,7 +4202,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
up_read(&mm->mmap_sem);
}
-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void)
{
/*
@@ -4213,13 +4214,17 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;
- might_sleep();
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
- if (!in_atomic() && current->mm)
+ if (in_atomic())
+ return;
+
+ __might_sleep(__FILE__, __LINE__, 0);
+
+ if (current->mm)
might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ad92b46753e..f5ba127b2051 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->end = start + size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
- printk("System RAM resource %pR cannot be added\n", res);
+ pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
res = NULL;
}
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page,
atomic_inc(&page->_count);
}
-/* reference to __meminit __free_pages_bootmem is valid
- * so use __ref to tell modpost not to generate a warning */
-void __ref put_page_bootmem(struct page *page)
+void put_page_bootmem(struct page *page)
{
unsigned long type;
- static DEFINE_MUTEX(ppb_lock);
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
-
- /*
- * Please refer to comment for __free_pages_bootmem()
- * for why we serialize here.
- */
- mutex_lock(&ppb_lock);
- __free_pages_bootmem(page, 0);
- mutex_unlock(&ppb_lock);
- totalram_pages++;
+ free_reserved_page(page);
}
-
}
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
/* can't move pfns which are higher than @z2 */
if (end_pfn > zone_end_pfn(z2))
goto out_fail;
- /* the move out part mast at the left most of @z2 */
+ /* the move out part must be at the left most of @z2 */
if (start_pfn > z2->zone_start_pfn)
goto out_fail;
/* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
void __online_page_set_limits(struct page *page)
{
- unsigned long pfn = page_to_pfn(page);
-
- if (pfn >= num_physpages)
- num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);
void __online_page_increment_counters