author     Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 20:49:49 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 20:49:49 -0700
commit     d34fc1adf01ff87026da85fb972dc259dc347540 (patch)
tree       27356073d423187157b7cdb69da32b53102fb9e7 /mm
parent     1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33 (diff)
parent     d2cd9ede6e193dd7d88b6d27399e96229a551b19 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits
 - DAX updates
 - OCFS2
 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
  mm,fork: introduce MADV_WIPEONFORK
  x86,mpx: make mpx depend on x86-64 to free up VMA flag
  mm: add /proc/pid/smaps_rollup
  mm: hugetlb: clear target sub-page last when clearing huge page
  mm: oom: let oom_reap_task and exit_mmap run concurrently
  swap: choose swap device according to numa node
  mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
  mm, oom: do not rely on TIF_MEMDIE for memory reserves access
  z3fold: use per-cpu unbuddied lists
  mm, swap: don't use VMA based swap readahead if HDD is used as swap
  mm, swap: add sysfs interface for VMA based swap readahead
  mm, swap: VMA based swap readahead
  mm, swap: fix swap readahead marking
  mm, swap: add swap readahead hit statistics
  mm/vmalloc.c: don't reinvent the wheel but use existing llist API
  mm/vmstat.c: fix wrong comment
  selftests/memfd: add memfd_create hugetlbfs selftest
  mm/shmem: add hugetlbfs support to memfd_create()
  mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
  mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
  ...
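One user-visible addition in this series is "mm/shmem: add hugetlbfs support to memfd_create()". The sketch below is not part of the patch set; it only illustrates how a userspace program might request a hugetlbfs-backed memfd, with the MFD_HUGETLB fallback define carrying the uapi value added by the series in case the installed libc headers predate it.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U	/* uapi value added by this series */
#endif

int main(void)
{
	/* memfd_create(2); older libcs may lack the wrapper, hence the raw syscall. */
	int fd = syscall(SYS_memfd_create, "hugepage-demo", MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create(MFD_HUGETLB)");
		return 1;
	}
	/* The fd behaves like a hugetlbfs file: ftruncate() and mmap() it as usual. */
	close(fd);
	return 0;
}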
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               1
-rw-r--r--  mm/filemap.c            67
-rw-r--r--  mm/gup.c                 2
-rw-r--r--  mm/huge_memory.c        32
-rw-r--r--  mm/hugetlb.c            65
-rw-r--r--  mm/internal.h           12
-rw-r--r--  mm/ksm.c                 2
-rw-r--r--  mm/madvise.c            13
-rw-r--r--  mm/memcontrol.c         40
-rw-r--r--  mm/memory.c            135
-rw-r--r--  mm/memory_hotplug.c    114
-rw-r--r--  mm/mmap.c               46
-rw-r--r--  mm/mremap.c             13
-rw-r--r--  mm/nommu.c               4
-rw-r--r--  mm/oom_kill.c           24
-rw-r--r--  mm/page-writeback.c      4
-rw-r--r--  mm/page_alloc.c        438
-rw-r--r--  mm/page_ext.c            6
-rw-r--r--  mm/page_idle.c           2
-rw-r--r--  mm/page_io.c            21
-rw-r--r--  mm/page_owner.c         68
-rw-r--r--  mm/shmem.c             206
-rw-r--r--  mm/slub.c               52
-rw-r--r--  mm/sparse-vmemmap.c     11
-rw-r--r--  mm/sparse.c             10
-rw-r--r--  mm/swap.c               24
-rw-r--r--  mm/swap_state.c        314
-rw-r--r--  mm/swapfile.c          362
-rw-r--r--  mm/userfaultfd.c        48
-rw-r--r--  mm/util.c                2
-rw-r--r--  mm/vmalloc.c            20
-rw-r--r--  mm/vmscan.c            113
-rw-r--r--  mm/vmstat.c             15
-rw-r--r--  mm/z3fold.c            479
-rw-r--r--  mm/zsmalloc.c            8
35 files changed, 1868 insertions, 905 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
depends on ARCH_HAS_ZONE_DEVICE
+ select RADIX_TREE_MULTIORDER
help
Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e01cb6e5173..9d21afd692b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
return -EEXIST;
mapping->nrexceptional--;
- if (!dax_mapping(mapping)) {
- if (shadowp)
- *shadowp = p;
- } else {
- /* DAX can replace empty locked entry with a hole */
- WARN_ON_ONCE(p !=
- dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- /* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index, p,
- true);
- }
+ if (shadowp)
+ *shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
workingset_update_node, mapping);
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- bool ret;
+ struct page *page;
if (end_byte < start_byte)
return false;
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
if (mapping->nrpages == 0)
return false;
- pagevec_init(&pvec, 0);
- if (!pagevec_lookup(&pvec, mapping, index, 1))
+ if (!find_get_pages_range(mapping, &index, end, 1, &page))
return false;
- ret = (pvec.pages[0]->index <= end);
- pagevec_release(&pvec);
- return ret;
+ put_page(page);
+ return true;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -1564,23 +1552,29 @@ export:
}
/**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping. The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
*
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
*/
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1627,11 +1624,25 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *start = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
rcu_read_unlock();
+
return ret;
}
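The kernel-doc above describes the new range-bounded gang lookup: find_get_pages_range() fills @pages, takes a reference on each returned page, and advances *@start past the last page returned. A hypothetical caller-side sketch of the usual iteration pattern follows; it is not code from this patch, and walk_mapping_range is an invented helper name.

#include <linux/pagemap.h>
#include <linux/pagevec.h>

/* Hypothetical helper, not from this patch: visit every cached page in
 * [start, end] of @mapping using the new range-bounded lookup. */
static void walk_mapping_range(struct address_space *mapping,
			       pgoff_t start, pgoff_t end)
{
	struct page *pages[PAGEVEC_SIZE];
	pgoff_t index = start;
	unsigned int nr, i;

	while ((nr = find_get_pages_range(mapping, &index, end,
					  PAGEVEC_SIZE, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... look at pages[i] ... */
			put_page(pages[i]);	/* drop the lookup's reference */
		}
		if (nr < PAGEVEC_SIZE)
			break;			/* fewer than asked for: range exhausted */
	}
}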
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3644ff918434..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
NULL,
};
-static struct attribute_group hugepage_attr_group = {
+static const struct attribute_group hugepage_attr_group = {
.attrs = hugepage_attr,
};
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
goto release;
}
- clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (page_trans_huge_mapcount(page, NULL) == 1) {
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ put_page(page);
+ }
+ if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
+ unlock_page(page);
goto out_unlock;
}
+ unlock_page(page);
get_page(page);
spin_unlock(vmf->ptl);
alloc:
@@ -1291,7 +1305,7 @@ alloc:
count_vm_event(THP_FAULT_ALLOC);
if (!page)
- clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
@@ -2467,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (PageWriteback(page))
+ return -EBUSY;
+
if (PageAnon(head)) {
/*
* The caller does not necessarily hold an mmap_sem that would
@@ -2544,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
- ret = 0;
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ ret = split_swap_cluster(entry);
+ } else
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..34625b257128 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
}
static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, gfp_t gfp_mask)
{
unsigned long end_pfn = start_pfn + nr_pages;
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- GFP_KERNEL);
+ gfp_mask);
}
static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, unsigned int order)
+static struct page *alloc_gigantic_page(int nid, struct hstate *h)
{
+ unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
unsigned long ret, pfn, flags;
- struct zone *z;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ gfp_t gfp_mask;
- z = NODE_DATA(nid)->node_zones;
- for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
- spin_lock_irqsave(&z->lock, flags);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ spin_lock_irqsave(&zone->lock, flags);
- pfn = ALIGN(z->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
* spinning on this lock, it may win the race
* and cause alloc_contig_range() to fail...
*/
- spin_unlock_irqrestore(&z->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
if (!ret)
return pfn_to_page(pfn);
- spin_lock_irqsave(&z->lock, flags);
+ spin_lock_irqsave(&zone->lock, flags);
}
pfn += nr_pages;
}
- spin_unlock_irqrestore(&z->lock, flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
struct page *page;
- page = alloc_gigantic_page(nid, huge_page_order(h));
+ page = alloc_gigantic_page(nid, h);
if (page) {
prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, nid);
@@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = {
NULL,
};
-static struct attribute_group hstate_attr_group = {
+static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
- struct attribute_group *hstate_attr_group)
+ const struct attribute_group *hstate_attr_group)
{
int retval;
int hi = hstate_index(h);
@@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = {
NULL,
};
-static struct attribute_group per_node_hstate_attr_group = {
+static const struct attribute_group per_node_hstate_attr_group = {
.attrs = per_node_hstate_attrs,
};
@@ -4600,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
return pte;
}
+/*
+ * huge_pte_offset() - Walk the page table to resolve the hugepage
+ * entry at address @addr
+ *
+ * Return: Pointer to page table or swap entry (PUD or PMD) for
+ * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * size @sz doesn't match the hugepage size at this level of the page
+ * table.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
@@ -4614,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
+
pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
- if (pud_huge(*pud))
+ /* hugepage or swap? */
+ if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+
pmd = pmd_offset(pud, addr);
- return (pte_t *) pmd;
+ if (sz != PMD_SIZE && pmd_none(*pmd))
+ return NULL;
+ /* hugepage or swap? */
+ if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ return NULL;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
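The new huge_pte_offset() kernel-doc above widens the contract: the function can now return a pointer to a non-present (swap or migration) PUD/PMD entry, or NULL when no entry of the requested size exists. A hypothetical caller sketch, not from this patch, of how such a return value is typically classified (inspect_huge_entry is an invented helper name):

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Hypothetical helper, not from this patch: classify the hugetlb entry that
 * backs @addr under the post-series huge_pte_offset() contract. */
static void inspect_huge_entry(struct vm_area_struct *vma, unsigned long addr)
{
	struct hstate *h = hstate_vma(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *ptep, entry;

	ptep = huge_pte_offset(mm, addr & huge_page_mask(h), huge_page_size(h));
	if (!ptep)
		return;			/* no entry of the requested size */

	entry = huge_ptep_get(ptep);
	if (huge_pte_none(entry))
		return;			/* empty slot */
	if (!pte_present(entry))
		return;			/* swap/migration entry: needs special handling */

	/* A present hugepage mapping: the head page is pte_page(entry). */
}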
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM 0x08
+#else
+#define ALLOC_OOM ALLOC_NO_WATERMARKS
+#endif
+
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
}
+void setup_zone_pageset(struct zone *zone);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index db20f8436bc3..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
NULL,
};
-static struct attribute_group ksm_attr_group = {
+static const struct attribute_group ksm_attr_group = {
.attrs = ksm_attrs,
.name = "ksm",
};
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d7d1e5ddba9..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
}
new_flags &= ~VM_DONTCOPY;
break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
+ case MADV_WIPEONFORK:
+ case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
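The madvise_behavior() hunk above restricts MADV_WIPEONFORK to private anonymous mappings and lets MADV_KEEPONFORK clear it again. A hedged userspace sketch of the intended semantics (the child sees zero-filled memory after fork() instead of inheriting the parent's data); it is not taken from the patch set, and the fallback define uses the uapi value introduced by this series.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* uapi value introduced by this series */
#endif

int main(void)
{
	/* Must be anonymous and private; file-backed or shared mappings get -EINVAL. */
	char *secret = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	strcpy(secret, "parent-only data");
	madvise(secret, 4096, MADV_WIPEONFORK);

	if (fork() == 0) {
		/* Child sees zero-filled memory instead of the parent's data. */
		printf("child sees: \"%s\"\n", secret);
		_exit(0);
	}
	return 0;
}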
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..ad15850ee157 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threshold and synchronization as vmstat[] should be
* implemented.
+ *
+ * The parameter idx can be of type enum memcg_event_item or vm_event_item.
*/
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- enum memcg_event_item event)
+ int event)
{
unsigned long val = 0;
int cpu;
@@ -1915,7 +1917,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ if (unlikely(tsk_is_oom_victim(current) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
goto force;
@@ -4319,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
+ memcg->low = 0;
+
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4635,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (!ret || !target)
put_page(page);
}
- /* There is a swap entry and a page doesn't exist or isn't charged */
- if (ent.val && !ret &&
+ /*
+ * There is a swap entry and a page doesn't exist or isn't charged.
+ * But we cannot move a tail-page in a THP.
+ */
+ if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
@@ -4647,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
* Caller should make sure that pmd_trans_huge(pmd) is true.
*/
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5423,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page->mem_cgroup)
+ if (compound_head(page)->mem_cgroup)
goto out;
if (do_swap_account) {
@@ -5906,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+ nr_entries = hpage_nr_pages(page);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+ nr_entries);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, 1);
+ mem_cgroup_swap_statistics(swap_memcg, nr_entries);
page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+ page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, 1);
- page_counter_uncharge(&memcg->memsw, 1);
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
}
/*
@@ -5948,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, false, -1);
+ mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+ -nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
unmap_single_vma(&tlb, vma, start, end, NULL);
+
+ /*
+ * zap_page_range does not specify whether mmap_sem should be
+ * held for read or write. That allows parallel zap_page_range
+ * operations to unmap a PTE and defer a flush meaning that
+ * this call observes pte_none and fails to flush the TLB.
+ * Rather than adding a complex API, ensure that no stale
+ * TLB entries exist when this call returns.
+ */
+ flush_tlb_range(vma, start, end);
+ }
+
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1676,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1688,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
goto out;
retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match.
+ */
+ if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ goto out_unlock;
+ entry = *pte;
+ goto out_mkwrite;
+ } else
+ goto out_unlock;
+ }
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1766,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1802,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, pgprot);
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, false);
+
}
EXPORT_SYMBOL(vm_insert_mixed);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2571,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
- int total_mapcount;
+ int total_map_swapcount;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
- if (reuse_swap_page(vmf->page, &total_mapcount)) {
- if (total_mapcount == 1) {
+ if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
@@ -2704,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page, *swapcache;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
+ struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
+ bool vma_readahead = swap_use_vma_readahead();
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (vma_readahead)
+ page = swap_readahead_detect(vmf, &swap_ra);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+ if (page)
+ put_page(page);
goto out;
+ }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
@@ -2729,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry);
+ if (!page)
+ page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+ vmf->address);
if (!page) {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -4356,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
}
}
void clear_huge_page(struct page *page,