summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-07-31 19:25:39 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-07-31 19:25:39 -0700
commitac694dbdbc403c00e2c14d10bc7b8412cc378259 (patch)
treee37328cfbeaf43716dd5914cad9179e57e84df76 /mm
parenta40a1d3d0a2fd613fdec6d89d3c053268ced76ed (diff)
parent437ea90cc3afdca5229b41c6b1d38c4842756cb9 (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge Andrew's second set of patches: - MM - a few random fixes - a couple of RTC leftovers * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits) rtc/rtc-88pm80x: remove unneed devm_kfree rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables tmpfs: distribute interleave better across nodes mm: remove redundant initialization mm: warn if pg_data_t isn't initialized with zero mips: zero out pg_data_t when it's allocated memcg: gix memory accounting scalability in shrink_page_list mm/sparse: remove index_init_lock mm/sparse: more checks on mem_section number mm/sparse: optimize sparse_index_alloc memcg: add mem_cgroup_from_css() helper memcg: further prevent OOM with too many dirty pages memcg: prevent OOM with too many dirty pages mm: mmu_notifier: fix freed page still mapped in secondary MMU mm: memcg: only check anon swapin page charges for swap cache mm: memcg: only check swap cache pages for repeated charging mm: memcg: split swapin charge function into private and public part mm: memcg: remove needless !mm fixup to init_mm when charging mm: memcg: remove unneeded shmem charge type ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/Makefile8
-rw-r--r--mm/backing-dev.c20
-rw-r--r--mm/compaction.c63
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/hugetlb.c195
-rw-r--r--mm/hugetlb_cgroup.c418
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h8
-rw-r--r--mm/memblock.c35
-rw-r--r--mm/memcontrol.c390
-rw-r--r--mm/memory-failure.c17
-rw-r--r--mm/memory.c9
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/migrate.c81
-rw-r--r--mm/mmap.c5
-rw-r--r--mm/mmu_notifier.c45
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/oom_kill.c223
-rw-r--r--mm/page_alloc.c318
-rw-r--r--mm/page_cgroup.c2
-rw-r--r--mm/page_io.c145
-rw-r--r--mm/page_isolation.c93
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/slab.c216
-rw-r--r--mm/slub.c30
-rw-r--r--mm/sparse.c29
-rw-r--r--mm/swap.c52
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/swapfile.c145
-rw-r--r--mm/vmalloc.c16
-rw-r--r--mm/vmscan.c175
-rw-r--r--mm/vmstat.c1
35 files changed, 2043 insertions, 770 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6f..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
config NO_BOOTMEM
boolean
+config MEMORY_ISOLATION
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
+ select MEMORY_ISOLATION
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 8e81fe263c94..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o percpu.o \
- compaction.o slab_common.o $(mmu-y)
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+ compaction.o $(mmu-y)
obj-y += init-mm.o
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3387aea11209..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -886,3 +886,23 @@ out:
return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char kbuf[] = "0\n";
+
+ if (*ppos) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+ return -EFAULT;
+ printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+ table->procname);
+
+ *lenp = 2;
+ *ppos += *lenp;
+ return 2;
+}
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d9528539..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ /*
+ * Skip ahead if another thread is compacting in the area
+ * simultaneously. If we wrapped around, we can only skip
+ * ahead if zone->compact_cached_free_pfn also wrapped to
+ * above our starting point.
+ */
+ if (cc->order > 0 && (!cc->wrapped ||
+ zone->compact_cached_free_pfn >
+ cc->start_free_pfn))
+ pfn = min(pfn, zone->compact_cached_free_pfn);
+
if (!pfn_valid(pfn))
continue;
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
* looking for free pages, the search will restart here as
* page migration may have returned some pages to the allocator
*/
- if (isolated)
+ if (isolated) {
high_pfn = max(high_pfn, pfn);
+ if (cc->order > 0)
+ zone->compact_cached_free_pfn = high_pfn;
+ }
}
/* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
return ISOLATE_SUCCESS;
}
+/*
+ * Returns the start pfn of the last page block in a zone. This is the starting
+ * point for full compaction of a zone. Compaction searches for free pages from
+ * the end of each zone, while isolate_freepages_block scans forward inside each
+ * page block.
+ */
+static unsigned long start_free_pfn(struct zone *zone)
+{
+ unsigned long free_pfn;
+ free_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ free_pfn &= ~(pageblock_nr_pages-1);
+ return free_pfn;
+}
+
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
- /* Compaction run completes if the migrate and free scanner meet */
- if (cc->free_pfn <= cc->migrate_pfn)
+ /*
+ * A full (order == -1) compaction run starts at the beginning and
+ * end of a zone; it completes when the migrate and free scanner meet.
+ * A partial (order > 0) compaction can start with the free scanner
+ * at a random point in the zone, and may have to restart.
+ */
+ if (cc->free_pfn <= cc->migrate_pfn) {
+ if (cc->order > 0 && !cc->wrapped) {
+ /* We started partway through; restart at the end. */
+ unsigned long free_pfn = start_free_pfn(zone);
+ zone->compact_cached_free_pfn = free_pfn;
+ cc->free_pfn = free_pfn;
+ cc->wrapped = 1;
+ return COMPACT_CONTINUE;
+ }
+ return COMPACT_COMPLETE;
+ }
+
+ /* We wrapped around and ended up where we started. */
+ if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
return COMPACT_COMPLETE;
/*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
- cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
- cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+ if (cc->order > 0) {
+ /* Incremental compaction. Start where the last one stopped. */
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->start_free_pfn = cc->free_pfn;
+ } else {
+ /* Order == -1 starts at the end of the zone. */
+ cc->free_pfn = start_free_pfn(zone);
+ }
migrate_prep_local();
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
- if (!mapping->a_ops->readpage) {
- ret = -EINVAL;
- break;
- }
-
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
-
- ret = force_page_cache_readahead(mapping, file,
- start_index,
- nrpages);
- if (ret > 0)
- ret = 0;
+
+ /*
+ * Ignore return value because fadvise() shall return
+ * success even if filesystem can't retrieve a hint,
+ */
+ force_page_cache_readahead(mapping, file, start_index,
+ nrpages);
break;
case POSIX_FADV_NOREUSE:
break;
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+ int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+
static void flush_all_zero_pkmaps(void)
{
int i;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
- list_del(&page->lru);
+ list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1 << PG_writeback);
}
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
+ INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
spin_lock(&hugetlb_lock);
if (page) {
+ INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
*/
@@ -993,7 +1001,6 @@ retry:
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
/* Free unnecessary surplus pages to the buddy allocator */
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- list_del(&page->lru);
put_page(page);
}
}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct page *page;
long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ idx = hstate_index(h);
/*
* Processes that did not create the mapping will have no
* reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(-VM_FAULT_OOM);
+ return ERR_PTR(-ENOMEM);
if (chg)
if (hugepage_subpool_get_pages(spool, chg))
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- spin_unlock(&hugetlb_lock);
-
- if (!page) {
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
+ spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
hugepage_subpool_put_pages(spool, chg);
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
}
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ list_move(&page->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
}
set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
-
return page;
}
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct attribute_group *hstate_attr_group)
{
int retval;
- int hi = h - hstates;
+ int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
- for_each_hstate(h)
- if (nhs->hstate_kobjs[h - hstates]) {
- kobject_put(nhs->hstate_kobjs[h - hstates]);
- nhs->hstate_kobjs[h - hstates] = NULL;
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
}
+ }
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
hugetlb_unregister_all_nodes();
for_each_hstate(h) {
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hstate_index(h)]);
}
kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
- default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
if (default_hstate_max_huge_pages)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
return;
}
- BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
- h = &hstates[max_hstate++];
+ h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgoup details.
+ */
+ if (order >= HUGETLB_CGROUP_MIN_ORDER)
+ hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
static unsigned long *last_mhp;
/*
- * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!max_hstate)
+ if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
- if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
return 0;
}
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
- struct page *tmp;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- /*
- * A page gathering list, protected by per file i_mmap_mutex. The
- * lock is used to avoid list corruption from multiple unmapping
- * of the same page since we are using page->lru.
- */
- LIST_HEAD(page_list);
-
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, start, end);
+again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
+ tlb_remove_tlb_entry(tlb, ptep, address);
if (pte_dirty(pte))
set_page_dirty(page);
- list_add(&page->lru, &page_list);
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
/* Bail out after unmapping reference page if supplied */
if (ref_page)
break;
}
- flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
- mmu_notifier_invalidate_range_end(mm, start, end);
- list_for_each_entry_safe(page, tmp, &page_list, lru) {
- page_remove_rmap(page);
- list_del(&page->lru);
- put_page(page);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
}
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
- __unmap_hugepage_range(vma, start, end, ref_page);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, 0);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- __unmap_hugepage_range(iter_vma,
- address, address + huge_page_size(h),
- page);
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
mutex_unlock(&mapping->i_mmap_mutex);
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
- return -PTR_ERR(new_page);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
}
/*
@@ -2642,7 +2710,11 @@ retry:
goto out;
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
- ret = -PTR_ERR(page);
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+struct hugetlb_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for hugepages from hugetlb.
+ */
+ struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+struct cgroup_subsys hugetlb_subsys __read_mostly;
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+ return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+ hugetlb_subsys_id));
+}