summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt3
-rw-r--r--arch/sh/mm/Kconfig1
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/include/asm/pgtable.h17
-rw-r--r--arch/x86/include/asm/pgtable_types.h20
-rw-r--r--arch/x86/mm/pgtable.c8
-rw-r--r--include/asm-generic/pgtable.h110
-rw-r--r--include/linux/huge_mm.h16
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/mempolicy.h8
-rw-r--r--include/linux/migrate.h46
-rw-r--r--include/linux/mm.h39
-rw-r--r--include/linux/mm_types.h31
-rw-r--r--include/linux/mmzone.h13
-rw-r--r--include/linux/rmap.h33
-rw-r--r--include/linux/sched.h27
-rw-r--r--include/linux/vm_event_item.h12
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/trace/events/migrate.h51
-rw-r--r--include/uapi/linux/mempolicy.h15
-rw-r--r--init/Kconfig44
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched/core.c71
-rw-r--r--kernel/sched/fair.c227
-rw-r--r--kernel/sched/features.h11
-rw-r--r--kernel/sched/sched.h12
-rw-r--r--kernel/sysctl.c45
-rw-r--r--mm/compaction.c15
-rw-r--r--mm/huge_memory.c108
-rw-r--r--mm/hugetlb.c10
-rw-r--r--mm/internal.h7
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/memcontrol.c7
-rw-r--r--mm/memory-failure.c7
-rw-r--r--mm/memory.c198
-rw-r--r--mm/memory_hotplug.c3
-rw-r--r--mm/mempolicy.c283
-rw-r--r--mm/migrate.c337
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mprotect.c135
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c10
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c66
-rw-r--r--mm/vmstat.c16
45 files changed, 1938 insertions, 172 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 20e248cc03a9..ea8e5b485576 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nr_uarts= [SERIAL] maximum number of UARTs to be registered.
+ numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing.
+ Allowed values are enable and disable
+
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
one of ['zone', 'node', 'default'] can be specified
This can be set from sysctl after boot.
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
config NUMA
bool "Non Uniform Memory Access (NUMA) Support"
depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+ select ARCH_WANT_NUMA_VARIABLE_LOCALITY
default n
help
Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65a872bf72f9..97f8c5ad8c2d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
def_bool y
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
+ select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a1f780d45f76..5199db2923d3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b)
static inline int pte_present(pte_t a)
{
- return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+ _PAGE_NUMA);
+}
+
+#define pte_accessible pte_accessible
+static inline int pte_accessible(pte_t a)
+{
+ return pte_flags(a) & _PAGE_PRESENT;
}
static inline int pte_hidden(pte_t pte)
@@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd)
* the _PAGE_PSE flag will remain set at all times while the
* _PAGE_PRESENT bit is clear).
*/
- return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
+ _PAGE_NUMA);
}
static inline int pmd_none(pmd_t pmd)
@@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
static inline int pmd_bad(pmd_t pmd)
{
+#ifdef CONFIG_NUMA_BALANCING
+ /* pmd_numa check */
+ if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+ return 0;
+#endif
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc9505d..3c32db8c539d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -64,6 +64,26 @@
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+/*
+ * _PAGE_NUMA indicates that this page will trigger a numa hinting
+ * minor page fault to gather numa placement statistics (see
+ * pte_numa()). The bit picked (8) is within the range between
+ * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
+ * require changes to the swp entry format because that bit is always
+ * zero when the pte is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ *
+ * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
+ * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
+ * couldn't reach, like handle_mm_fault() (see access_error in
+ * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
+ * handle_mm_fault() to be invoked).
+ */
+#define _PAGE_NUMA _PAGE_PROTNONE
+
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 217eb705fac0..e27fbf887f3b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
free_page((unsigned long)pgd);
}
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
@@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
}
return changed;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 284e80831d2c..701beab27aab 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
#define move_pte(pte, prot, old_addr, new_addr) (pte)
#endif
+#ifndef pte_accessible
+# define pte_accessible(pte) ((void)(pte),1)
+#endif
+
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
@@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#endif
}
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
+ * same bit too). It's set only when _PAGE_PRESET is not set and it's
+ * never set if _PAGE_PRESENT is set.
+ *
+ * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
+ * fault triggers on those regions if pte/pmd_numa returns true
+ * (because _PAGE_PRESENT is not set).
+ */
+#ifndef pte_numa
+static inline int pte_numa(pte_t pte)
+{
+ return (pte_flags(pte) &
+ (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+#ifndef pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+ return (pmd_flags(pmd) &
+ (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+/*
+ * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
+ * because they're called by the NUMA hinting minor page fault. If we
+ * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
+ * would be forced to set it later while filling the TLB after we
+ * return to userland. That would trigger a second write to memory
+ * that we optimize away by setting _PAGE_ACCESSED here.
+ */
+#ifndef pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_NUMA);
+ return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_NUMA);
+ return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+ pte = pte_set_flags(pte, _PAGE_NUMA);
+ return pte_clear_flags(pte, _PAGE_PRESENT);
+}
+#endif
+
+#ifndef pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+ pmd = pmd_set_flags(pmd, _PAGE_NUMA);
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+#else
+extern int pte_numa(pte_t pte);
+extern int pmd_numa(pmd_t pmd);
+extern pte_t pte_mknonnuma(pte_t pte);
+extern pmd_t pmd_mknonnuma(pmd_t pmd);
+extern pte_t pte_mknuma(pte_t pte);
+extern pmd_t pmd_mknuma(pmd_t pmd);
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#else
+static inline int pmd_numa(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline int pte_numa(pte_t pte)
+{
+ return 0;
+}
+
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+ return pmd;
+}
+
+static inline pte_t pte_mknuma(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
#endif /* CONFIG_MMU */
#endif /* !__ASSEMBLY__ */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 092dc5305a32..1d76f8ca90f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot);
+ unsigned long addr, pgprot_t newprot,
+ int prot_numa);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
@@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
- anon_vma_lock(__anon_vma); \
+ anon_vma_lock_write(__anon_vma); \
anon_vma_unlock(__anon_vma); \
BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \
@@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page)
}
return page;
}
+
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
{
return 0;
}
+
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ return 0;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3e7fa1acf09c..0c80d3f57a5b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write);
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pmd);
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
-#define hugetlb_change_protection(vma, address, end, newprot)
+static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ return 0;
+}
static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index dbd212723b74..9adc270de7ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
return 1;
}
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+
#else
struct mempolicy {};
@@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
return 0;
}
+static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return -1; /* no node preference */
+}
+
#endif /* CONFIG_NUMA */
#endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0b5865c61efd..1e9f627967a3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page
* sucessful migration case.
*/
+enum migrate_reason {
+ MR_COMPACTION,
+ MR_MEMORY_FAILURE,
+ MR_MEMORY_HOTPLUG,
+ MR_SYSCALL, /* also applies to cpusets */
+ MR_MEMPOLICY_MBIND,
+ MR_NUMA_MISPLACED,
+ MR_CMA
+};
#ifdef CONFIG_MIGRATION
@@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- enum migrate_mode mode);
+ enum migrate_mode mode, int reason);
extern int migrate_huge_page(struct page *, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode);
@@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- enum migrate_mode mode) { return -ENOSYS; }
+ enum migrate_mode mode, int reason) { return -ENOSYS; }
static inline int migrate_huge_page(struct page *page, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode) { return -ENOSYS; }
@@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#define fail_migrate_page NULL
#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
+#else
+static inline int migrate_misplaced_page(struct page *page, int node)
+{
+ return -EAGAIN; /* can't migrate now */
+}
+static inline bool migrate_ratelimited(int node)
+{
+ return false;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node);
+#else
+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ return -EAGAIN;
+}
+#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4af4f0b1be4c..7f4f906190bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page)
}
#endif
+#ifdef CONFIG_NUMA_BALANCING
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+ return xchg(&page->_last_nid, nid);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+ return page->_last_nid;
+}
+static inline void reset_page_last_nid(struct page *page)
+{
+ page->_last_nid = -1;
+}
+#else
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+ return page_to_nid(page);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+ return page_to_nid(page);
+}
+
+static inline void reset_page_last_nid(struct page *page)
+{
+}
+#endif
+
static inline struct zone *page_zone(const struct page *page)
{
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
@@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
extern unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr);
+extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa);
extern int mprotect_fixup(struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
}
#endif
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+#endif
+
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
@@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_MLOCK 0x40 /* mark page as mlocked */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
+#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7ade2731b5d6..7d9ebb7cc982 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page {
*/
void *shadow;
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ int _last_nid;
+#endif
}
/*
* The struct page can be forced to be double word aligned so that atomic ops
@@ -411,9 +415,36 @@ struct mm_struct {
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * numa_next_scan is the next time when the PTEs will me marked
+ * pte_numa to gather statistics and migrate pages to new nodes
+ * if necessary
+ */
+ unsigned long numa_next_scan;
+
+ /* numa_next_reset is when the PTE scanner period will be reset */
+ unsigned long numa_next_reset;
+
+ /* Restart point for scanning and setting pte_numa */
+ unsigned long numa_scan_offset;
+
+ /* numa_scan_seq prevents two threads setting pte_numa */
+ int numa_scan_seq;
+
+ /*
+ * The first node a task was scheduled on. If a task runs on
+ * a different node than Make PTE Scan Go Now.
+ */
+ int first_nid;
+#endif
struct uprobes_state uprobes_state;
};
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT -1
+#define NUMA_PTE_SCAN_ACTIVE -2
+
static inline void mm_init_cpumask(struct mm_struct *mm)
{
#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd55dad56aac..4bec5be82cab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,19 @@ typedef struct pglist_data {
struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
int kswapd_max_order;
enum zone_type classzone_idx;
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Lock serializing the per destination node AutoNUMA memory
+ * migration rate limiting data.
+ */
+ spinlock_t numabalancing_migrate_lock;
+
+ /* Rate limiting time interval */
+ unsigned long numabalancing_migrate_next_window;
+
+ /* Number of pages migrated during the rate limiting time interval */
+ unsigned long numabalancing_migrate_nr_pages;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f4780644..c20635c527a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/memcontrol.h>
/*
@@ -25,8 +25,8 @@
* pointing to this anon_vma once its vma list is empty.
*/
struct anon_vma {
- struct anon_vma *root; /* Root of this anon_vma tree */
- struct mutex mutex; /* Serialize access to vma list */
+ struct anon_vma *root; /* Root of this anon_vma tree */
+ struct rw_semaphore rwsem; /* W: modification, R: walking the list */
/*
* The refcount is taken on an anon_vma when there is no
* guarantee that the vma of page tables will exist for
@@ -64,7 +64,7 @@ struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
- struct rb_node rb; /* locked by anon_vma->mutex */
+ struct rb_node rb; /* locked by anon_vma->rwsem */
unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
unsigned long cached_vma_start, cached_vma_last;
@@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- mutex_lock(&anon_vma->root->mutex);
+ down_write(&anon_vma->root->rwsem);
}
static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- mutex_unlock(&anon_vma->root->mutex);
+ up_write(&anon_vma->root->rwsem);
}
-static inline void anon_vma_lock(struct anon_vma *anon_vma)
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
- mutex_lock(&anon_vma->root->mutex);
+ down_write(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
- mutex_unlock(&anon_vma->root->mutex);
+ up_write(&anon_vma->root->rwsem);
}
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+ down_read(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+ up_read(&anon_vma->root->rwsem);
+}
+
+
/*
* anon_vma helper functions.
*/
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
/*
* Called by memory-failure.c to kill processes.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+struct anon_vma *page_lock_anon_vma_read(struct page *page);
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2f3072beef..b089c92c609b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,14 @@ struct task_struct {
short il_next;
short pref_node_fork;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ int numa_scan_seq;
+ int numa_migrate_seq;
+ unsigned int numa_scan_period;
+ u64 node_stamp; /* migration stamp */
+ struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
struct rcu_head rcu;
/*
@@ -1601,6 +1609,18 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages, bool migrated);
+extern void set_numabalancing_state(bool enabled);
+#else
+static inline void task_numa_fault(int node, int pages, bool migrated)
+{
+}
+static inline void set_numabalancing_state(bool enabled)
+{
+}
+#endif
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -2030,6 +2050,13 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+exter