summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 17:52:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 17:52:23 -0700
commitf6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch)
tree97bec9ddd999040822acf314647eaf4208213589 /fs
parent839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff)
parentdf69f52d990bd85159727bd26e819d3a6e49c666 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: "Almost all of the rest of MM. There was an unusually large amount of MM material this time" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits) zpool: remove no-op module init/exit mm: zbud: constify the zbud_ops mm: zpool: constify the zpool_ops mm: swap: zswap: maybe_preload & refactoring zram: unify error reporting zsmalloc: remove null check from destroy_handle_cache() zsmalloc: do not take class lock in zs_shrinker_count() zsmalloc: use class->pages_per_zspage zsmalloc: consider ZS_ALMOST_FULL as migrate source zsmalloc: partial page ordering within a fullness_list zsmalloc: use shrinker to trigger auto-compaction zsmalloc: account the number of compacted pages zsmalloc/zram: introduce zs_pool_stats api zsmalloc: cosmetic compaction code adjustments zsmalloc: introduce zs_can_compact() function zsmalloc: always keep per-class stats zsmalloc: drop unused variable `nr_to_migrate' mm/memblock.c: fix comment in __next_mem_range() mm/page_alloc.c: fix type information of memoryless node memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node() ...
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/dax.c197
-rw-r--r--fs/ext2/file.c10
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/file.c68
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c12
-rw-r--r--fs/hugetlbfs/inode.c302
-rw-r--r--fs/proc/task_mmu.c285
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_file.c30
-rw-r--r--fs/xfs/xfs_trace.h1
13 files changed, 721 insertions, 190 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f77da0ec0e64..22ea424ee741 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/dax.c b/fs/dax.c
index 57bb70b4af70..e43389c74bbc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -283,7 +283,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
void __pmem *addr;
@@ -291,8 +290,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
pgoff_t size;
int error;
- i_mmap_lock_read(mapping);
-
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -322,8 +319,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
error = vm_insert_mixed(vma, vaddr, pfn);
out:
- i_mmap_unlock_read(mapping);
-
return error;
}
@@ -385,15 +380,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock_page;
+ goto unlock;
}
+ } else {
+ i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
+ goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -404,8 +401,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock_page;
+ goto unlock;
} else {
+ i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -417,17 +415,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
+ goto unlock;
vmf->page = page;
if (!page) {
- i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
error = -EIO;
- goto out;
+ goto unlock;
}
}
return VM_FAULT_LOCKED;
@@ -463,6 +459,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ if (!page)
+ i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -471,11 +469,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock_page:
+ unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
+ } else {
+ i_mmap_unlock_write(mapping);
}
+
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -507,6 +508,176 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ i_mmap_lock_write(mapping);
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_page(kaddr + i * PAGE_SIZE);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_write(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_write(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ i_mmap_unlock_write(mapping);
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..1982c3f11aec 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32071f5c1c26..fd1f28be5296 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..113837e7ba98 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7c2cab..612fbcf76b5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24ce59ad..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+ * pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages that we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check for page->index > end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * free'ing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* mutex taken here, fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3b4d8255e806..41f1a50c10c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -446,6 +446,7 @@ struct mem_size_stats {
unsigned long anonymous_thp;
unsigned long swap;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
@@ -640,6 +652,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -654,6 +667,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -712,23 +726,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the