From e30331ff05f689f8f2faeb51664299c4d7841f15 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:39 -0700 Subject: dax: relocate some dax functions dax_load_hole() will soon need to call dax_insert_mapping_entry(), so it needs to be moved lower in dax.c so the definition exists. dax_wake_mapping_entry_waiter() will soon be removed from dax.h and be made static to dax.c, so we need to move its definition above all its callers. Link: http://lkml.kernel.org/r/20170724170616.25810-3-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 138 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 69 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index ab925dc6647a..b8882b5ce6ed 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -120,6 +120,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo return autoremove_wake_function(wait, mode, sync, NULL); } +/* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, void *entry, bool wake_all) +{ + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); +} + /* * Check whether the given slot is locked. The function must be called with * mapping->tree_lock held @@ -392,31 +417,6 @@ restart: return entry; } -/* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. - */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) -{ - struct exceptional_entry_key key; - wait_queue_head_t *wq; - - wq = dax_entry_waitqueue(mapping, index, entry, &key); - - /* - * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. - * So at this point all tasks that could have seen our entry locked - * must be in the waitqueue and the following check will see them. - */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); -} - static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -468,50 +468,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_mapping_entry(mapping, index, false); } -/* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. - */ -static int dax_load_hole(struct address_space *mapping, void **entry, - struct vm_fault *vmf) -{ - struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... */ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } - - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; - } -out: - trace_dax_load_hole(inode, vmf, ret); - return ret; -} - static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) @@ -941,6 +897,50 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, void **entry, + struct vm_fault *vmf) +{ + struct inode *inode = mapping->host; + struct page *page; + int ret; + + /* Hole page already exists? Return it... */ + if (!radix_tree_exceptional_entry(*entry)) { + page = *entry; + goto finish_fault; + } + + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: + vmf->page = page; + ret = finish_fault(vmf); + vmf->page = NULL; + *entry = page; + if (!ret) { + /* Grab reference for PTE that is now referencing the page */ + get_page(page); + ret = VM_FAULT_NOPAGE; + } +out: + trace_dax_load_hole(inode, vmf, ret); + return ret; +} + static bool dax_range_is_aligned(struct block_device *bdev, unsigned int offset, unsigned int length) { -- cgit v1.2.3 From 91d25ba8a6b0d810dc844cebeedc53029118ce3e Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:43 -0700 Subject: dax: use common 4k zero page for dax mmap reads When servicing mmap() reads from file holes the current DAX code allocates a page cache page of all zeroes and places the struct page pointer in the mapping->page_tree radix tree. This has three major drawbacks: 1) It consumes memory unnecessarily. 
For every 4k page that is read via a DAX mmap() over a hole, we allocate a new page cache page. This means that if you read 1GiB worth of pages, you end up using 1GiB of zeroed memory. This is easily visible by looking at the overall memory consumption of the system or by looking at /proc/[pid]/smaps: 7f62e72b3000-7f63272b3000 rw-s 00000000 103:00 12 /root/dax/data Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 1048576 kB Private_Dirty: 0 kB Referenced: 1048576 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Locked: 0 kB 2) It is slower than using a common zero page because each page fault has more work to do. Instead of just inserting a common zero page we have to allocate a page cache page, zero it, and then insert it. Here are the average latencies of dax_load_hole() as measured by ftrace on a random test box: Old method, using zeroed page cache pages: 3.4 us New method, using the common 4k zero page: 0.8 us This was the average latency over 1 GiB of sequential reads done by this simple fio script: [global] size=1G filename=/root/dax/data fallocate=none [io] rw=read ioengine=mmap 3) The fact that we had to check for both DAX exceptional entries and for page cache pages in the radix tree made the DAX code more complex. Solve these issues by following the lead of the DAX PMD code and using a common 4k zero page instead. As with the PMD code we will now insert a DAX exceptional entry into the radix tree instead of a struct page pointer which allows us to remove all the special casing in the DAX code. Note that we do still pretty aggressively check for regular pages in the DAX radix tree, especially where we take action based on the bits set in the page. If we ever find a regular page in our radix tree now that most likely means that someone besides DAX is inserting pages (which has happened lots of times in the past), and we want to find that out early and fail loudly. This solution also removes the extra memory consumption. Here is that same /proc/[pid]/smaps after 1GiB of reading from a hole with the new code: 7f2054a74000-7f2094a74000 rw-s 00000000 103:00 12 /root/dax/data Size: 1048576 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Locked: 0 kB Overall system memory consumption is similarly improved. Another major change is that we remove dax_pfn_mkwrite() from our fault flow, and instead rely on the page fault itself to make the PTE dirty and writeable. The following description from the patch adding the vm_insert_mixed_mkwrite() call explains this a little more: "To be able to use the common 4k zero page in DAX we need to have our PTE fault path look more like our PMD fault path where a PTE entry can be marked as dirty and writeable as it is first inserted rather than waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call. Right now we can rely on having a dax_pfn_mkwrite() call because we can distinguish between these two cases in do_wp_page(): case 1: 4k zero page => writable DAX storage case 2: read-only DAX storage => writeable DAX storage This distinction is made by via vm_normal_page(). 
vm_normal_page() returns false for the common 4k zero page, though, just as it does for DAX ptes. Instead of special casing the DAX + 4k zero page case we will simplify our DAX PTE page fault sequence so that it matches our DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper. We will instead use dax_iomap_fault() to handle write-protection faults. This means that insert_pfn() needs to follow the lead of insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If 'mkwrite' is set insert_pfn() will do the work that was previously done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path" Link: http://lkml.kernel.org/r/20170724170616.25810-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 243 +++++++++++++++++------------------------------------- fs/ext2/file.c | 25 +----- fs/ext4/file.c | 32 +------ fs/xfs/xfs_file.c | 2 +- 4 files changed, 79 insertions(+), 223 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index b8882b5ce6ed..ab67ae30ccbf 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. 
This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB @@ -301,18 +297,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -346,7 +345,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -396,21 +395,6 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); @@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. 
- */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. - */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. 
We have to @@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); -} - -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault - */ -int dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; - - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; - } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. - */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); /* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -static int dax_load_hole(struct address_space *mapping, void **entry, +static int dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... 
*/ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { ret = VM_FAULT_OOM; goto out; } -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; @@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1321,7 +1231,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() 
will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. */ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* @@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { * will always fail and fail back to regular faults. */ .page_mkwrite = ext2_dax_fault, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_fault, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d7cf0cc9b87..f28ac999dfba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -311,41 +311,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. 
- */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..62db8ffa83b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1130,7 +1130,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vmf); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; -- cgit v1.2.3 From d01ad197ac3b50a99ea668697acefe12e73c5fea Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:47 -0700 Subject: dax: remove DAX code from page_cache_tree_insert() Now that we no longer insert struct page pointers in DAX radix trees we can remove the special casing for DAX in page_cache_tree_insert(). This also allows us to make dax_wake_mapping_entry_waiter() local to fs/dax.c, removing it from dax.h. Link: http://lkml.kernel.org/r/20170724170616.25810-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index ab67ae30ccbf..8f70e3b59b91 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -127,7 +127,7 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * correct waitqueue where tasks might be waiting for that old 'entry' and * wake them. */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) { struct exceptional_entry_key key; -- cgit v1.2.3 From 527b19d0808e75fbba896beb2435c2b4d6bcd32a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:51 -0700 Subject: dax: move all DAX radix tree defs to fs/dax.c Now that we no longer insert struct page pointers in DAX radix trees the page cache code no longer needs to know anything about DAX exceptional entries. Move all the DAX exceptional entry definitions from dax.h to fs/dax.c. Link: http://lkml.kernel.org/r/20170724170616.25810-6-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 8f70e3b59b91..c576f6181dc8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -54,6 +54,40 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} + static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; -- cgit v1.2.3 From a2e050f5a9a9bd2b632d67bd06d87088e6a02dae Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:54 -0700 Subject: dax: explain how read(2)/write(2) addresses are validated Add a comment explaining how the user addresses provided to read(2) and write(2) are validated in the DAX I/O path. We call dax_copy_from_iter() or copy_to_iter() on these without calling access_ok() first in the DAX code, and there was a concern that the user might be able to read/write to arbitrary kernel addresses with this path. Link: http://lkml.kernel.org/r/20170816173615.10098-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Christoph Hellwig Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c576f6181dc8..5b2eac3ef077 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1004,6 +1004,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (map_len > end - pos) map_len = end - pos; + /* + * The userspace address for the memory copy has already been + * validated via access_ok() in either vfs_read() or + * vfs_write(), depending on which operation we are doing. + */ if (iov_iter_rw(iter) == WRITE) map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); -- cgit v1.2.3 From 917f34526c4123ccd93e4373719c645c85020a5a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:58 -0700 Subject: dax: use PG_PMD_COLOUR instead of open coding Use ~PG_PMD_COLOUR in dax_entry_waitqueue() instead of open coding an equivalent page offset mask. 
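As a sanity check (illustration only, not part of the patch), here is a tiny userspace sketch of the equivalence; the PAGE_SHIFT/PMD_SHIFT values are the usual x86-64 assumptions, not taken from this patch:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumed: 4k pages */
#define PMD_SHIFT	21				/* assumed: 2MiB PMDs */
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 511 with these values */

int main(void)
{
	unsigned long index = 12345;	/* arbitrary page offset into a file */

	/* the mask dax_entry_waitqueue() open coded before this patch */
	unsigned long open_coded = index & ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
	/* the named constant it uses afterwards */
	unsigned long named = index & ~PG_PMD_COLOUR;

	/* both print 12288: the index rounded down to its PMD boundary */
	printf("%lu %lu\n", open_coded, named);
	return 0;
}

Either way the low bits of the page offset are stripped, so every page index covered by one 2MiB PMD hashes to the same wait queue, which is what the surrounding comment in dax_entry_waitqueue() requires.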
Link: http://lkml.kernel.org/r/20170822222436.18926-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "Slusarz, Marcin" Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 5b2eac3ef077..23674cda0b70 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -42,6 +42,9 @@ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) +/* The 'colour' (ie low bits) within a PMD of a page offset. */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -132,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) - index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + index &= ~PG_PMD_COLOUR; key->mapping = mapping; key->entry_start = index; @@ -1215,12 +1218,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, loff_t pos, void *entry) { -- cgit v1.2.3 From 2f52074d35135ecf3fb719f3430d72c17ae07287 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Wed, 6 Sep 2017 16:19:01 -0700 Subject: dax: initialize variable pfn before using it dax_pmd_insert_mapping() contains the following code: pfn_t pfn; if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) goto fallback; /* ... */ fallback: trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); When the condition in the if statement fails, the function calls trace_dax_pmd_insert_mapping_fallback() with an uninitialized pfn value. This issue has been found while building the kernel with clang. The compiler reported: fs/dax.c:1280:6: error: variable 'pfn' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/dax.c:1310:60: note: uninitialized use occurs here trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); ^~~ Link: http://lkml.kernel.org/r/20170903083000.587-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Reviewed-by: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 23674cda0b70..6afcacb3a87b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1230,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, void *ret = NULL, *kaddr; long length = 0; pgoff_t pgoff; - pfn_t pfn; + pfn_t pfn = {}; int id; if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) -- cgit v1.2.3 From 01ffb56bc1cb27f0d9fd50be096446a8d1ab6354 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:19:08 -0700 Subject: ocfs2: make ocfs2_set_acl() static The function is never called outside of fs/ocfs2/acl.c. 
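A minimal generic-C illustration (hypothetical file and names, nothing ocfs2 specific) of what the change amounts to: once the prototype leaves the header, 'static' gives the function internal linkage, so the build itself verifies that every caller really lives in the same file:

/* foo.c -- hypothetical translation unit standing in for fs/ocfs2/acl.c */
static int set_thing(int arg)		/* internal linkage: invisible to other files */
{
	return arg * 2;
}

int public_entry(int arg)		/* the only callers are in this file */
{
	return set_thing(arg);
}

Any leftover external caller would now fail to build, which is how dead cross-file wiring like this gets caught early.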
Link: http://lkml.kernel.org/r/20170801141252.19675-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 2 +- fs/ocfs2/acl.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e50a387959bf..40b5cc97f7b0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -221,7 +221,7 @@ out: /* * Set the access or default ACL of an inode. */ -int ocfs2_set_acl(handle_t *handle, +static int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 2783a75b3999..7be0bb756286 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -28,13 +28,6 @@ struct ocfs2_acl_entry { struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); -int ocfs2_set_acl(handle_t *handle, - struct inode *inode, - struct buffer_head *di_bh, - int type, - struct posix_acl *acl, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, -- cgit v1.2.3 From 964f14a0d350486d17cfd24b3b7dc4f7c4bdc4d3 Mon Sep 17 00:00:00 2001 From: Jun Piao Date: Wed, 6 Sep 2017 16:19:11 -0700 Subject: ocfs2: clean up some dead code clean up some unused functions and parameters. Link: http://lkml.kernel.org/r/598A5E21.2080807@huawei.com Signed-off-by: Jun Piao Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 22 ++++++++-------------- fs/ocfs2/alloc.h | 3 +-- fs/ocfs2/cluster/heartbeat.c | 42 ++++-------------------------------------- fs/ocfs2/dir.c | 2 +- fs/ocfs2/file.c | 7 ------- fs/ocfs2/journal.c | 1 - fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/ocfs2.h | 4 +--- fs/ocfs2/refcounttree.c | 2 +- fs/ocfs2/suballoc.c | 2 +- fs/ocfs2/super.c | 1 - fs/ocfs2/xattr.c | 2 +- 12 files changed, 19 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb15a96df0b6..a177eae3aa1a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, /* * How many free extents have we got before we need more meta data? */ -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et) +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) { int retval; struct ocfs2_extent_list *el = NULL; @@ -1933,14 +1932,12 @@ out: * the new changes. * * left_rec: the record on the left. - * left_child_el: is the child list pointed to by left_rec * right_rec: the record to the right of left_rec * right_child_el: is the child list pointed to by right_rec * * By definition, this only works on interior nodes. 
*/ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, - struct ocfs2_extent_list *left_child_el, struct ocfs2_extent_rec *right_rec, struct ocfs2_extent_list *right_child_el) { @@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, */ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); - ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], &root_el->l_recs[i + 1], right_el); } @@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, el = right_path->p_node[i].el; right_rec = &el->l_recs[0]; - ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, - right_el); + ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); ocfs2_journal_dirty(handle, left_path->p_node[i].bh); ocfs2_journal_dirty(handle, right_path->p_node[i].bh); @@ -2509,7 +2505,7 @@ out_ret_path: static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_tree *et, - int subtree_index, struct ocfs2_path *path) + struct ocfs2_path *path) { int i, idx, ret; struct ocfs2_extent_rec *rec; @@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, et); + free_extents = ocfs2_num_free_extents(et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, *ac = NULL; - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4a5152ec88a3..27b75cf32cfa 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, u64 refcount_loc, bool refcount_tree_locked); -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et); +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); /* * how many new metadata chunks would an allocation need at maximum? 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..56ac07cd35f6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); @@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - - o2hb_fill_node_map(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); - int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3ecb9f337b7d..febe6312ceff 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, &et); + num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bfeb647459d9..b4512fad99ed 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -713,13 +713,6 @@ leave: return status; } -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) -{ - return __ocfs2_extend_allocation(inode, logical_start, - clusters_to_add, mark_unwritten); -} - /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d5e5fa7f0743..36304434eacf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; - osb->dirty = 0; /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e52a2852d50d..7eb3b0a6347e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..9a50f222ac97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -320,7 +320,6 @@ struct ocfs2_super u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; - u8 *uuid; char *uuid_str; u32 uuid_hash; u8 *vol_label; @@ -388,9 +387,8 @@ struct ocfs2_super unsigned int osb_resv_level; unsigned int osb_dir_resv_level; - /* Next three fields are for local node slot recovery during + /* Next two fields are for local node slot recovery during * mount. 
*/ - int dirty; struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f8933cb53d68..ab156e35ec00 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, int *credits) { int ret = 0, meta_add = 0; - int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + int num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6ad3533940ba..71f22c8fbffd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..3f936be379a9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) if (dirty) { /* Recovery will be completed after we've mounted the * rest of the volume. */ - osb->dirty = 1; osb->local_alloc_copy = local_alloc; local_alloc = NULL; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f70c3778d600..5fdf269ba82e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( *credits += 1; /* count in the xattr tree change. */ - num_free_extents = ocfs2_num_free_extents(osb, xt_et); + num_free_extents = ocfs2_num_free_extents(xt_et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); -- cgit v1.2.3 From 26b433d0da062d6e19d75350c0171d3cf8ff560d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:15 -0700 Subject: fscache: remove unused ->now_uncached callback Patch series "Ranged pagevec lookup", v2. In this series I make pagevec_lookup() update the index (to be consistent with pagevec_lookup_tag() and also as a preparation for ranged lookups), provide ranged variant of pagevec_lookup() and use it in places where it makes sense. This not only removes some common code but is also a measurable performance win for some use cases (see patch 4/10) where radix tree is sparse and searching & grabing of a page after the end of the range has measurable overhead. This patch (of 10): The callback doesn't ever get called. Remove it. 
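For context, every one of the ->now_uncached() implementations removed below is a copy of the same walk, paraphrased here into a single hypothetical helper ('example_now_uncached' and its 'mapping' parameter are illustrative, not a kernel API):

static void example_now_uncached(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t first = 0;
	int i, nr_pages;

	pagevec_init(&pvec, 0);
	for (;;) {
		nr_pages = pagevec_lookup(&pvec, mapping, first,
					  PAGEVEC_SIZE - pagevec_count(&pvec));
		if (!nr_pages)
			break;
		for (i = 0; i < nr_pages; i++)
			ClearPageFsCache(pvec.pages[i]);
		/* each copy advances the search index by hand */
		first = pvec.pages[nr_pages - 1]->index + 1;
		pvec.nr = nr_pages;
		pagevec_release(&pvec);
		cond_resched();
	}
}

Since nothing ever invokes the callback, all of these copies are dead code, and the manual 'first' bookkeeping they share is exactly the kind of common code the ranged pagevec_lookup() variant introduced later in this series is meant to absorb.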
Link: http://lkml.kernel.org/r/20170726114704.7626-2-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/cache.c | 29 ----------------------------- fs/afs/cache.c | 43 ------------------------------------------- fs/ceph/cache.c | 31 ------------------------------- fs/cifs/cache.c | 31 ------------------------------- fs/nfs/fscache-index.c | 40 ---------------------------------------- 5 files changed, 174 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) -{ - struct v9fs_inode *v9inode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { .get_attr = v9fs_cache_inode_get_attr, .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, - .now_uncached = v9fs_cache_inode_now_uncached, }; void v9fs_cache_inode_get_cookie(struct inode *inode) diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); struct fscache_netfs afs_cache_netfs = { .name = "afs", @@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { .get_attr = afs_vnode_cache_get_attr, .get_aux = afs_vnode_cache_get_aux, .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, }; /* @@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, _leave(" = SUCCESS"); return FSCACHE_CHECKAUX_OKAY; } - -/* - * indication the cookie is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = 
nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 337f88673ed9..174d6e6569a8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OKAY; } -static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) -{ - struct ceph_inode_info* ci = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dout("ceph inode 0x%p now uncached", ci); - - while (1) { - nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .get_attr = ceph_fscache_inode_get_attr, .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, - .now_uncached = ceph_fscache_inode_now_uncached, }; void ceph_fscache_register_inode_cookie(struct inode *inode) diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct cifsInodeInfo *cifsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); - - for (;;) { - nr_pages = pagevec_lookup(&pvec, - cifsi->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { .get_attr = cifs_fscache_inode_get_attr, .get_aux = cifs_fscac