From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:17 +0200 Subject: dax: Allow DAX code to replace exceptional entries Currently we forbid page_cache_tree_insert() to replace exceptional radix tree entries for DAX inodes. However to make DAX faults race free we will lock radix tree entries and when hole is created, we need to replace such locked radix tree entry with a hole page. So modify page_cache_tree_insert() to allow that. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- mm/filemap.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f2479af09da9..dfe55c2cfb34 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -597,14 +597,21 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!radix_tree_exceptional_entry(p)) return -EEXIST; - if (WARN_ON(dax_mapping(mapping))) - return -EINVAL; - - if (shadowp) - *shadowp = p; mapping->nrexceptional--; - if (node) - workingset_node_shadows_dec(node); + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + } } radix_tree_replace_slot(slot, page); mapping->nrpages++; -- cgit v1.2.3 From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:18 +0200 Subject: dax: New fault locking Currently DAX page fault locking is racy. CPU0 (write fault) CPU1 (read fault) __dax_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) get_block(inode, block, &bh, 1) -> allocates blocks if (page) -> no if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) { } else { dax_load_hole(); } dax_insert_mapping() And we are in a situation where we fail in dax_radix_entry() with -EIO. Another problem with the current DAX page fault locking is that there is no race-free way to clear dirty tag in the radix tree. We can always end up with clean radix tree and dirty data in CPU cache. We fix the first problem by introducing locking of exceptional radix tree entries in DAX mappings acting very similarly to page lock and thus synchronizing properly faults against the same mapping index. The same lock can later be used to avoid races when clearing radix tree dirty tag. Reviewed-by: NeilBrown Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- mm/filemap.c | 9 +++++++-- mm/truncate.c | 62 +++++++++++++++++++++++++++++------------------------------ 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index dfe55c2cfb34..7b9a4b180cae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping, return; /* - * Track node that only contains shadow entries. + * Track node that only contains shadow entries. DAX mappings contain + * no shadow entries and may contain other exceptional entries so skip + * those. * * Avoid acquiring the list_lru lock if already tracked. The * list_empty() test is safe as node->private_list is * protected by mapping->tree_lock. */ - if (!workingset_node_pages(node) && + if (!dax_mapping(mapping) && !workingset_node_pages(node) && list_empty(&node->private_list)) { node->private_data = mapping; list_lru_add(&workingset_shadow_nodes, &node->private_list); @@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping, /* DAX accounts exceptional entries as normal pages */ if (node) workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); } } radix_tree_replace_slot(slot, page); diff --git a/mm/truncate.c b/mm/truncate.c index b00272810871..4064f8f53daa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping, if (shmem_mapping(mapping)) return; - spin_lock_irq(&mapping->tree_lock); - if (dax_mapping(mapping)) { - if (radix_tree_delete_item(&mapping->page_tree, index, entry)) - mapping->nrexceptional--; - } else { - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + dax_delete_mapping_entry(mapping, index); + return; } + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } -- cgit v1.2.3 From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:19 +0200 Subject: dax: Use radix tree entry lock to protect cow faults When doing cow faults, we cannot directly fill in PTE as we do for other faults as we rely on generic code to do proper accounting of the cowed page. We also have no page to lock to protect against races with truncate as other faults have and we need the protection to extend until the moment generic code inserts cowed page into PTE thus at that point we have no protection of fs-specific i_mmap_sem. So far we relied on using i_mmap_lock for the protection however that is completely special to cow faults. To make fault locking more uniform use DAX entry lock instead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- mm/memory.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 93897f23cc11..f09cdb8d48fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -2785,7 +2786,8 @@ oom: */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page) + struct page *cow_page, struct page **page, + void **entry) { struct vm_fault vmf; int ret; @@ -2800,8 +2802,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - if (!vmf.page) - goto out; + if (ret & VM_FAULT_DAX_LOCKED) { + *entry = vmf.entry; + return ret; + } if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2815,7 +2819,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - out: *page = vmf.page; return ret; } @@ -2987,7 +2990,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3010,6 +3013,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + void *fault_entry; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; @@ -3027,26 +3031,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, + &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - if (fault_page) + if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, + pgoff); } goto uncharge_out; } @@ -3054,15 +3056,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } return ret; uncharge_out: @@ -3082,7 +3080,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; -- cgit v1.2.3 From 4d9a2c8746671efbb0c27d3ae28c7474597a7aad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:20 +0200 Subject: dax: Remove i_mmap_lock protection Currently faults are protected against truncate by filesystem specific i_mmap_sem and page lock in case of hole page. Cow faults are protected DAX radix tree entry locking. So there's no need for i_mmap_lock in DAX code. Remove it. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f09cdb8d48fa..06f552504e79 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2453,8 +2453,6 @@ void unmap_mapping_range(struct address_space *mapping, if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - - /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); -- cgit v1.2.3