From 844f35db1088dd1a9de37b53d4d823626232bd19 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:57 -0700 Subject: dax: add huge page fault support This is the support code for DAX-enabled filesystems to allow them to provide huge pages in response to faults. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index a7f77e1fa18c..15f8ffc13fa6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -494,6 +494,158 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below function. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + long length; + void *kaddr; + pgoff_t size, pgoff; + sector_t block, sector; + unsigned long pfn; + int result = 0; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + return VM_FAULT_FALLBACK; + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + return VM_FAULT_FALLBACK; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + return VM_FAULT_FALLBACK; + + pgoff = ((pmd_addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) + return VM_FAULT_SIGBUS; + /* If the PMD would cover blocks out of the file */ + if ((pgoff | PG_PMD_COLOUR) >= size) + return VM_FAULT_FALLBACK; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); + + bh.b_size = PMD_SIZE; + length = get_block(inode, block, &bh, write); + if (length) + return VM_FAULT_SIGBUS; + i_mmap_lock_read(mapping); + + /* + * If the filesystem isn't willing to tell us the length of a hole, + * just fall back to PTEs. Calling get_block 512 times in a loop + * would be silly. 
+ */ + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + goto fallback; + + /* Guard against a race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((pgoff | PG_PMD_COLOUR) >= size) + goto fallback; + + if (is_huge_zero_pmd(*pmd)) + unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); + + if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + bool set; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + struct page *zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) + goto fallback; + + ptl = pmd_lock(mm, pmd); + set = set_huge_zero_page(NULL, mm, vma, pmd_addr, pmd, + zero_page); + spin_unlock(ptl); + result = VM_FAULT_NOPAGE; + } else { + sector = bh.b_blocknr << (blkbits - 9); + length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, + bh.b_size); + if (length < 0) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + goto fallback; + + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_page(kaddr + i * PAGE_SIZE); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + + result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + } + + out: + i_mmap_unlock_read(mapping); + + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); + + return result; + + fallback: + count_vm_event(THP_FAULT_FALLBACK); + result = VM_FAULT_FALLBACK; + goto out; +} +EXPORT_SYMBOL_GPL(__dax_pmd_fault); + +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. + */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = __dax_pmd_fault(vma, address, pmd, flags, get_block, + complete_unwritten); + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_pmd_fault); +#endif /* CONFIG_TRANSPARENT_HUGEPAGES */ + /** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred -- cgit v1.2.3 From dd8a2b6c29a3221c19ab475c8408fc2b914ccfab Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 8 Sep 2015 14:59:09 -0700 Subject: fs/dax.c: fix typo in #endif comment Fix typo s/CONFIG_TRANSPARENT_HUGEPAGES/CONFIG_TRANSPARENT_HUGEPAGE/ in #endif comment introduced by commit 2b26a9206d6a ("dax: add huge page fault support"). 
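For context, the __dax_pmd_fault() path added by that commit uses the PG_PMD_COLOUR test to decide when a huge mapping would reach past end-of-file and must fall back to PTEs. A small standalone sketch of that arithmetic, assuming 4 KiB pages and 2 MiB PMDs (both are architecture-dependent) and made-up offsets, not values taken from the patch:

    #include <stdio.h>

    /* Assumed configuration: 4 KiB pages, 2 MiB PMDs (typical x86-64) */
    #define PAGE_SHIFT      12
    #define PMD_SIZE        (2UL << 20)
    #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)    /* == 511 */

    int main(void)
    {
            unsigned long pgoff = 1024;     /* hypothetical faulting page offset */
            unsigned long size  = 1100;     /* hypothetical file size, in pages */

            /*
             * pgoff | PG_PMD_COLOUR is the last page index inside pgoff's
             * 512-page block, i.e. the last page a PMD-sized mapping there
             * would cover.  If that lies at or beyond EOF, __dax_pmd_fault()
             * falls back to PTEs rather than map blocks outside the file.
             */
            printf("last page covered: %lu\n", pgoff | PG_PMD_COLOUR);
            if ((pgoff | PG_PMD_COLOUR) >= size)
                    printf("would fall back to PTEs\n");
            return 0;
    }
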
Signed-off-by: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index 15f8ffc13fa6..2deed64b7eea 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -644,7 +644,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, return result; } EXPORT_SYMBOL_GPL(dax_pmd_fault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGES */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * dax_pfn_mkwrite - handle first write to DAX page -- cgit v1.2.3 From 84c4e5e675408b6fb7d74eec7da9a4a5698b50af Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:59:17 -0700 Subject: dax: improve comment about truncate race Jan Kara pointed out I should be more explicit here about the perils of racing against truncate. The comment is mostly the same as for the PTE case. Signed-off-by: Matthew Wilcox Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index 2deed64b7eea..c694117a7062 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -553,7 +553,12 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) goto fallback; - /* Guard against a race with truncate */ + /* + * If a truncate happened while we were allocating blocks, we may + * leave blocks allocated to the file that are beyond EOF. We can't + * take i_mutex here, so just leave them hanging; they'll be freed + * when the file is deleted. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) { result = VM_FAULT_SIGBUS; -- cgit v1.2.3 From 843172978bb92997310d2f7fbc172ece423cfc02 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:59:25 -0700 Subject: dax: fix race between simultaneous faults If two threads write-fault on the same hole at the same time, the winner of the race will return to userspace and complete their store, only to have the loser overwrite their store with zeroes. Fix this for now by taking the i_mmap_sem for write instead of read, and do so outside the call to get_block(). Now the loser of the race will see the block has already been zeroed, and will not zero it again. This severely limits our scalability. I have ideas for improving it, but those can wait for a later patch. Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index c694117a7062..9593f4bee327 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -272,7 +272,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; void *addr; @@ -280,8 +279,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, pgoff_t size; int error; - i_mmap_lock_read(mapping); - /* * Check truncate didn't happen while we were allocating a block. 
* If it did, this block may or may not be still allocated to the @@ -309,8 +306,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, error = vm_insert_mixed(vma, vaddr, pfn); out: - i_mmap_unlock_read(mapping); - return error; } @@ -372,15 +367,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * from a read fault and we've raced with a truncate */ error = -EIO; - goto unlock_page; + goto unlock; } + } else { + i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; + goto unlock; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { @@ -391,8 +388,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) - goto unlock_page; + goto unlock; } else { + i_mmap_unlock_write(mapping); return dax_load_hole(mapping, page, vmf); } } @@ -404,17 +402,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; + goto unlock; vmf->page = page; if (!page) { - i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); error = -EIO; - goto out; + goto unlock; } } return VM_FAULT_LOCKED; @@ -450,6 +446,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } + if (!page) + i_mmap_unlock_write(mapping); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -458,11 +456,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - unlock_page: + unlock: if (page) { unlock_page(page); page_cache_release(page); + } else { + i_mmap_unlock_write(mapping); } + goto out; } EXPORT_SYMBOL(__dax_fault); @@ -540,10 +541,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; + i_mmap_lock_write(mapping); length = get_block(inode, block, &bh, write); if (length) return VM_FAULT_SIGBUS; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -607,11 +608,11 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); + i_mmap_unlock_write(mapping); + return result; fallback: -- cgit v1.2.3 From d295e3415a88ae63a37a22652808b20c7fcb970e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 14:59:34 -0700 Subject: dax: don't use set_huge_zero_page() This is another place where DAX assumed that pgtable_t was a pointer. Open code the important parts of set_huge_zero_page() in DAX and make set_huge_zero_page() static again. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index 9593f4bee327..d778e5f1a01c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -572,18 +572,24 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { - bool set; spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; + pmd_t entry; struct page *zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) goto fallback; - ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(NULL, mm, vma, pmd_addr, pmd, - zero_page); - spin_unlock(ptl); + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + goto fallback; + } + + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; + spin_unlock(ptl); } else { sector = bh.b_blocknr << (blkbits - 9); length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, -- cgit v1.2.3 From 73a6ec47f68787df1b41869def52915da2f4a6b7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:59:37 -0700 Subject: dax: ensure that zero pages are removed from other processes If the first access to a huge page was a store, there would be no existing zero pmd in this process's page tables. There could be a zero pmd in another process's page tables, if it had done a load. We can detect this case by noticing that the buffer_head returned from the filesystem is New, and ensure that other processes mapping this huge page have their page tables flushed. Signed-off-by: Matthew Wilcox Reported-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index d778e5f1a01c..74838c43be91 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -568,7 +568,11 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pgoff | PG_PMD_COLOUR) >= size) goto fallback; - if (is_huge_zero_pmd(*pmd)) + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (buffer_new(&bh)) unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { -- cgit v1.2.3 From 3fdd1b479dbc03347e98f904f54133a9cef5521f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:59:39 -0700 Subject: dax: use linear_page_index() I was basically open-coding it (thanks to copying code from do_fault() which probably also needs to be fixed). 
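For reference, a simplified sketch of what linear_page_index() (from include/linux/pagemap.h) computes; the real helper also special-cases hugetlb VMAs, which does not apply here, so the substitution in the diff below is behaviour-preserving:

    /* Simplified sketch; the in-tree helper also handles hugetlb VMAs */
    static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                            unsigned long address)
    {
            pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
            return pgoff + vma->vm_pgoff;
    }
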
Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index 74838c43be91..9ef9b80cc132 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -529,7 +529,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) return VM_FAULT_FALLBACK; - pgoff = ((pmd_addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; -- cgit v1.2.3 From 46c043ede4711e8d598b9d63c5616c1fedb0605e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 14:59:42 -0700 Subject: mm: take i_mmap_lock in unmap_mapping_range() for DAX DAX is not so special: we need i_mmap_lock to protect mapping->i_mmap. __dax_pmd_fault() uses unmap_mapping_range() shoot out zero page from all mappings. We need to drop i_mmap_lock there to avoid lock deadlock. Re-aquiring the lock should be fine since we check i_size after the point. Signed-off-by: Kirill A. Shutemov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'fs/dax.c') diff --git a/fs/dax.c b/fs/dax.c index 9ef9b80cc132..ed54efedade6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -554,6 +554,25 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) goto fallback; + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_page(kaddr + i * PAGE_SIZE); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (buffer_new(&bh)) { + i_mmap_unlock_write(mapping); + unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); + i_mmap_lock_write(mapping); + } + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -568,13 +587,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pgoff | PG_PMD_COLOUR) >= size) goto fallback; - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (buffer_new(&bh)) - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; @@ -605,15 +617,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) goto fallback; - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - int i; - for (i = 0; i < PTRS_PER_PMD; i++) - clear_page(kaddr + i * PAGE_SIZE); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } - result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); } -- cgit v1.2.3
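
The kernel-doc above notes that filesystems may call dax_pmd_fault() from their pmd_fault handler. A sketch of how a DAX-capable filesystem of this era might wire that up; every example_* identifier is a placeholder rather than anything from these patches, and a real filesystem would supply its own get_block_t and possibly an I/O-completion callback instead of NULL:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical block-mapping callback; any get_block_t would do */
    static int example_get_block(struct inode *inode, sector_t iblock,
                    struct buffer_head *bh, int create);

    /* Delegate PTE-sized faults to the existing DAX helper */
    static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            return dax_fault(vma, vmf, example_get_block, NULL);
    }

    /* Delegate PMD-sized faults to the helper added by this series */
    static int example_dax_pmd_fault(struct vm_area_struct *vma,
                    unsigned long addr, pmd_t *pmd, unsigned int flags)
    {
            return dax_pmd_fault(vma, addr, pmd, flags, example_get_block, NULL);
    }

    static const struct vm_operations_struct example_dax_vm_ops = {
            .fault          = example_dax_fault,
            .pmd_fault      = example_dax_pmd_fault,  /* new hook in this series */
            .page_mkwrite   = example_dax_fault,
            .pfn_mkwrite    = dax_pfn_mkwrite,
    };

    static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
    {
            file_accessed(file);
            vma->vm_ops = &example_dax_vm_ops;
            /* VM_HUGEPAGE lets the fault path try PMD mappings at all */
            vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
            return 0;
    }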