From 348e967ab07c96a9e7a6a194812254a8df2045c0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:15 +0200 Subject: dax: Make huge page handling depend of CONFIG_BROKEN Currently the handling of huge pages for DAX is racy. For example the following can happen: CPU0 (THP write fault) CPU1 (normal read fault) __dax_pmd_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh) && write) get_block(inode, block, &bh, 1) -> allocates blocks truncate_pagecache_range(inode, lstart, lend); dax_load_hole(); This results in data corruption since process on CPU1 won't see changes into the file done by CPU0. The race can happen even if two normal faults race however with THP the situation is even worse because the two faults don't operate on the same entries in the radix tree and we want to use these entries for serialization. So make THP support in DAX code depend on CONFIG_BROKEN for now. Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 90fbc99e5313..72dc81de3ddb 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -29,7 +29,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev, } #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t); int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, -- cgit v1.2.3 From e804315dd0f574b56155c5a2406ab5e0318104f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:16 +0200 Subject: dax: Define DAX lock bit for radix tree exceptional entry We will use lowest available bit in the radix tree exceptional entry for locking of the entry. Define it. Also clean up definitions of DAX entry type bits in DAX exceptional entries to use defined constants instead of hardcoding numbers and cleanup checking of these bits to not rely on how other bits in the entry are set. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 72dc81de3ddb..70600b63083f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -5,6 +5,9 @@ #include #include +/* We use lowest available exceptional entry bit for locking */ +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) + ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); -- cgit v1.2.3 From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:17 +0200 Subject: dax: Allow DAX code to replace exceptional entries Currently we forbid page_cache_tree_insert() to replace exceptional radix tree entries for DAX inodes. However to make DAX faults race free we will lock radix tree entries and when hole is created, we need to replace such locked radix tree entry with a hole page. So modify page_cache_tree_insert() to allow that. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 70600b63083f..aa148937bb3f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -3,6 +3,7 @@ #include #include +#include #include /* We use lowest available exceptional entry bit for locking */ -- cgit v1.2.3 From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:18 +0200 Subject: dax: New fault locking Currently DAX page fault locking is racy. CPU0 (write fault) CPU1 (read fault) __dax_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) get_block(inode, block, &bh, 1) -> allocates blocks if (page) -> no if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) { } else { dax_load_hole(); } dax_insert_mapping() And we are in a situation where we fail in dax_radix_entry() with -EIO. Another problem with the current DAX page fault locking is that there is no race-free way to clear dirty tag in the radix tree. We can always end up with clean radix tree and dirty data in CPU cache. We fix the first problem by introducing locking of exceptional radix tree entries in DAX mappings acting very similarly to page lock and thus synchronizing properly faults against the same mapping index. The same lock can later be used to avoid races when clearing radix tree dirty tag. Reviewed-by: NeilBrown Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index aa148937bb3f..756625c6d0dd 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); -- cgit v1.2.3 From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:19 +0200 Subject: dax: Use radix tree entry lock to protect cow faults When doing cow faults, we cannot directly fill in PTE as we do for other faults as we rely on generic code to do proper accounting of the cowed page. We also have no page to lock to protect against races with truncate as other faults have and we need the protection to extend until the moment generic code inserts cowed page into PTE thus at that point we have no protection of fs-specific i_mmap_sem. So far we relied on using i_mmap_lock for the protection however that is completely special to cow faults. To make fault locking more uniform use DAX entry lock instead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 7 +++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 756625c6d0dd..7bf12277c006 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -21,6 +21,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -29,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } +/* Shouldn't ever be called when dax is disabled. */ +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + BUG(); +} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/mm.h b/include/linux/mm.h index a55e5be0894f..0ef9dc720ec3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -299,6 +299,12 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). */ + void *entry; /* ->fault handler can alternatively + * return locked DAX entry. In that + * case handler should return + * VM_FAULT_DAX_LOCKED and fill in + * entry here. + */ /* for ->map_pages() only */ pgoff_t max_pgoff; /* map pages for offset from pgoff till * max_pgoff inclusive */ @@ -1084,6 +1090,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ +#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ -- cgit v1.2.3