From 3159f943aafdbacb2f94c38fdaadabf2bbde2a14 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Nov 2017 13:30:42 -0400 Subject: xarray: Replace exceptional entries Introduce xarray value entries and tagged pointers to replace radix tree exceptional entries. This is a slight change in encoding to allow the use of an extra bit (we can now store BITS_PER_LONG - 1 bits in a value entry). It is also a change in emphasis; exceptional entries are intimidating and different. As the comment explains, you can choose to store values or pointers in the xarray and they are both first-class citizens. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- mm/filemap.c | 10 +++++----- mm/khugepaged.c | 2 +- mm/madvise.c | 2 +- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/readahead.c | 2 +- mm/shmem.c | 10 +++++----- mm/swap.c | 2 +- mm/truncate.c | 12 ++++++------ mm/workingset.c | 13 ++++++------- 10 files changed, 28 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..4de14e75c4ec 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -127,7 +127,7 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) + if (!xa_is_value(p)) return -EEXIST; mapping->nrexceptional--; @@ -336,7 +336,7 @@ page_cache_tree_delete_batch(struct address_space *mapping, break; page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@ -1355,7 +1355,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, struct page *page; page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + if (!page || xa_is_value(page)) break; index++; if (index == 0) @@ -1396,7 +1396,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, struct page *page; page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + if (!page || xa_is_value(page)) break; index--; if (index == ULONG_MAX) @@ -1539,7 +1539,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a31d740e6cd1..4820e4adf853 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1369,7 +1369,7 @@ static void collapse_shmem(struct mm_struct *mm, page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { + if (xa_is_value(page) || !PageUptodate(page)) { xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, diff --git a/mm/madvise.c b/mm/madvise.c index 972a9eaa898b..9d802566c494 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..29d9d1a69b36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4750,7 +4750,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. */ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..4985965aa20a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * shmem/tmpfs may return swap: account for swapcache * page too. */ - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); diff --git a/mm/readahead.c b/mm/readahead.c index 4e630143a0ba..de657077d41d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -179,7 +179,7 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, rcu_read_lock(); page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of * contiguous pages before continuing with the next diff --git a/mm/shmem.c b/mm/shmem.c index 446942677cd4..c1062760fe41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -709,7 +709,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, continue; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { @@ -824,7 +824,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -921,7 +921,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1643,7 +1643,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, repeat: swap.val = 0; page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swap = radix_to_swp_entry(page); page = NULL; } @@ -2578,7 +2578,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } diff --git a/mm/swap.c b/mm/swap.c index 26fc9b5f1b6c..4c5c7fcc6e46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -965,7 +965,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; diff --git a/mm/truncate.c b/mm/truncate.c index 1d2fb2dca96f..ed778555c9f3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -70,7 +70,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; for (j = 0; j < pagevec_count(pvec); j++) - if (radix_tree_exceptional_entry(pvec->pages[j])) + if (xa_is_value(pvec->pages[j])) break; if (j == pagevec_count(pvec)) @@ -85,7 +85,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, struct page *page = pvec->pages[i]; pgoff_t index = indices[i]; - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { pvec->pages[j++] = page; continue; } @@ -347,7 +347,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!trylock_page(page)) @@ -442,7 +442,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; lock_page(page); @@ -561,7 +561,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -692,7 +692,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..bb109b3afac2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -155,8 +155,8 @@ * refault distance will immediately activate the refaulting page. */ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) @@ -173,20 +173,19 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -453,7 +452,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) goto out_invalid; if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; -- cgit v1.2.3 From 01959dfe771c6893365482ec78dc1d9cbbbe6de8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 9 Nov 2017 09:23:56 -0500 Subject: xarray: Define struct xa_node This is a direct replacement for struct radix_tree_node. A couple of struct members have changed name, so convert those. Use a #define so that radix tree users continue to work without change. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- mm/workingset.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index bb109b3afac2..c201cbb8c00f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,7 +349,7 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - if (node->count && node->count == node->exceptional) { + if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) list_lru_add(&shadow_nodes, &node->private_list); } else { @@ -428,8 +428,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + node = container_of(item, struct xa_node, private_list); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -446,25 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - node->exceptional--; + node->nr_values--; node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->nr_values)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->i_pages, node, -- cgit v1.2.3 From 0d3f92966629e536b0c5c2355c1ada8e21c245f6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 21 Nov 2017 14:07:06 -0500 Subject: page cache: Convert hole search to XArray The page cache offers the ability to search for a miss in the previous or next N locations. Rather than teach the XArray about the page cache's definition of a miss, use xas_prev() and xas_next() to search the page array. This should be more efficient as it does not have to start the lookup from the top for each index. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 110 ++++++++++++++++++++++++++------------------------------- mm/readahead.c | 4 +-- 2 files changed, 52 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4de14e75c4ec..714d3d0f60f5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1326,86 +1326,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. */ -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || xa_is_value(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_next_hole); +EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. */ -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - struct page *page; + XA_STATE(xas, &mapping->i_pages, index); - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || xa_is_value(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_prev_hole); +EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry diff --git a/mm/readahead.c b/mm/readahead.c index de657077d41d..fc4dd364b37a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -336,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = page_cache_prev_hole(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, offset + 1, max_pages); rcu_read_unlock(); if (!start || start - offset > max_pages) -- cgit v1.2.3 From 74d609585d8bd6083bd9d75bc1fd2c0d3851bcc5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 17 Nov 2017 10:01:45 -0500 Subject: page cache: Add and replace pages using the XArray Use the XArray APIs to add and replace pages in the page cache. This removes two uses of the radix tree preload API and is significantly shorter code. It also removes the last user of __radix_tree_create() outside radix-tree.c itself, so make it static. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 139 ++++++++++++++++++++++++----------------------------------- 1 file changed, 57 insertions(+), 82 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 714d3d0f60f5..b63caebd1367 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -111,35 +111,6 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!xa_is_value(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; -} - static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { @@ -775,51 +746,44 @@ EXPORT_SYMBOL(file_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; + get_page(new); + new->mapping = mapping; + new->index = offset; - get_page(new); - new->mapping = mapping; - new->index = offset; + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); - - /* - * hugetlb pages do not participate in page cache accounting. - */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -828,12 +792,15 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@ -842,39 +809,47 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: +error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** -- cgit v1.2.3 From 5c024e6a4ebc1740db9f0f075aaa476210108a97 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 21 Nov 2017 09:17:59 -0500 Subject: page cache: Convert page deletion to XArray The code is slightly shorter and simpler. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b63caebd1367..414efbdc95df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -111,31 +111,26 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + mapping_set_update(&xas, mapping); + + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -234,7 +229,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, -- cgit v1.2.3 From 4c7472c0df2f889df417a37571e622e02b5058fe Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 16:12:50 -0400 Subject: page cache: Convert find_get_entry to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 63 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 414efbdc95df..2bf9f0742082 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1382,47 +1382,40 @@ EXPORT_SYMBOL(page_cache_prev_miss); */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@ -1453,7 +1446,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page_mapping(page) != mapping)) { -- cgit v1.2.3 From f280bf092d48ff2db12b4a01127cd855c9e0ffb0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 17:20:45 -0400 Subject: page cache: Convert find_get_entries to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2bf9f0742082..4707156b9fbd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1578,53 +1578,48 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; -- cgit v1.2.3 From fd1b3cee2a867868d39bb8cbcc4b00c36d07cc01 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 17:38:56 -0400 Subject: page cache: Convert find_get_pages_range to XArray The 'end' parameter of the xas_for_each iterator avoids a useless iteration at the end of the range. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 52 +++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4707156b9fbd..b72c39fe61c2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1650,64 +1650,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. */ if (end == (pgoff_t)-1) -- cgit v1.2.3 From 3ece58a270cd1e5026282abe778bd50db7a11d08 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 18:00:33 -0400 Subject: page cache: Convert find_get_pages_contig to XArray There's no direct replacement for radix_tree_for_each_contig() in the XArray API as it's an unusual thing to do. Instead, open-code a loop using xas_next(). This removes the only user of radix_tree_for_each_contig() so delete the iterator from the API and the test suite code for it. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 53 ++++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b72c39fe61c2..089b67598100 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1721,57 +1721,43 @@ out: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; /* * must check mapping and index after taking the ref. * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { + if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { put_page(page); break; } @@ -1779,6 +1765,11 @@ repeat: pages[ret] = page; if (++ret == nr_pages) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; -- cgit v1.2.3 From a6906972fe67fb6f73ba04088f7897227fd1cd8f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 18:12:54 -0400 Subject: page cache; Convert find_get_pages_range_tag to XArray The 'end' parameter of the xas_for_each iterator avoids a useless iteration at the end of the range. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 68 +++++++++++++++++++++++------------------------------------- 1 file changed, 26 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 089b67598100..0df28aa6411c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1789,74 +1789,58 @@ EXPORT_SYMBOL(find_get_pages_contig); * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; -- cgit v1.2.3 From c1901cd33cf407d77a181f8dd4ffff98041ef480 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 23:56:04 -0400 Subject: page cache: Convert find_get_entries_tag to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 54 ++++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 0df28aa6411c..35ae011971e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1866,57 +1866,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; -- cgit v1.2.3 From 070e807c690bf9a648d4a878f3c68ea9f5f5ce14 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 May 2018 00:08:30 -0400 Subject: page cache: Convert filemap_map_pages to XArray Slight change of strategy here; if we have trouble getting hold of a page for whatever reason (eg a compound page is split underneath us), don't spin to stabilise the page, just continue the iteration, like we would if we failed to trylock the page. Since this is a speculative optimisation, it feels like we should allow the process to take an extra fault if it turns out to need this page instead of spending time to pin down a page it may not need. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 35ae011971e1..ae1fcaa24f97 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2516,45 +2516,31 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@ -2573,10 +2559,10 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2589,8 +2575,6 @@ next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } -- cgit v1.2.3 From ef8e5717db01bfa4f425d2cda551a6fccc85529d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 03:59:45 -0500 Subject: page cache: Convert delete_batch to XArray Rename the function from page_cache_tree_delete_batch to just page_cache_delete_batch. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ae1fcaa24f97..f7f9af1d98b0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -272,7 +272,7 @@ void delete_from_page_cache(struct page *page) EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@ -285,23 +285,18 @@ EXPORT_SYMBOL(delete_from_page_cache); * * The function expects the i_pages lock to be held. */ -static void -page_cache_tree_delete_batch(struct address_space *mapping, +static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); if (xa_is_value(page)) continue; if (!tail_pages) { @@ -310,8 +305,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, * have our pages locked so they are protected from * being removed. */ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@ -322,11 +320,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@ -347,7 +345,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) -- cgit v1.2.3 From 22ecdb4f8b7dbfefa3deaf6b144093b5c4cd2aff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 04:02:00 -0500 Subject: page cache: Remove stray radix comment Signed-off-by: Matthew Wilcox --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f7f9af1d98b0..bedd8680f789 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2682,7 +2682,7 @@ repeat: put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } -- cgit v1.2.3 From 8fa8e538e4be359e9042573737632dda35a8700d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jan 2018 06:26:49 -0500 Subject: page cache: Convert filemap_range_has_page to XArray Instead of calling find_get_pages_range() and putting any reference, use xas_find() to iterate over any entries in the range, skipping the shadow/swap entries. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index bedd8680f789..6b36516bc31d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -455,20 +455,31 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. + */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); -- cgit v1.2.3 From ff9c745b81ff1e482167fd73558450e66ad43a33 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 11:41:23 -0500 Subject: mm: Convert page-writeback to XArray Includes moving mapping_tagged() to fs.h as a static inline, and changing it to return bool. Signed-off-by: Matthew Wilcox --- mm/page-writeback.c | 72 +++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..fc6e5743b0bf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,34 +2097,25 @@ void __init page_writeback_init(void) * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. - */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2164,7 +2155,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -2445,7 +2436,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@ -2471,7 +2462,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@ -2634,13 +2625,13 @@ EXPORT_SYMBOL(__cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. */ int clear_page_dirty_for_io(struct page *page) @@ -2721,7 +2712,7 @@ int test_clear_page_writeback(struct page *page) xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2761,11 +2752,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2773,8 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2787,12 +2779,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@ -2806,16 +2796,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. - */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - return radix_tree_tagged(&mapping->i_pages, tag); -} -EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. -- cgit v1.2.3 From a97e7904c0806309fd77103005bb7820c3f1c5e4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Nov 2017 14:24:59 -0500 Subject: mm: Convert workingset to XArray We construct an XA_STATE and use it to delete the node with xas_store() rather than adding a special function for this unique use case. Includes a test that simulates this usage for the test suite. Signed-off-by: Matthew Wilcox --- mm/workingset.c | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index c201cbb8c00f..5cfb29ec3fd9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -148,7 +148,7 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible @@ -162,7 +162,7 @@ /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -339,7 +339,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -368,7 +368,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -383,11 +383,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ if (sc->memcg) { cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, @@ -396,7 +396,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + max_nodes = cache >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -409,11 +409,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -421,14 +421,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct xa_node, private_list); mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ @@ -450,25 +449,17 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->nr_values)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; -