summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h4
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h4
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c17
-rw-r--r--drivers/staging/erofs/utils.c18
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/dax.c112
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--include/linux/radix-tree.h36
-rw-r--r--include/linux/swapops.h19
-rw-r--r--include/linux/xarray.h102
-rw-r--r--lib/idr.c60
-rw-r--r--lib/radix-tree.c21
-rw-r--r--mm/filemap.c10
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/shmem.c10
-rw-r--r--mm/swap.c2
-rw-r--r--mm/truncate.c12
-rw-r--r--mm/workingset.c13
-rw-r--r--tools/testing/radix-tree/idr-test.c6
-rw-r--r--tools/testing/radix-tree/linux/radix-tree.h1
-rw-r--r--tools/testing/radix-tree/multiorder.c47
-rw-r--r--tools/testing/radix-tree/test.c2
26 files changed, 278 insertions, 232 deletions
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 2fdc865ca374..62039e557ac0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -723,9 +723,7 @@ static inline bool pte_user(pte_t pte)
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 7cd6809f4d33..05765c2d2c1f 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -313,9 +313,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
#define MAX_SWAPFILES_CHECK() do { \
BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fcc73a6ab503..316730b45f84 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5996,7 +5996,8 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
count = __sg_page_count(sg);
while (idx + count <= n) {
- unsigned long exception, i;
+ void *entry;
+ unsigned long i;
int ret;
/* If we cannot allocate and insert this entry, or the
@@ -6011,12 +6012,9 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
if (ret && ret != -EEXIST)
goto scan;
- exception =
- RADIX_TREE_EXCEPTIONAL_ENTRY |
- idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
+ entry = xa_mk_value(idx);
for (i = 1; i < count; i++) {
- ret = radix_tree_insert(&iter->radix, idx + i,
- (void *)exception);
+ ret = radix_tree_insert(&iter->radix, idx + i, entry);
if (ret && ret != -EEXIST)
goto scan;
}
@@ -6054,15 +6052,14 @@ lookup:
GEM_BUG_ON(!sg);
/* If this index is in the middle of multi-page sg entry,
- * the radixtree will contain an exceptional entry that points
+ * the radix tree will contain a value entry that points
* to the start of that range. We will return the pointer to
* the base page and the offset of this page within the
* sg entry's range.
*/
*offset = 0;
- if (unlikely(radix_tree_exception(sg))) {
- unsigned long base =
- (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ if (unlikely(xa_is_value(sg))) {
+ unsigned long base = xa_to_value(sg);
sg = radix_tree_lookup(&iter->radix, base);
GEM_BUG_ON(!sg);
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 595cf90af9bb..bdee9bd09f11 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -35,7 +35,6 @@ static atomic_long_t erofs_global_shrink_cnt;
#ifdef CONFIG_EROFS_FS_ZIP
-/* radix_tree and the future XArray both don't use tagptr_t yet */
struct erofs_workgroup *erofs_find_workgroup(
struct super_block *sb, pgoff_t index, bool *tag)
{
@@ -47,9 +46,8 @@ repeat:
rcu_read_lock();
grp = radix_tree_lookup(&sbi->workstn_tree, index);
if (grp != NULL) {
- *tag = radix_tree_exceptional_entry(grp);
- grp = (void *)((unsigned long)grp &
- ~RADIX_TREE_EXCEPTIONAL_ENTRY);
+ *tag = xa_pointer_tag(grp);
+ grp = xa_untag_pointer(grp);
if (erofs_workgroup_get(grp, &oldcount)) {
/* prefer to relax rcu read side */
@@ -83,9 +81,7 @@ int erofs_register_workgroup(struct super_block *sb,
sbi = EROFS_SB(sb);
erofs_workstn_lock(sbi);
- if (tag)
- grp = (void *)((unsigned long)grp |
- 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+ grp = xa_tag_pointer(grp, tag);
err = radix_tree_insert(&sbi->workstn_tree,
grp->index, grp);
@@ -131,9 +127,7 @@ repeat:
for (i = 0; i < found; ++i) {
int cnt;
- struct erofs_workgroup *grp = (void *)
- ((unsigned long)batch[i] &
- ~RADIX_TREE_EXCEPTIONAL_ENTRY);
+ struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
first_index = grp->index + 1;
@@ -150,8 +144,8 @@ repeat:
#endif
continue;
- if (radix_tree_delete(&sbi->workstn_tree,
- grp->index) != grp) {
+ if (xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+ grp->index)) != grp) {
#ifdef EROFS_FS_HAS_MANAGED_CACHE
skip:
erofs_workgroup_unfreeze(grp, 1);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7..fd25e125303c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -440,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->i_pages, pg_index);
rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
diff --git a/fs/dax.c b/fs/dax.c
index b68ce484e1be..ebcec36335eb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -59,56 +59,57 @@ static int __init init_dax_wait_table(void)
fs_initcall(init_dax_wait_table);
/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking. In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT (4)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_ZERO_PAGE (1UL << 2)
+#define DAX_EMPTY (1UL << 3)
static unsigned long dax_radix_pfn(void *entry)
{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
+ return xa_to_value(entry) >> DAX_SHIFT;
}
static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+ return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
+ DAX_LOCKED);
}
static unsigned int dax_radix_order(void *entry)
{
- if ((unsigned long)entry & RADIX_DAX_PMD)
+ if (xa_to_value(entry) & DAX_PMD)
return PMD_SHIFT - PAGE_SHIFT;
return 0;
}
static int dax_is_pmd_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_PMD;
+ return xa_to_value(entry) & DAX_PMD;
}
static int dax_is_pte_entry(void *entry)
{
- return !((unsigned long)entry & RADIX_DAX_PMD);
+ return !(xa_to_value(entry) & DAX_PMD);
}
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_EMPTY;
+ return xa_to_value(entry) & DAX_EMPTY;
}
/*
@@ -186,9 +187,9 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
*/
static inline int slot_locked(struct address_space *mapping, void **slot)
{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
- return entry & RADIX_DAX_ENTRY_LOCK;
+ unsigned long entry = xa_to_value(
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+ return entry & DAX_LOCKED;
}
/*
@@ -196,12 +197,11 @@ static inline int slot_locked(struct address_space *mapping, void **slot)
*/
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
+ unsigned long v = xa_to_value(
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+ void *entry = xa_mk_value(v | DAX_LOCKED);
+ radix_tree_replace_slot(&mapping->i_pages, slot, entry);
+ return entry;
}
/*
@@ -209,17 +209,16 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
*/
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
+ unsigned long v = xa_to_value(
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+ void *entry = xa_mk_value(v & ~DAX_LOCKED);
+ radix_tree_replace_slot(&mapping->i_pages, slot, entry);
+ return entry;
}
/*
* Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
+ * a DAX entry and return it. The caller must call
* put_unlocked_mapping_entry() when he decided not to lock the entry or
* put_locked_mapping_entry() when he locked the entry and now wants to
* unlock it.
@@ -242,7 +241,7 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
&slot);
if (!entry ||
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
+ WARN_ON_ONCE(!xa_is_value(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
@@ -283,7 +282,7 @@ static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
xa_lock_irq(&mapping->i_pages);
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+ if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
!slot_locked(mapping, slot))) {
xa_unlock_irq(&mapping->i_pages);
return;
@@ -472,12 +471,11 @@ void dax_unlock_mapping_entry(struct page *page)
}
/*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find radix tree entry at given index. If it is a DAX entry, return it
+ * with the radix tree entry locked. If the radix tree doesn't contain the
+ * given index, create an empty entry for the index and return with it locked.
*
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
* happen if there are any 4k entries within the 2MiB range that we are
* requesting.
@@ -507,13 +505,13 @@ restart:
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+ if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
entry = ERR_PTR(-EIO);
goto out_unlock;
}
if (entry) {
- if (size_flag & RADIX_DAX_PMD) {
+ if (size_flag & DAX_PMD) {
if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
@@ -584,7 +582,7 @@ restart:
true);
}
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+ entry = dax_radix_locked_entry(0, size_flag | DAX_EMPTY);
err = __radix_tree_insert(&mapping->i_pages, index,
dax_radix_order(entry), entry);
@@ -673,8 +671,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (index >= end)
break;
- if (WARN_ON_ONCE(
- !radix_tree_exceptional_entry(pvec_ent)))
+ if (WARN_ON_ONCE(!xa_is_value(pvec_ent)))
continue;
xa_lock_irq(&mapping->i_pages);
@@ -713,7 +710,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
xa_lock_irq(pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
@@ -729,8 +726,8 @@ out:
return ret;
}
/*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
@@ -740,7 +737,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
* radix tree (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen exceptional entry for this index, we better find it
+ * caller has seen a DAX entry for this index, we better find it
* at that index as well...
*/
WARN_ON_ONCE(!ret);
@@ -748,7 +745,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
@@ -802,7 +799,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
@@ -940,13 +937,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* A page got tagged dirty in DAX mapping? Something is seriously
* wrong.
*/
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ if (WARN_ON(!xa_is_value(entry)))
return -EIO;
xa_lock_irq(pages);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
+ if (!entry2 || WARN_ON_ONCE(!xa_is_value(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
@@ -1123,8 +1120,9 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
- false);
+ dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+ DAX_ZERO_PAGE, false);
+
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1514,7 +1512,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+ DAX_PMD | DAX_ZERO_PAGE, false);
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1597,7 +1595,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* is already in the tree, for instance), it will return -EEXIST and
* we just fall back to 4k entries.
*/
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ entry = grab_mapping_entry(mapping, pgoff, DAX_PMD);
if (IS_ERR(entry))
goto fallback;
@@ -1635,7 +1633,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto finish_iomap;
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD, write && !sync);
+ DAX_PMD, write && !sync);
/*
* If we are doing synchronous page fault and inode needs fsync,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4..669abb617321 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
mss->swap += PAGE_SIZE;
else
put_page(page);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 34149e8b5f73..e9e76ab4fbbf 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -28,34 +28,26 @@
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/xarray.h>
/*
* The bottom two bits of the slot determine how the remaining bits in the
* slot are interpreted:
*
* 00 - data pointer
- * 01 - internal entry
- * 10 - exceptional entry
- * 11 - this bit combination is currently unused/reserved
+ * 10 - internal entry
+ * x1 - value entry
*
* The internal entry may be a pointer to the next level in the tree, a
* sibling entry, or an indicator that the entry in this slot has been moved
* to another location in the tree and the lookup should be restarted. While
* NULL fits the 'data pointer' pattern, it means that there is no entry in
* the tree for this index (no matter what level of the tree it is found at).
- * This means that you cannot store NULL in the tree as a value for the index.
+ * This means that storing a NULL entry in the tree is the same as deleting
+ * the entry from the tree.
*/
#define RADIX_TREE_ENTRY_MASK 3UL
-#define RADIX_TREE_INTERNAL_NODE 1UL
-
-/*
- * Most users of the radix tree store pointers but shmem/tmpfs stores swap
- * entries in the same tree. They are marked as exceptional entries to
- * distinguish them from pointers to struct page.
- * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
- */
-#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
-#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
+#define RADIX_TREE_INTERNAL_NODE 2UL
static inline bool radix_tree_is_internal_node(void *ptr)
{
@@ -83,11 +75,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
/*
* @count is the count of every non-NULL element in the ->slots array
- * whether that is an exceptional entry, a retry entry, a user pointer,
+ * whether that is a value entry, a retry entry, a user pointer,
* a sibling entry or a pointer to the next level of the tree.
* @exceptional is the count of every element in ->slots which is
- * either radix_tree_exceptional_entry() or is a sibling entry for an
- * exceptional entry.
+ * either a value entry or a sibling of a value entry.
*/
struct radix_tree_node {
unsigned char shift; /* Bits remaining in each slot */
@@ -269,17 +260,6 @@ static inline int radix_tree_deref_retry(void *arg)
}
/**
- * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry?
- * @arg: value returned by radix_tree_deref_slot
- * Returns: 0 if well-aligned pointer, non-0 if exceptional entry.
- */
-static inline int radix_tree_exceptional_entry(void *arg)
-{
- /* Not unlikely because radix_tree_exception often tested first */
- return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
-}
-
-/**
* radix_tree_exception - radix_tree_deref_slot returned either exception?
* @arg: value returned by radix_tree_deref_slot
* Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 22af9d8a84ae..4d961668e5fc 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -18,9 +18,8 @@
*
* swp_entry_t's are *never* stored anywhere in their arch-dependent format.
*/
-#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \
- (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT))
-#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
+#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
/*
* Store a type+offset into a swp_entry_t in an arch-independent format
@@ -29,8 +28,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
swp_entry_t ret;
- ret.val = (type << SWP_TYPE_SHIFT(ret)) |
- (offset & SWP_OFFSET_MASK(ret));
+ ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK);
return ret;
}
@@ -40,7 +38,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
*/
static inline unsigned swp_type(swp_entry_t entry)
{
- return (entry.val >> SWP_TYPE_SHIFT(entry));
+ return (entry.val >> SWP_TYPE_SHIFT);
}
/*
@@ -49,7 +47,7 @@ static inline unsigned swp_type(swp_entry_t entry)
*/
static inline pgoff_t swp_offset(swp_entry_t entry)
{
- return entry.val & SWP_OFFSET_MASK(entry);
+ return entry.val & SWP_OFFSET_MASK;
}
#ifdef CONFIG_MMU
@@ -90,16 +88,13 @@ static inline swp_entry_t radix_to_swp_entry(void *arg)
{
swp_entry_t entry;
- entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ entry.val = xa_to_value(arg);
return entry;
}
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
- unsigned long value;
-
- value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
- return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ return xa_mk_value(entry.val);
}
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 9e4c86853fa4..5c8acfc4ff55 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -5,9 +5,111 @@
* eXtensible Arrays
* Copyright (c) 2017 Microsoft Corporation
* Author: Matthew Wilcox <willy@infradead.org>
+ *
+ * See Documentation/core-api/xarray.rst for how to use the XArray.
*/
+#include <linux/bug.h>
#include <linux/spinlock.h>
+#include <linux/types.h>
+
+/*
+ * The bottom two bits of the entry determine how the XArray interprets
+ * the contents:
+ *
+ * 00: Pointer entry
+ * 10: Internal entry
+ * x1: Value entry or tagged pointer
+ *
+ * Attempting to store internal entries in the XArray is a bug.
+ */
+
+#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1)
+
+/**
+ * xa_mk_value() - Create an XArray entry from an integer.
+ * @v: Value to store in XArray.
+ *
+ * Context: Any context.
+ * Return: An entry suitable for storing in the XArray.
+ */
+static inline void *xa_mk_value(unsigned long v)
+{
+ WARN_ON((long)v < 0);
+ return (void *)((v << 1) | 1);
+}
+
+/**
+ * xa_to_value() - Get value stored in an XArray entry.
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: The value stored in the XArray entry.
+ */
+static inline unsigned long xa_to_value(const void *entry)
+{
+ return (unsigned long)entry >> 1;
+}
+
+/**
+ * xa_is_value() - Determine if an entry is a value.
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: True if the entry is a value, false if it is a pointer.
+ */
+static inline bool xa_is_value(const void *entry)
+{
+ return (unsigned long)entry & 1;
+}
+
+/**
+ * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
+ * @p: Plain pointer.
+ * @tag: Tag value (0, 1 or 3).
+ *
+ * If the user of the XArray prefers, they can tag their pointers instead
+ * of storing value entries. Three tags are available (0, 1 and 3).
+ * These are distinct from the xa_mark_t as they are not replicated up
+ * through the array and cannot be searched for.
+ *
+ * Context: Any context.
+ * Return: An XArray entry.
+ */
+static inline void *xa_tag_pointer(void *p, unsigned long tag)
+{
+ return (void *)((unsigned long)p | tag);
+}
+
+/**
+ * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
+ * @entry: XArray entry.
+ *
+ * If you have stored a tagged pointer in the XArray, call this function
+ * to get the untagged version of the pointer.
+ *
+ * Context: Any context.
+ * Return: A pointer.
+ */
+static inline void *xa_untag_pointer(void *entry)
+{
+ return (void *)((unsigned long)entry & ~3UL);
+}
+
+/**
+ * xa_pointer_tag() - Get the tag stored in an XArray entry.
+ * @entry: XArray entry.
+ *
+ * If you have stored a tagged pointer in the XArray, call this function
+ * to get the tag of that pointer.
+ *
+ * Context: Any context.
+ * Return: A tag.
+ */
+static inline unsigned int xa_pointer_tag(void *entry)
+{
+ return (unsigned long)entry & 3UL;
+}
#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
diff --git a/lib/idr.c b/lib/idr.c
index 729e381e23b4..88419fbc5737 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -338,11 +338,8 @@ EXPORT_SYMBOL(idr_replace);
* by the number of bits in the leaf bitmap before doing a radix tree lookup.
*
* As an optimisation, if there are only a few low bits set in any given
- * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional
- * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits
- * directly in the entry. By being really tricksy, we could store
- * BITS_PER_LONG - 1 bits, but there're diminishing returns after optimising
- * for 0-3 allocated IDs.
+ * leaf, instead of allocating a 128-byte bitmap, we store the bits
+ * directly in the entry.
*
* We allow the radix tree 'exceptional' count to get out of date. Nothing
* in the IDA nor the radix tree code checks it. If it becomes important
@@ -366,12 +363,11 @@ static int ida_get_new_above(struct ida *ida, int start)
struct radix_tree_iter iter;
struct ida_bitmap *bitmap;
unsigned long index;
- unsigned bit, ebit;
+ unsigned bit;
int new;
index = start / IDA_BITMAP_BITS;
bit = start % IDA_BITMAP_BITS;
- ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT;
slot = radix_tree_iter_init(&iter, index);
for (;;) {
@@ -386,25 +382,23 @@ static int ida_get_new_above(struct ida *ida, int start)
return PTR_ERR(slot);
}
}
- if (iter.index > index) {
+ if (iter.index > index)
bit = 0;
- ebit = RADIX_TREE_EXCEPTIONAL_SHIFT;
- }
new = iter.index * IDA_BITMAP_BITS;
bitmap = rcu_dereference_raw(*slot);
- if (radix_tree_exception(bitmap)) {
- unsigned long tmp = (unsigned long)bitmap;
- ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit);
- if (ebit < BITS_PER_LONG) {
- tmp |= 1UL << ebit;
- rcu_assign_pointer(*slot, (void *)tmp);
- return new + ebit -
- RADIX_TREE_EXCEPTIONAL_SHIFT;
+ if (xa_is_value(bitmap)) {
+ unsigned long tmp = xa_to_value(bitmap);
+ int vbit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE,
+ bit);
+ if (vbit < BITS_PER_XA_VALUE) {
+ tmp |= 1UL << vbit;
+ rcu_assign_pointer(*slot, xa_mk_value(tmp));
+ return new + vbit;
}
bitmap = this_cpu_xchg(ida_bitmap, NULL);
if (!bitmap)
return -EAGAIN;
- bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ bitmap->bitmap[0] = tmp;
rcu_assign_pointer(*slot, bitmap);
}
@@ -425,17 +419,14 @@ static int ida_get_new_above(struct ida *ida, int start)
new += bit;
if (new < 0)
return -ENOSPC;
- if (ebit < BITS_PER_LONG) {
- bitmap = (void *)((1UL << ebit) |
- RADIX_TREE_EXCEPTIONAL_ENTRY);
- radix_tree_iter_replace(root, &iter, slot,
- bitmap);
- return new;
+ if (bit < BITS_PER_XA_VALUE) {
+ bitmap = xa_mk_value(1UL << bit);
+ } else {
+ bitmap = this_cpu_xchg(ida_bitmap, NULL);
+ if (!bitmap)
+ return -EAGAIN;
+ __set_bit(bit, bitmap->bitmap);
}
- bitmap = this_cpu_xchg(ida_bitmap, NULL);
- if (!bitmap)
- return -EAGAIN;
- __set_bit(bit, bitmap->bitmap);
radix_tree_iter_replace(root, &iter, slot, bitmap);
}
@@ -457,9 +448,9 @@ static void ida_remove(struct ida *ida, int id)
goto err;
bitmap = rcu_dereference_raw(*slot);
- if (radix_tree_exception(bitmap)) {
+ if (xa_is_value(bitmap)) {
btmp = (unsigned long *)slot;
- offset += RADIX_TREE_EXCEPTIONAL_SHIFT;
+ offset += 1; /* Intimate knowledge of the value encoding */
if (offset >= BITS_PER_LONG)
goto err;
} else {
@@ -470,9 +461,8 @@ static void ida_remove(struct ida *ida, int id)
__clear_bit(offset, btmp);
radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE);
- if (radix_tree_exception(bitmap)) {
- if (rcu_dereference_raw(*slot) ==
- (void *)RADIX_TREE_EXCEPTIONAL_ENTRY)