// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/mmu_notifier.c
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
* Christoph Lameter <cl@linux.com>
*/
#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);
#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
.name = "mmu_notifier_invalidate_range_start"
};
#endif
/*
* The mmu_notifier_subscriptions structure is allocated and installed in
* mm->notifier_subscriptions inside the mm_take_all_locks() protected
* critical section and it's released only when mm_count reaches zero
* in mmdrop().
*/
struct mmu_notifier_subscriptions {
/* all mmu notifiers registered in this mm are queued in this list */
struct hlist_head list;
bool has_itree;
/* to serialize the list modifications and hlist_unhashed */
spinlock_t lock;
unsigned long invalidate_seq;
unsigned long active_invalidate_ranges;
struct rb_root_cached itree;
wait_queue_head_t wq;
struct hlist_head deferred_list;
};
/*
* This is a collision-retry read-side/write-side 'lock', a lot like a
* seqcount, however this allows multiple write-sides to hold it at
* once. Conceptually the write side is protecting the values of the PTEs in
* this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
* writer exists.
*
* Note that the core mm creates nested invalidate_range_start()/end() regions
* within the same thread, and runs invalidate_range_start()/end() in parallel
* on multiple CPUs. This is designed to not reduce concurrency or block
* progress on the mm side.
*
* As a secondary function, holding the full write side also serves to prevent
* writers for the itree, this is an optimization to avoid extra locking
* during invalidate_range_start/end notifiers.
*
* The write side has two states, fully excluded:
* - mm->active_invalidate_ranges != 0
* - subscriptions->invalidate_seq & 1 == True (odd)
* - some range on the mm_struct is being invalidated
* - the itree is not allowed to change
*
* And partially excluded:
* - mm->active_invalidate_ranges != 0
* - subscriptions->invalidate_seq & 1 == False (even)
* - some range on the mm_struct is being invalidated
* - the itree is allowed to change
*
* Operations on notifier_subscriptions->invalidate_seq (under spinlock):
* seq |= 1 # Begin writing
* seq++ # Release the writing state
* seq & 1 # True if a writer exists
*
* The later state avoids some expensive work on inv_end in the common case of
* no mmu_interval_notifier monitoring the VA.
*/
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
lockdep_assert_held(&subscriptions->lock);
return subscriptions->invalidate_seq & 1;
}
static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range,
unsigned long *seq)
{
struct interval_tree_node *node;
struct mmu_interval_notifier *res = NULL;
spin_lock(&subscriptions->lock);
subscriptions->active_invalidate_ranges++;
node = interval_tree_iter_first(&subscriptions->itree, range->start,
range->end - 1);
if (node) {
subscriptions->invalidate_seq |= 1;
res = container_of(node, struct mmu_interval_notifier,
interval_tree);
}
*seq = subscriptions->invalidate_seq;
spin_unlock(&subscriptions->lock);
return res;
}
static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
const struct mmu_notifier_range *range)
{
struct interval_tree_node *node;
node = interval_tree_iter_next(&interval_sub->interval_tree,
range->start, range->end - 1);
if (!node)
return NULL;
return container_of(node, struct mmu_interval_notifier, interval_tree);
}
static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
struct mmu_interval_notifier *interval_sub;
struct hlist_node *next;
spin_lock(&subscriptions->lock);
if (--subscriptions->active_invalidate_ranges ||
!mn_itree_is_invalidating(subscriptions)) {
spin_unlock(&subscriptions->lock);
return;
}
/* Make invalidate_seq even */
subscriptions->invalidate_seq++;
/*
* The inv_end incorporates a deferred mechanism like rtnl_unlock().
* Adds and removes are queued until the final inv_end happens then
* they are progressed. This arrangement for tree updates is used to
* avoid using a blocking lock during invalidate_range_start.
*/
hlist_for_each_entry_safe(interval_sub, next,
&subscriptions->deferred_list,
deferred_item) {
if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
interval_tree_insert(&interval_sub->interval_tree,
&subscriptions->itree);
else
interval_tree_remove(&interval_sub->interval_tree,
&subscriptions->itree);
hlist_del(&interval_sub->deferred_item);
}