summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-18 20:10:44 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-18 20:10:44 -0800
commit4c929feed7e9ce69efbe85e3932393db67fbce76 (patch)
treee71435174ea1c22e98c93d3c0f93598d5841ce02
parent018cb13eb33383cbc3fb6d3a286ef32ecb816779 (diff)
parenta7cfef21e3d066343bec14d3113a9f9c92d1c2a8 (diff)
Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
Pull infiniband updates from Roland Dreier: "Main batch of InfiniBand/RDMA changes for 3.19: - On-demand paging support in core midlayer and mlx5 driver. This lets userspace create non-pinned memory regions and have the adapter HW trigger page faults. - iSER and IPoIB updates and fixes. - Low-level HW driver updates for cxgb4, mlx4 and ocrdma. - Other miscellaneous fixes" * tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (56 commits) IB/mlx5: Implement on demand paging by adding support for MMU notifiers IB/mlx5: Add support for RDMA read/write responder page faults IB/mlx5: Handle page faults IB/mlx5: Page faults handling infrastructure IB/mlx5: Add mlx5_ib_update_mtt to update page tables after creation IB/mlx5: Changes in memory region creation to support on-demand paging IB/mlx5: Implement the ODP capability query verb mlx5_core: Add support for page faults events and low level handling mlx5_core: Re-add MLX5_DEV_CAP_FLAG_ON_DMND_PG flag IB/srp: Allow newline separator for connection string IB/core: Implement support for MMU notifiers regarding on demand paging regions IB/core: Add support for on demand paging regions IB/core: Add flags for on demand paging support IB/core: Add support for extended query device caps IB/mlx5: Add function to read WQE from user-space IB/core: Add umem function to read data from user-space IB/core: Replace ib_umem's offset field with a full address IB/mlx5: Enhance UMR support to allow partial page table update IB/mlx5: Remove per-MR pas and dma pointers RDMA/ocrdma: Always resolve destination mac from GRH for UD QPs ...
-rw-r--r--drivers/infiniband/Kconfig11
-rw-r--r--drivers/infiniband/core/Makefile1
-rw-r--r--drivers/infiniband/core/addr.c4
-rw-r--r--drivers/infiniband/core/multicast.c11
-rw-r--r--drivers/infiniband/core/umem.c72
-rw-r--r--drivers/infiniband/core/umem_odp.c668
-rw-r--r--drivers/infiniband/core/umem_rbtree.c94
-rw-r--r--drivers/infiniband/core/uverbs.h1
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c171
-rw-r--r--drivers/infiniband/core/uverbs_main.c5
-rw-r--r--drivers/infiniband/core/verbs.c3
-rw-r--r--drivers/infiniband/hw/amso1100/c2_provider.c2
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c7
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c2
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c28
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c2
-rw-r--r--drivers/infiniband/hw/ehca/ehca_mrmw.c2
-rw-r--r--drivers/infiniband/hw/ipath/ipath_mr.c2
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c1
-rw-r--r--drivers/infiniband/hw/mlx5/Makefile1
-rw-r--r--drivers/infiniband/hw/mlx5/main.c45
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c69
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h116
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c323
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c798
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c197
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c6
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_ah.c5
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c4
-rw-r--r--drivers/infiniband/hw/qib/qib_mr.c2
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h19
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c18
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c27
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c49
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c239
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c22
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.c104
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.h30
-rw-r--r--drivers/infiniband/ulp/iser/iser_initiator.c6
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c102
-rw-r--r--drivers/infiniband/ulp/iser/iser_verbs.c91
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/main.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eq.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fw.c40
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/qp.c119
-rw-r--r--include/linux/mlx5/device.h72
-rw-r--r--include/linux/mlx5/driver.h14
-rw-r--r--include/linux/mlx5/qp.h65
-rw-r--r--include/rdma/ib_umem.h34
-rw-r--r--include/rdma/ib_umem_odp.h160
-rw-r--r--include/rdma/ib_verbs.h54
-rw-r--r--include/uapi/rdma/ib_user_verbs.h29
53 files changed, 3511 insertions, 457 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 77089399359b..b899531498eb 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM
depends on INFINIBAND_USER_ACCESS != n
default y
+config INFINIBAND_ON_DEMAND_PAGING
+ bool "InfiniBand on-demand paging support"
+ depends on INFINIBAND_USER_MEM
+ select MMU_NOTIFIER
+ default y
+ ---help---
+ On demand paging support for the InfiniBand subsystem.
+ Together with driver support this allows registration of
+ memory regions without pinning their pages, fetching the
+ pages on demand instead.
+
config INFINIBAND_ADDR_TRANS
bool
depends on INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ffd0af6734af..acf736764445 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 8172d37f9add..f80da50d84a5 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -176,8 +176,8 @@ static void set_timeout(unsigned long time)
unsigned long delay;
delay = time - jiffies;
- if ((long)delay <= 0)
- delay = 1;
+ if ((long)delay < 0)
+ delay = 0;
mod_delayed_work(addr_wq, &work, delay);
}
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index d2360a8ef0b2..fa17b552ff78 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
if (status)
process_join_error(group, status);
else {
+ int mgids_changed, is_mgid0;
ib_find_pkey(group->port->dev->device, group->port->port_num,
be16_to_cpu(rec->pkey), &pkey_index);
spin_lock_irq(&group->port->lock);
- group->rec = *rec;
if (group->state == MCAST_BUSY &&
group->pkey_index == MCAST_INVALID_PKEY_INDEX)
group->pkey_index = pkey_index;
- if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
rb_erase(&group->node, &group->port->table);
- mcast_insert(group->port, group, 1);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
}
spin_unlock_irq(&group->port->lock);
}
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index df0c4f605a21..aec7a6aa2951 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,6 +39,7 @@
#include <linux/hugetlb.h>
#include <linux/dma-attrs.h>
#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
@@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
/**
* ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If access flags indicate ODP memory, avoid pinning. Instead, stores
+ * the mm for future page fault handling in conjunction with MMU notifiers.
+ *
* @context: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
@@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->context = context;
umem->length = size;
- umem->offset = addr & ~PAGE_MASK;
+ umem->address = addr;
umem->page_size = PAGE_SIZE;
umem->pid = get_task_pid(current, PIDTYPE_PID);
/*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
+ * We ask for writable memory if any of the following
+ * access flags are set. "Local write" and "remote write"
* obviously require write access. "Remote atomic" can do
* things like fetch and add, which will modify memory, and
* "MW bind" can change permissions by binding a window.
*/
- umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+ umem->writable = !!(access &
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+ if (access & IB_ACCESS_ON_DEMAND) {
+ ret = ib_umem_odp_get(context, umem);
+ if (ret) {
+ kfree(umem);
+ return ERR_PTR(ret);
+ }
+ return umem;
+ }
+
+ umem->odp_data = NULL;
/* We assume the memory is from hugetlb until proved otherwise */
umem->hugetlb = 1;
@@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (!vma_list)
umem->hugetlb = 0;
- npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+ npages = ib_umem_num_pages(umem);
down_write(&current->mm->mmap_sem);
@@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem)
struct task_struct *task;
unsigned long diff;
+ if (umem->odp_data) {
+ ib_umem_odp_release(umem);
+ return;
+ }
+
__ib_umem_release(umem->context->device, umem, 1);
task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem)
if (!mm)
goto out;
- diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+ diff = ib_umem_num_pages(umem);
/*
* We may be called with the mm's mmap_sem already held. This
@@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem)
int n;
struct scatterlist *sg;
+ if (umem->odp_data)
+ return ib_umem_num_pages(umem);
+
shift = ilog2(umem->page_size);
n = 0;
@@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem)
return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length)
+{
+ size_t end = offset + length;
+ int ret;
+
+ if (offset > umem->length || length > umem->length - offset) {
+ pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+ offset, umem->length, end);
+ return -EINVAL;
+ }
+
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+ offset + ib_umem_offset(umem));
+
+ if (ret < 0)
+ return ret;
+ else if (ret != length)
+ return -EINVAL;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000000..6095872549e7
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ int notifiers_count = item->odp_data->notifiers_count++;
+
+ if (notifiers_count == 0)
+ /* Initialize the completion object for waiting on
+ * notifiers. Since notifier_count is zero, no one
+ * should be waiting right now. */
+ reinit_completion(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ /*
+ * This sequence increase will notify the QP page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ */
+ ++item->odp_data->notifiers_seq;
+ if (--item->odp_data->notifiers_count == 0)
+ complete_all(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+ atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+ int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+ if (zero_notifiers &&
+ !list_empty(&context->no_private_counters)) {
+ /* No currently running mmu notifiers. Now is the chance to
+ * add private accounting to all previously added umems. */
+ struct ib_umem_odp *odp_data, *next;
+
+ /* Prevent concurrent mmu notifiers from working on the
+ * no_private_counters list. */
+ down_write(&context->umem_rwsem);
+
+ /* Read the notifier_count again, with the umem_rwsem
+ * semaphore taken for write. */
+ if (!atomic_read(&context->notifier_count)) {
+ list_for_each_entry_safe(odp_data, next,
+ &context->no_private_counters,
+ no_private_counters) {
+ mutex_lock(&odp_data->umem_mutex);
+ odp_data->mn_counters_active = true;
+ list_del(&odp_data->no_private_counters);
+ complete_all(&odp_data->notifier_completion);
+ mutex_unlock(&odp_data->umem_mutex);
+ }
+ }
+
+ up_write(&context->umem_rwsem);
+ }
+}
+
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie) {
+ /*
+ * Increase the number of notifiers running, to
+ * prevent any further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(item);
+ item->odp_data->dying = 1;
+ /* Make sure that the fact the umem is dying is out before we release
+ * all pending page faults. */
+ smp_wmb();
+ complete_all(&item->odp_data->notifier_completion);
+ item->context->invalidate_range(item, ib_umem_start(item),
+ ib_umem_end(item));
+ return 0;
+}
+
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
+ ULLONG_MAX,
+ ib_umem_notifier_release_trampoline,
+ NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, start + PAGE_SIZE);
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
+ address + PAGE_SIZE,
+ invalidate_page_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, end);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_start_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_end_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static struct mmu_notifier_ops ib_umem_notifiers = {
+ .release = ib_umem_notifier_release,
+ .invalidate_page = ib_umem_notifier_invalidate_page,
+ .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+ .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+};
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+ int ret_val;
+ struct pid *our_pid;
+ struct mm_struct *mm = get_task_mm(current);
+
+ if (!mm)
+ return -EINVAL;
+
+ /* Prevent creating ODP MRs in child processes */
+ rcu_read_lock();
+ our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ put_pid(our_pid);
+ if (context->tgid != our_pid) {
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+
+ umem->hugetlb = 0;
+ umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+ if (!umem->odp_data) {
+ ret_val = -ENOMEM;
+ goto out_mm;
+ }
+ umem->odp_data->umem = umem;
+
+ mutex_init(&umem->odp_data->umem_mutex);
+
+ init_completion(&umem->odp_data->notifier_completion);
+
+ umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->page_list));
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->dma_list));
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
+
+ /*
+ * When using MMU notifiers, we will get a
+ * notification before the "current" task (and MM) is
+ * destroyed. We use the umem_rwsem semaphore to synchronize.
+ */
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)))
+ umem->odp_data->mn_counters_active = true;
+ else
+ list_add(&umem->odp_data->no_private_counters,
+ &context->no_private_counters);
+ downgrade_write(&context->umem_rwsem);
+
+ if (context->odp_mrs_count == 1) {
+ /*
+ * Note that at this point, no MMU notifier is running
+ * for this context!
+ */
+ atomic_set(&context->notifier_count, 0);
+ INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.ops = &ib_umem_notifiers;
+ /*
+ * Lock-dep detects a false positive for mmap_sem vs.
+ * umem_rwsem, due to not grasping downgrade_write correctly.
+ */
+ lockdep_off();
+ ret_val = mmu_notifier_register(&context->mn, mm);
+ lockdep_on();
+ if (ret_val) {
+ pr_err("Failed to register mmu_notifier %d\n", ret_val);
+ ret_val = -EBUSY;
+ goto out_mutex;
+ }
+ }
+
+ up_read(&context->umem_rwsem);
+
+ /*
+ * Note that doing an mmput can cause a notifier for the relevant mm.
+ * If the notifier is called while we hold the umem_rwsem, this will
+ * cause a deadlock. Therefore, we release the reference only after we
+ * released the semaphore.
+ */
+ mmput(mm);
+ return 0;
+
+out_mutex:
+ up_read(&context->umem_rwsem);
+ vfree(umem->odp_data->dma_list);
+out_page_list:
+ vfree(umem->odp_data->page_list);
+out_odp_data:
+ kfree(umem->odp_data);
+out_mm:
+ mmput(mm);
+ return ret_val;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ down_write(&context->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ context->odp_mrs_count--;
+ if (!umem->odp_data->mn_counters_active) {
+ list_del(&umem->odp_data->no_private_counters);
+ complete_all(&umem->odp_data->notifier_completion);
+ }
+
+ /*
+ * Downgrade the lock to a read lock. This ensures that the notifiers
+ * (who lock the mutex for reading) will be able to finish, and we
+ * will be able to enventually obtain the mmu notifiers SRCU. Note
+ * that since we are doing it atomically, no other user could register
+ * and unregister while we do the check.
+ */
+ downgrade_write(&context->umem_rwsem);
+ if (!context->odp_mrs_count) {
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(context->tgid,
+ PIDTYPE_PID);
+ if (owning_process == NULL)
+ /*
+ * The process is already dead, notifier were removed
+ * already.
+ */
+ goto out;
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL)
+ /*
+ * The process' mm is already dead, notifier were
+ * removed already.
+ */
+ goto out_put_task;
+ mmu_notifier_unregister(&context->mn, owning_mm);
+
+ mmput(owning_mm);
+
+out_put_task:
+ put_task_struct(o