Commit f25a546e authored by Jason Gunthorpe

RDMA/odp: Use mmu_interval_notifier_insert()

Replace the internal interval tree based mmu notifier with the new common
mmu_interval_notifier_insert() API. This removes a lot of code and fixes a
deadlock that can be triggered in ODP:

 zap_page_range()
  mmu_notifier_invalidate_range_start()
   [..]
    ib_umem_notifier_invalidate_range_start()
       down_read(&per_mm->umem_rwsem)
  unmap_single_vma()
    [..]
      __split_huge_page_pmd()
        mmu_notifier_invalidate_range_start()
        [..]
           ib_umem_notifier_invalidate_range_start()
              down_read(&per_mm->umem_rwsem)   // DEADLOCK

        mmu_notifier_invalidate_range_end()
           up_read(&per_mm->umem_rwsem)
  mmu_notifier_invalidate_range_end()
     up_read(&per_mm->umem_rwsem)

The umem_rwsem is held across the range_start/end pair because the ODP
algorithm for invalidate_range_end cannot tolerate changes to the interval
tree while an invalidation is in flight. However, with nested invalidation
regions the second down_read() can deadlock: if a writer queues between the
two read acquisitions, the inner down_read() blocks behind that writer while
the writer blocks behind the outer reader. The new core code provides an
alternative scheme that solves this problem.
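
The replacement is a sequence-count based collision-retry scheme that never
holds a lock across the start/end pair. A simplified sketch of the two sides,
using the mmu_interval_notifier API this series converts to (the real call
sites are in the hunks below; umem_odp->notifier and umem_odp->umem_mutex are
the ODP names used there):

 /* Invalidation side: publish the new sequence and zap under the driver lock */
 static bool invalidate(struct mmu_interval_notifier *mni,
                        const struct mmu_notifier_range *range,
                        unsigned long cur_seq)
 {
         struct ib_umem_odp *umem_odp =
                 container_of(mni, struct ib_umem_odp, notifier);

         mutex_lock(&umem_odp->umem_mutex);
         mmu_interval_set_seq(mni, cur_seq);
         /* ... zap the HW mappings covered by range->start/range->end ... */
         mutex_unlock(&umem_odp->umem_mutex);
         return true;
 }

 /* Fault side: sample the sequence, fault pages unlocked, retry on collision */
 again:
         seq = mmu_interval_read_begin(&umem_odp->notifier);
         /* ... ib_umem_odp_map_dma_pages() faults and DMA maps unlocked ... */
         mutex_lock(&umem_odp->umem_mutex);
         if (mmu_interval_read_retry(&umem_odp->notifier, seq)) {
                 /* an invalidation ran in parallel; discard and try again */
                 mutex_unlock(&umem_odp->umem_mutex);
                 goto again;
         }
         /* ... install the mapping in the device MTTs ... */
         mutex_unlock(&umem_odp->umem_mutex);

Because the fault side re-checks the sequence under umem_mutex, a racing
invalidation either completes before the check (and the fault retries) or
waits on umem_mutex until the mapping is installed, so nothing has to be
held across the nested range_start/end calls.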

Fixes: ca748c39 ("RDMA/umem: Get rid of per_mm->notifier_count")
Link: https://lore.kernel.org/r/20191112202231.3856-6-jgg@ziepe.ca


Tested-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent 107e8998
+0 −1
@@ -2617,7 +2617,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
	SET_DEVICE_OP(dev_ops, get_vf_config);
	SET_DEVICE_OP(dev_ops, get_vf_stats);
	SET_DEVICE_OP(dev_ops, init_port);
	SET_DEVICE_OP(dev_ops, invalidate_range);
	SET_DEVICE_OP(dev_ops, iw_accept);
	SET_DEVICE_OP(dev_ops, iw_add_ref);
	SET_DEVICE_OP(dev_ops, iw_connect);
+41 −262
@@ -48,197 +48,33 @@

#include "uverbs.h"

static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
				   const struct mmu_interval_notifier_ops *ops)
{
	mutex_lock(&umem_odp->umem_mutex);
	if (umem_odp->notifiers_count++ == 0)
		/*
		 * Initialize the completion object for waiting on
		 * notifiers. Since notifier_count is zero, no one should be
		 * waiting right now.
		 */
		reinit_completion(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
{
	mutex_lock(&umem_odp->umem_mutex);
	/*
	 * This sequence increase will notify the QP page fault that the page
	 * that is going to be mapped in the spte could have been freed.
	 */
	++umem_odp->notifiers_seq;
	if (--umem_odp->notifiers_count == 0)
		complete_all(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);
	struct rb_node *node;

	down_read(&per_mm->umem_rwsem);
	if (!per_mm->mn.users)
		goto out;

	for (node = rb_first_cached(&per_mm->umem_tree); node;
	     node = rb_next(node)) {
		struct ib_umem_odp *umem_odp =
			rb_entry(node, struct ib_umem_odp, interval_tree.rb);

		/*
		 * Increase the number of notifiers running, to prevent any
		 * further fault handling on this MR.
		 */
		ib_umem_notifier_start_account(umem_odp);
		complete_all(&umem_odp->notifier_completion);
		umem_odp->umem.ibdev->ops.invalidate_range(
			umem_odp, ib_umem_start(umem_odp),
			ib_umem_end(umem_odp));
	}

out:
	up_read(&per_mm->umem_rwsem);
}

static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
					     u64 start, u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->umem.ibdev->ops.invalidate_range(item, start, end);
	return 0;
}

static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);
	int rc;

	if (mmu_notifier_range_blockable(range))
		down_read(&per_mm->umem_rwsem);
	else if (!down_read_trylock(&per_mm->umem_rwsem))
		return -EAGAIN;

	if (!per_mm->mn.users) {
		up_read(&per_mm->umem_rwsem);
		/*
		 * At this point users is permanently zero and visible to this
		 * CPU without a lock, that fact is relied on to skip the unlock
		 * in range_end.
		 */
		return 0;
	}

	rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
					   range->end,
					   invalidate_range_start_trampoline,
					   mmu_notifier_range_blockable(range),
					   NULL);
	if (rc)
		up_read(&per_mm->umem_rwsem);
	return rc;
}

static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	if (unlikely(!per_mm->mn.users))
		return;

	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
				      range->end,
				      invalidate_range_end_trampoline, true, NULL);
	up_read(&per_mm->umem_rwsem);
}

static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm;

	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
	if (!per_mm)
		return ERR_PTR(-ENOMEM);

	per_mm->umem_tree = RB_ROOT_CACHED;
	init_rwsem(&per_mm->umem_rwsem);

	WARN_ON(mm != current->mm);
	rcu_read_lock();
	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	rcu_read_unlock();
	return &per_mm->mn;
}

static void ib_umem_free_notifier(struct mmu_notifier *mn)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));

	put_pid(per_mm->tgid);
	kfree(per_mm);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
	.alloc_notifier		    = ib_umem_alloc_notifier,
	.free_notifier		    = ib_umem_free_notifier,
};

static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm;
	struct mmu_notifier *mn;
	int ret;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	if (!umem_odp->is_implicit_odp) {
		size_t page_size = 1UL << umem_odp->page_shift;
		unsigned long start;
		unsigned long end;
		size_t pages;

		umem_odp->interval_tree.start =
			ALIGN_DOWN(umem_odp->umem.address, page_size);
		start = ALIGN_DOWN(umem_odp->umem.address, page_size);
		if (check_add_overflow(umem_odp->umem.address,
				       (unsigned long)umem_odp->umem.length,
				       &umem_odp->interval_tree.last))
				       &end))
			return -EOVERFLOW;
		umem_odp->interval_tree.last =
			ALIGN(umem_odp->interval_tree.last, page_size);
		if (unlikely(umem_odp->interval_tree.last < page_size))
		end = ALIGN(end, page_size);
		if (unlikely(end < page_size))
			return -EOVERFLOW;

		pages = (umem_odp->interval_tree.last -
			 umem_odp->interval_tree.start) >>
			umem_odp->page_shift;
		pages = (end - start) >> umem_odp->page_shift;
		if (!pages)
			return -EINVAL;

		/*
		 * Note that the representation of the intervals in the
		 * interval tree considers the ending point as contained in
		 * the interval.
		 */
		umem_odp->interval_tree.last--;

		umem_odp->page_list = kvcalloc(
			pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
		if (!umem_odp->page_list)
@@ -250,26 +86,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
			ret = -ENOMEM;
			goto out_page_list;
		}
	}

	mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
	if (IS_ERR(mn)) {
		ret = PTR_ERR(mn);
		ret = mmu_interval_notifier_insert(&umem_odp->notifier,
						   umem_odp->umem.owning_mm,
						   start, end - start, ops);
		if (ret)
			goto out_dma_list;
	}
	umem_odp->per_mm = per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	mutex_init(&umem_odp->umem_mutex);
	init_completion(&umem_odp->notifier_completion);

	if (!umem_odp->is_implicit_odp) {
		down_write(&per_mm->umem_rwsem);
		interval_tree_insert(&umem_odp->interval_tree,
				     &per_mm->umem_tree);
		up_write(&per_mm->umem_rwsem);
	}
	mmgrab(umem_odp->umem.owning_mm);

	return 0;

@@ -305,8 +128,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,

	if (!context)
		return ERR_PTR(-EIO);
	if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
	if (!umem_odp)
@@ -318,8 +139,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
	umem_odp->is_implicit_odp = 1;
	umem_odp->page_shift = PAGE_SHIFT;

	ret = ib_init_umem_odp(umem_odp);
	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, NULL);
	if (ret) {
		put_pid(umem_odp->tgid);
		kfree(umem_odp);
		return ERR_PTR(ret);
	}
@@ -336,8 +159,10 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 */
struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
					    unsigned long addr, size_t size)
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * Caller must ensure that root cannot be freed during the call to
@@ -360,9 +185,12 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
	umem->writable   = root->umem.writable;
	umem->owning_mm  = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	ret = ib_init_umem_odp(odp_data);
	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret) {
		put_pid(odp_data->tgid);
		kfree(odp_data);
		return ERR_PTR(ret);
	}
@@ -383,7 +211,8 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
 * conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
				    size_t size, int access)
				    size_t size, int access,
				    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;
	struct ib_ucontext *context;
@@ -398,8 +227,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
	if (!context)
		return ERR_PTR(-EIO);

	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
	    WARN_ON_ONCE(!context->device->ops.invalidate_range))
	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
@@ -411,6 +239,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
	umem_odp->umem.address = addr;
	umem_odp->umem.writable = ib_access_writable(access);
	umem_odp->umem.owning_mm = mm = current->mm;
	umem_odp->notifier.ops = ops;

	umem_odp->page_shift = PAGE_SHIFT;
	if (access & IB_ACCESS_HUGETLB) {
@@ -429,11 +258,14 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
		up_read(&mm->mmap_sem);
	}

	ret = ib_init_umem_odp(umem_odp);
	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, ops);
	if (ret)
		goto err_free;
		goto err_put_pid;
	return umem_odp;

err_put_pid:
	put_pid(umem_odp->tgid);
err_free:
	kfree(umem_odp);
	return ERR_PTR(ret);
@@ -442,8 +274,6 @@ EXPORT_SYMBOL(ib_umem_odp_get);

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
@@ -455,28 +285,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
					    ib_umem_end(umem_odp));
		mutex_unlock(&umem_odp->umem_mutex);
		mmu_interval_notifier_remove(&umem_odp->notifier);
		kvfree(umem_odp->dma_list);
		kvfree(umem_odp->page_list);
		put_pid(umem_odp->tgid);
	}

	down_write(&per_mm->umem_rwsem);
	if (!umem_odp->is_implicit_odp) {
		interval_tree_remove(&umem_odp->interval_tree,
				     &per_mm->umem_tree);
		complete_all(&umem_odp->notifier_completion);
	}
	/*
	 * NOTE! mmu_notifier_unregister() can happen between a start/end
	 * callback, resulting in a missing end, and thus an unbalanced
	 * lock. This doesn't really matter to us since we are about to kfree
	 * the memory that holds the lock, however LOCKDEP doesn't like this.
	 * Thus we call the mmu_notifier_put under the rwsem and test the
	 * internal users count to reliably see if we are past this point.
	 */
	mmu_notifier_put(&per_mm->mn);
	up_write(&per_mm->umem_rwsem);

	mmdrop(umem_odp->umem.owning_mm);
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
@@ -501,7 +314,7 @@ EXPORT_SYMBOL(ib_umem_odp_release);
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		int page_index,
		unsigned int page_index,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
@@ -510,12 +323,7 @@ static int ib_umem_odp_map_dma_single_page(
	dma_addr_t dma_addr;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle case of a racing notifier. This check also allows us to bail
	 * early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
	if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
@@ -618,7 +426,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
	 * existing beyond the lifetime of the originating process.. Presumably
	 * mmget_not_zero will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
@@ -762,32 +570,3 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

/* @last is not a part of the interval. See comment for function
 * node_last.
 */
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
				  u64 start, u64 last,
				  umem_call_back cb,
				  bool blockable,
				  void *cookie)
{
	int ret_val = 0;
	struct interval_tree_node *node, *next;
	struct ib_umem_odp *umem;

	if (unlikely(start == last))
		return ret_val;

	for (node = interval_tree_iter_first(root, start, last - 1);
			node; node = next) {
		/* TODO move the blockable decision up to the callback */
		if (!blockable)
			return -EAGAIN;
		next = interval_tree_iter_next(node, start, last - 1);
		umem = container_of(node, struct ib_umem_odp, interval_tree);
		ret_val = cb(umem, start, last, cookie) || ret_val;
	}

	return ret_val;
}
+2 −5
@@ -1263,8 +1263,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
			      unsigned long end);
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
@@ -1294,11 +1292,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
{
	return -EOPNOTSUPP;
}
static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
					    unsigned long start,
					    unsigned long end){};
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */

extern const struct mmu_interval_notifier_ops mlx5_mn_ops;

/* Needed for rep profile */
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
		      const struct mlx5_ib_profile *profile,
+2 −1
@@ -743,7 +743,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
	if (access_flags & IB_ACCESS_ON_DEMAND) {
		struct ib_umem_odp *odp;

		odp = ib_umem_odp_get(udata, start, length, access_flags);
		odp = ib_umem_odp_get(udata, start, length, access_flags,
				      &mlx5_mn_ops);
		if (IS_ERR(odp)) {
			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
				    PTR_ERR(odp));
+23 −27
@@ -241,17 +241,26 @@ out_unlock:
	xa_unlock(&imr->implicit_children);
}

void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
			      unsigned long end)
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
@@ -260,8 +269,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
		goto out;
	mr = umem_odp->private;

	start = max_t(u64, ib_umem_start(umem_odp), start);
	end = min_t(u64, ib_umem_end(umem_odp), end);
	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -312,8 +321,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;
@@ -414,7 +428,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE);
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

@@ -600,8 +614,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int current_seq, page_shift, ret, np;
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	unsigned long current_seq;
	u64 access_mask;
	u64 start_idx, page_mask;

@@ -613,12 +628,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();
	current_seq = mmu_interval_read_begin(&odp->notifier);

	np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
				       current_seq);
@@ -626,7 +636,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
		return np;

	mutex_lock(&odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
	if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
@@ -656,19 +666,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
	return np << (page_shift - PAGE_SHIFT);

out:
	if (ret == -EAGAIN) {
		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

		if (!wait_for_completion_timeout(&odp->notifier_completion,
						 timeout)) {
			mlx5_ib_warn(
				mr->dev,
				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
				current_seq, odp->notifiers_seq,
				odp->notifiers_count);
		}
	}

	return ret;
}

@@ -1609,7 +1606,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)

static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
	.invalidate_range = mlx5_ib_invalidate_range,
};

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)