Commit 549af008 authored by Parav Pandit's avatar Parav Pandit Committed by Jason Gunthorpe
Browse files

IB/core: Avoid deadlock during netlink message handling

When rdmacm module is not loaded, and when netlink message is received to
get char device info, it results into a deadlock due to recursive locking
of rdma_nl_mutex with the below call sequence.

[..]
  rdma_nl_rcv()
  mutex_lock()
   [..]
   rdma_nl_rcv_msg()
      ib_get_client_nl_info()
         request_module()
           iw_cm_init()
             rdma_nl_register()
               mutex_lock(); <- Deadlock, acquiring mutex again

Due to above call sequence, following call trace and deadlock is observed.

  kernel: __mutex_lock+0x35e/0x860
  kernel: ? __mutex_lock+0x129/0x860
  kernel: ? rdma_nl_register+0x1a/0x90 [ib_core]
  kernel: rdma_nl_register+0x1a/0x90 [ib_core]
  kernel: ? 0xffffffffc029b000
  kernel: iw_cm_init+0x34/0x1000 [iw_cm]
  kernel: do_one_initcall+0x67/0x2d4
  kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0
  kernel: do_init_module+0x5a/0x223
  kernel: load_module+0x1998/0x1e10
  kernel: ? __symbol_put+0x60/0x60
  kernel: __do_sys_finit_module+0x94/0xe0
  kernel: do_syscall_64+0x5a/0x270
  kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe

  process stack trace:
  [<0>] __request_module+0x1c9/0x460
  [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core]
  [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core]
  [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core]
  [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core]
  [<0>] netlink_unicast+0x179/0x220
  [<0>] netlink_sendmsg+0x2f6/0x3f0
  [<0>] sock_sendmsg+0x30/0x40
  [<0>] ___sys_sendmsg+0x27a/0x290
  [<0>] __sys_sendmsg+0x58/0xa0
  [<0>] do_syscall_64+0x5a/0x270
  [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe

To overcome this deadlock and to allow multiple netlink messages to
progress in parallel, following scheme is implemented.

1. Split the lock protecting the cb_table into a per-index lock, and make
   it a rwlock. This lock is used to ensure no callbacks are running after
   unregistration returns. Since a module will not be registered once it
   is already running callbacks, this avoids the deadlock.

2. Use smp_store_release() to update the cb_table during registration so
   that no lock is required. This avoids lockdep problems with thinking
   all the rwsems are the same lock class.

Fixes: 0e2d00eb ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload")
Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org


Signed-off-by: default avatarParav Pandit <parav@mellanox.com>
Signed-off-by: default avatarLeon Romanovsky <leonro@mellanox.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
parent a15542bb
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -199,6 +199,7 @@ void ib_mad_cleanup(void);
int ib_sa_init(void);
void ib_sa_cleanup(void);

void rdma_nl_init(void);
void rdma_nl_exit(void);

int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+2 −0
Original line number Diff line number Diff line
@@ -2716,6 +2716,8 @@ static int __init ib_core_init(void)
		goto err_comp_unbound;
	}

	rdma_nl_init();

	ret = addr_init();
	if (ret) {
		pr_warn("Could't init IB address resolution\n");
+53 −54
Original line number Diff line number Diff line
@@ -42,9 +42,12 @@
#include <linux/module.h>
#include "core_priv.h"

static DEFINE_MUTEX(rdma_nl_mutex);
static struct {
	const struct rdma_nl_cbs *cb_table;
	/* Synchronizes between ongoing netlink commands and netlink client
	 * unregistration.
	 */
	struct rw_semaphore sem;
} rdma_nl_types[RDMA_NL_NUM_CLIENTS];

bool rdma_nl_chk_listeners(unsigned int group)
@@ -75,70 +78,53 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op)
	return (op < max_num_ops[type]) ? true : false;
}

static bool
is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op)
static const struct rdma_nl_cbs *
get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op)
{
	const struct rdma_nl_cbs *cb_table;

	if (!is_nl_msg_valid(type, op))
		return false;

	/*
	 * Currently only NLDEV client is supporting netlink commands in
	 * non init_net net namespace.
	 */
	if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV)
		return false;
		return NULL;

	if (!rdma_nl_types[type].cb_table) {
		mutex_unlock(&rdma_nl_mutex);
		request_module("rdma-netlink-subsys-%d", type);
		mutex_lock(&rdma_nl_mutex);
	}
	cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
	if (!cb_table) {
		/*
		 * Didn't get valid reference of the table, attempt module
		 * load once.
		 */
		up_read(&rdma_nl_types[type].sem);

	cb_table = rdma_nl_types[type].cb_table;
		request_module("rdma-netlink-subsys-%d", type);

		down_read(&rdma_nl_types[type].sem);
		cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
	}
	if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
		return false;
	return true;
		return NULL;
	return cb_table;
}

void rdma_nl_register(unsigned int index,
		      const struct rdma_nl_cbs cb_table[])
{
	mutex_lock(&rdma_nl_mutex);
	if (!is_nl_msg_valid(index, 0)) {
		/*
		 * All clients are not interesting in success/failure of
		 * this call. They want to see the print to error log and
		 * continue their initialization. Print warning for them,
		 * because it is programmer's error to be here.
		 */
		mutex_unlock(&rdma_nl_mutex);
		WARN(true,
		     "The not-valid %u index was supplied to RDMA netlink\n",
		     index);
	if (WARN_ON(!is_nl_msg_valid(index, 0)) ||
	    WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table)))
		return;
	}

	if (rdma_nl_types[index].cb_table) {
		mutex_unlock(&rdma_nl_mutex);
		WARN(true,
		     "The %u index is already registered in RDMA netlink\n",
		     index);
		return;
	}

	rdma_nl_types[index].cb_table = cb_table;
	mutex_unlock(&rdma_nl_mutex);
	/* Pairs with the READ_ONCE in is_nl_valid() */
	smp_store_release(&rdma_nl_types[index].cb_table, cb_table);
}
EXPORT_SYMBOL(rdma_nl_register);

void rdma_nl_unregister(unsigned int index)
{
	mutex_lock(&rdma_nl_mutex);
	down_write(&rdma_nl_types[index].sem);
	rdma_nl_types[index].cb_table = NULL;
	mutex_unlock(&rdma_nl_mutex);
	up_write(&rdma_nl_types[index].sem);
}
EXPORT_SYMBOL(rdma_nl_unregister);

@@ -170,15 +156,21 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
	unsigned int index = RDMA_NL_GET_CLIENT(type);
	unsigned int op = RDMA_NL_GET_OP(type);
	const struct rdma_nl_cbs *cb_table;
	int err = -EINVAL;

	if (!is_nl_valid(skb, index, op))
	if (!is_nl_msg_valid(index, op))
		return -EINVAL;

	cb_table = rdma_nl_types[index].cb_table;
	down_read(&rdma_nl_types[index].sem);
	cb_table = get_cb_table(skb, index, op);
	if (!cb_table)
		goto done;

	if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
	    !netlink_capable(skb, CAP_NET_ADMIN))
		return -EPERM;
	    !netlink_capable(skb, CAP_NET_ADMIN)) {
		err = -EPERM;
		goto done;
	}

	/*
	 * LS responses overload the 0x100 (NLM_F_ROOT) flag.  Don't
@@ -186,8 +178,8 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
	 */
	if (index == RDMA_NL_LS) {
		if (cb_table[op].doit)
			return cb_table[op].doit(skb, nlh, extack);
		return -EINVAL;
			err = cb_table[op].doit(skb, nlh, extack);
		goto done;
	}
	/* FIXME: Convert IWCM to properly handle doit callbacks */
	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
@@ -195,14 +187,15 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
			.dump = cb_table[op].dump,
		};
		if (c.dump)
			return netlink_dump_start(skb->sk, skb, nlh, &c);
		return -EINVAL;
			err = netlink_dump_start(skb->sk, skb, nlh, &c);
		goto done;
	}

	if (cb_table[op].doit)
		return cb_table[op].doit(skb, nlh, extack);

	return 0;
		err = cb_table[op].doit(skb, nlh, extack);
done:
	up_read(&rdma_nl_types[index].sem);
	return err;
}

/*
@@ -263,9 +256,7 @@ skip:

static void rdma_nl_rcv(struct sk_buff *skb)
{
	mutex_lock(&rdma_nl_mutex);
	rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
	mutex_unlock(&rdma_nl_mutex);
}

int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid)
@@ -297,6 +288,14 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
}
EXPORT_SYMBOL(rdma_nl_multicast);

void rdma_nl_init(void)
{
	int idx;

	for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
		init_rwsem(&rdma_nl_types[idx].sem);
}

void rdma_nl_exit(void)
{
	int idx;