Commit c9467447 authored by Gerd Rausch's avatar Gerd Rausch Committed by David S. Miller
Browse files

net/rds: Get rid of "wait_clean_list_grace" and add locking



Waiting for activity on the "clean_list" to quiesce is no substitute
for proper locking.

We can have multiple threads competing for "llist_del_first"
via "rds_ib_reuse_mr", and a single thread competing
for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool".

Since "llist_del_first" depends on "list->first->next" not to change
in the midst of the operation, simply waiting for all current calls
to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate:

By the time "wait_clean_list_grace" is done iterating over all CPUs to see
that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may
have just shown up on the first CPU.

Furthermore, <linux/llist.h> explicitly calls out the need for locking:
 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer,
 * and llist_del_first or llist_del_all used in other consumers,
 * then a lock is needed.

Also, while at it, drop the unused "pool" parameter
from "list_to_llist_nodes".

Signed-off-by: default avatarGerd Rausch <gerd.rausch@oracle.com>
Acked-by: default avatarSantosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 2c7da8e6
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -98,6 +98,7 @@ struct rds_ib_mr_pool {
	struct llist_head	free_list;	/* unused MRs */
	struct llist_head	clean_list;	/* unused & unmapped MRs */
	wait_queue_head_t	flush_wait;
	spinlock_t		clean_lock;	/* "clean_list" concurrency */

	atomic_t		free_pinned;	/* memory pinned by free MRs */
	unsigned long		max_items;
+18 −38
Original line number Diff line number Diff line
@@ -40,9 +40,6 @@

struct workqueue_struct *rds_ib_mr_wq;

static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0

static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
	struct rds_ib_device *rds_ibdev;
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
	struct rds_ib_mr *ibmr = NULL;
	struct llist_node *ret;
	unsigned long *flag;
	unsigned long flags;

	preempt_disable();
	flag = this_cpu_ptr(&clean_list_grace);
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
	spin_lock_irqsave(&pool->clean_lock, flags);
	ret = llist_del_first(&pool->clean_list);
	spin_unlock_irqrestore(&pool->clean_lock, flags);
	if (ret) {
		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
		if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
	}

	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
	return ibmr;
}

static inline void wait_clean_list_grace(void)
{
	int cpu;
	unsigned long *flag;

	for_each_online_cpu(cpu) {
		flag = &per_cpu(clean_list_grace, cpu);
		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
			cpu_relax();
	}
}

void rds_ib_sync_mr(void *trans_private, int direction)
{
	struct rds_ib_mr *ibmr = trans_private;
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
 * of clusters.  Each cluster has linked llist nodes of
 * MR_CLUSTER_SIZE mrs that are ready for reuse.
 */
static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
				struct list_head *list,
static void list_to_llist_nodes(struct list_head *list,
				struct llist_node **nodes_head,
				struct llist_node **nodes_tail)
{
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
	 */
	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
	if (free_all)
	if (free_all) {
		unsigned long flags;

		spin_lock_irqsave(&pool->clean_lock, flags);
		llist_append_to_list(&pool->clean_list, &unmap_list);
		spin_unlock_irqrestore(&pool->clean_lock, flags);
	}

	free_goal = rds_ib_flush_goal(pool, free_all);

@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);

	if (!list_empty(&unmap_list)) {
		/* we have to make sure that none of the things we're about
		 * to put on the clean list would race with other cpus trying
		 * to pull items off.  The llist would explode if we managed to
		 * remove something from the clean list and then add it back again
		 * while another CPU was spinning on that same item in llist_del_first.
		 *
		 * This is pretty unlikely, but just in case  wait for an llist grace period
		 * here before adding anything back into the clean list.
		 */
		wait_clean_list_grace();
		unsigned long flags;

		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
		if (ibmr_ret) {
			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
			clean_nodes = clean_nodes->next;
		}
		/* more than one entry in llist nodes */
		if (clean_nodes)
		if (clean_nodes) {
			spin_lock_irqsave(&pool->clean_lock, flags);
			llist_add_batch(clean_nodes, clean_tail,
					&pool->clean_list);

			spin_unlock_irqrestore(&pool->clean_lock, flags);
		}
	}

	atomic_sub(unpinned, &pool->free_pinned);
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
	init_llist_head(&pool->free_list);
	init_llist_head(&pool->drop_list);
	init_llist_head(&pool->clean_list);
	spin_lock_init(&pool->clean_lock);
	mutex_init(&pool->flush_lock);
	init_waitqueue_head(&pool->flush_wait);
	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);