Commit 4809bba7 authored by David S. Miller
Browse files

Merge branch 'net-rds-RDMA-fixes'



Gerd Rausch says:

====================
net/rds: RDMA fixes

A number of net/rds fixes necessary to make "rds_rdma.ko"
pass some basic Oracle internal tests.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents f11fe1da aa494893
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -156,6 +156,7 @@ struct rds_ib_connection {

	/* To control the number of wrs from fastreg */
	atomic_t		i_fastreg_wrs;
	atomic_t		i_fastreg_inuse_count;

	/* interrupt handling */
	struct tasklet_struct	i_send_tasklet;
+8 −1
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"

/*
 * Set the selected protocol version
@@ -526,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = ic->i_send_cq;
	attr.recv_cq = ic->i_recv_cq;
	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);

	/*
	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -993,6 +993,11 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
				ic->i_cm_id, err);
		}

		/* kick off "flush_worker" for all pools in order to reap
		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
		 */
		rds_ib_flush_mrs();

		/*
		 * We want to wait for tx and rx completion to finish
		 * before we tear down the connection, but we have to be
@@ -1005,6 +1010,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
		wait_event(rds_ib_ring_empty_wait,
			   rds_ib_ring_empty(&ic->i_recv_ring) &&
			   (atomic_read(&ic->i_signaled_sends) == 0) &&
			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
		tasklet_kill(&ic->i_send_tasklet);
		tasklet_kill(&ic->i_recv_tasklet);
@@ -1132,6 +1138,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
	spin_lock_init(&ic->i_ack_lock);
#endif
	atomic_set(&ic->i_signaled_sends, 0);
	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);

	/*
	 * rds_ib_conn_shutdown() waits for these to be emptied so they
+76 −8
Original line number Diff line number Diff line
@@ -32,6 +32,24 @@

#include "ib_mr.h"

/* Atomically move an FRMR from "old_state" to "new_state".
 *
 * The state is only changed if it currently equals "old_state"
 * (compare-and-exchange on ibmr->u.frmr.fr_state).  When the region
 * successfully leaves FRMR_IS_INUSE, the owning connection's
 * "i_fastreg_inuse_count" is decremented and any sleeper on
 * "rds_ib_ring_empty_wait" is woken so it can re-evaluate its condition.
 *
 * @ibmr:      memory region whose FRMR state is being changed
 * @old_state: state the region is expected to be in
 * @new_state: state to install if the expectation holds
 */
static inline void
rds_transition_frwr_state(struct rds_ib_mr *ibmr,
			  enum rds_ib_fr_state old_state,
			  enum rds_ib_fr_state new_state)
{
	if (cmpxchg(&ibmr->u.frmr.fr_state,
		    old_state, new_state) == old_state &&
	    old_state == FRMR_IS_INUSE) {
		/* enforce order of ibmr->u.frmr.fr_state update
		 * before decrementing i_fastreg_inuse_count
		 */
		smp_mb__before_atomic();
		atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
		/* atomic_dec() implies no memory barrier: order the
		 * decrement before the lockless waitqueue_active() check,
		 * otherwise a waiter that enqueues itself concurrently may
		 * still observe the old count and never be woken.
		 */
		smp_mb__after_atomic();
		if (waitqueue_active(&rds_ib_ring_empty_wait))
			wake_up(&rds_ib_ring_empty_wait);
	}
}

static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
					   int npages)
{
@@ -75,6 +93,8 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
		pool->max_items_soft = pool->max_items;

	frmr->fr_state = FRMR_IS_FREE;
	init_waitqueue_head(&frmr->fr_inv_done);
	init_waitqueue_head(&frmr->fr_reg_done);
	return ibmr;

out_no_cigar:
@@ -116,13 +136,19 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
	if (unlikely(ret != ibmr->sg_len))
		return ret < 0 ? ret : -EINVAL;

	if (cmpxchg(&frmr->fr_state,
		    FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
		return -EBUSY;

	atomic_inc(&ibmr->ic->i_fastreg_inuse_count);

	/* Perform a WR for the fast_reg_mr. Each individual page
	 * in the sg list is added to the fast reg page list and placed
	 * inside the fast_reg_mr WR.  The key used is a rolling 8bit
	 * counter, which should guarantee uniqueness.
	 */
	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
	frmr->fr_state = FRMR_IS_INUSE;
	frmr->fr_reg = true;

	memset(&reg_wr, 0, sizeof(reg_wr));
	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
@@ -138,12 +164,23 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
	if (unlikely(ret)) {
		/* Failure here can be because of -ENOMEM as well */
		frmr->fr_state = FRMR_IS_STALE;
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);

		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		if (printk_ratelimit())
			pr_warn("RDS/IB: %s returned error(%d)\n",
				__func__, ret);
		goto out;
	}

	/* Wait for the registration to complete in order to prevent an invalid
	 * access error resulting from a race between the memory region already
	 * being accessed while registration is still pending.
	 */
	wait_event(frmr->fr_reg_done, !frmr->fr_reg);

out:

	return ret;
}

@@ -255,12 +292,29 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)

	ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
	if (unlikely(ret)) {
		frmr->fr_state = FRMR_IS_STALE;
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
		frmr->fr_inv = false;
		/* enforce order of frmr->fr_inv update
		 * before incrementing i_fastreg_wrs
		 */
		smp_mb__before_atomic();
		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
		goto out;
	}

	/* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
	 * 1) avoid a silly bouncing between "clean_list" and "drop_list"
	 *    triggered by function "rds_ib_reg_frmr" as it releases frmr
	 *    regions whose state is not "FRMR_IS_FREE" right away.
	 * 2) prevent an invalid access error in a race
	 *    from a pending "IB_WR_LOCAL_INV" operation
	 *    with a teardown ("dma_unmap_sg", "put_page")
	 *    and de-registration ("ib_dereg_mr") of the corresponding
	 *    memory region.
	 */
	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);

out:
	return ret;
}
@@ -271,7 +325,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;

	if (wc->status != IB_WC_SUCCESS) {
		frmr->fr_state = FRMR_IS_STALE;
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
		if (rds_conn_up(ic->conn))
			rds_ib_conn_error(ic->conn,
					  "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
@@ -283,10 +337,20 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
	}

	if (frmr->fr_inv) {
		frmr->fr_state = FRMR_IS_FREE;
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
		frmr->fr_inv = false;
		wake_up(&frmr->fr_inv_done);
	}

	if (frmr->fr_reg) {
		frmr->fr_reg = false;
		wake_up(&frmr->fr_reg_done);
	}

	/* enforce order of frmr->{fr_reg,fr_inv} update
	 * before incrementing i_fastreg_wrs
	 */
	smp_mb__before_atomic();
	atomic_inc(&ic->i_fastreg_wrs);
}

@@ -295,14 +359,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
{
	struct rds_ib_mr *ibmr, *next;
	struct rds_ib_frmr *frmr;
	int ret = 0;
	int ret = 0, ret2;
	unsigned int freed = *nfreed;

	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
	list_for_each_entry(ibmr, list, unmap_list) {
		if (ibmr->sg_dma_len)
			ret |= rds_ib_post_inv(ibmr);
		if (ibmr->sg_dma_len) {
			ret2 = rds_ib_post_inv(ibmr);
			if (ret2 && !ret)
				ret = ret2;
		}
	}

	if (ret)
		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);

+4 −0
Original line number Diff line number Diff line
@@ -57,6 +57,9 @@ struct rds_ib_frmr {
	struct ib_mr		*mr;
	enum rds_ib_fr_state	fr_state;
	bool			fr_inv;
	wait_queue_head_t	fr_inv_done;
	bool			fr_reg;
	wait_queue_head_t	fr_reg_done;
	struct ib_send_wr	fr_wr;
	unsigned int		dma_npages;
	unsigned int		sg_byte_len;
@@ -97,6 +100,7 @@ struct rds_ib_mr_pool {
	struct llist_head	free_list;	/* unused MRs */
	struct llist_head	clean_list;	/* unused & unmapped MRs */
	wait_queue_head_t	flush_wait;
	spinlock_t		clean_lock;	/* "clean_list" concurrency */

	atomic_t		free_pinned;	/* memory pinned by free MRs */
	unsigned long		max_items;
+20 −40
Original line number Diff line number Diff line
@@ -40,9 +40,6 @@

struct workqueue_struct *rds_ib_mr_wq;

static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0

static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
	struct rds_ib_device *rds_ibdev;
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
	struct rds_ib_mr *ibmr = NULL;
	struct llist_node *ret;
	unsigned long *flag;
	unsigned long flags;

	preempt_disable();
	flag = this_cpu_ptr(&clean_list_grace);
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
	spin_lock_irqsave(&pool->clean_lock, flags);
	ret = llist_del_first(&pool->clean_list);
	spin_unlock_irqrestore(&pool->clean_lock, flags);
	if (ret) {
		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
		if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
	}

	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
	return ibmr;
}

/* Visit every online CPU in turn and busy-wait until that CPU's
 * CLEAN_LIST_BUSY_BIT in its per-CPU "clean_list_grace" word is clear.
 */
static inline void wait_clean_list_grace(void)
{
	unsigned long *busy;
	int cpu;

	for_each_online_cpu(cpu) {
		busy = &per_cpu(clean_list_grace, cpu);
		for (;;) {
			if (!test_bit(CLEAN_LIST_BUSY_BIT, busy))
				break;
			cpu_relax();
		}
	}
}

void rds_ib_sync_mr(void *trans_private, int direction)
{
	struct rds_ib_mr *ibmr = trans_private;
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
 * of clusters.  Each cluster has linked llist nodes of
 * MR_CLUSTER_SIZE mrs that are ready for reuse.
 */
static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
				struct list_head *list,
static void list_to_llist_nodes(struct list_head *list,
				struct llist_node **nodes_head,
				struct llist_node **nodes_tail)
{
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
	 */
	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
	if (free_all)
	if (free_all) {
		unsigned long flags;

		spin_lock_irqsave(&pool->clean_lock, flags);
		llist_append_to_list(&pool->clean_list, &unmap_list);
		spin_unlock_irqrestore(&pool->clean_lock, flags);
	}

	free_goal = rds_ib_flush_goal(pool, free_all);

@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);

	if (!list_empty(&unmap_list)) {
		/* we have to make sure that none of the things we're about
		 * to put on the clean list would race with other cpus trying
		 * to pull items off.  The llist would explode if we managed to
		 * remove something from the clean list and then add it back again
		 * while another CPU was spinning on that same item in llist_del_first.
		 *
		 * This is pretty unlikely, but just in case  wait for an llist grace period
		 * here before adding anything back into the clean list.
		 */
		wait_clean_list_grace();
		unsigned long flags;

		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
		if (ibmr_ret) {
			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
			clean_nodes = clean_nodes->next;
		}
		/* more than one entry in llist nodes */
		if (clean_nodes)
		if (clean_nodes) {
			spin_lock_irqsave(&pool->clean_lock, flags);
			llist_add_batch(clean_nodes, clean_tail,
					&pool->clean_list);

			spin_unlock_irqrestore(&pool->clean_lock, flags);
		}
	}

	atomic_sub(unpinned, &pool->free_pinned);
@@ -471,7 +450,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
			else
				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
			return ERR_PTR(-EAGAIN);
			break;
		}

		/* We do have some empty MRs. Flush them out. */
@@ -485,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
			return ibmr;
	}

	return ibmr;
	return NULL;
}

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
	init_llist_head(&pool->free_list);
	init_llist_head(&pool->drop_list);
	init_llist_head(&pool->clean_list);
	spin_lock_init(&pool->clean_lock);
	mutex_init(&pool->flush_lock);
	init_waitqueue_head(&pool->flush_wait);
	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);