Commit 2eafa174, authored by Hans Westgaard Ry, committed by Leon Romanovsky
Browse files

net/rds: Handle ODP mr registration/unregistration



On-Demand-Paging MRs are registered using ib_reg_user_mr and
unregistered with ib_dereg_mr.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
parent c4c86abb
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device)
	has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
		   device->ops.map_phys_fmr && device->ops.unmap_fmr);
	rds_ibdev->use_fastreg = (has_fr && !has_fmr);
	rds_ibdev->odp_capable =
		!!(device->attrs.device_cap_flags &
		   IB_DEVICE_ON_DEMAND_PAGING) &&
		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
		   IB_ODP_SUPPORT_WRITE) &&
		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
		   IB_ODP_SUPPORT_READ);

	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
+2 −1
Original line number Diff line number Diff line
@@ -247,7 +247,8 @@ struct rds_ib_device {
	struct ib_device	*dev;
	struct ib_pd		*pd;
	struct dma_pool		*rid_hdrs_pool; /* RDS headers DMA pool */
	bool                    use_fastreg;
	u8			use_fastreg:1;
	u8			odp_capable:1;

	unsigned int		max_mrs;
	struct rds_ib_mr_pool	*mr_1m_pool;
+6 −1
Original line number Diff line number Diff line
@@ -67,6 +67,7 @@ struct rds_ib_frmr {

/* This is stored as mr->r_trans_private. */
struct rds_ib_mr {
	struct delayed_work		work;
	struct rds_ib_device		*device;
	struct rds_ib_mr_pool		*pool;
	struct rds_ib_connection	*ic;
@@ -81,9 +82,11 @@ struct rds_ib_mr {
	unsigned int			sg_len;
	int				sg_dma_len;

	u8				odp:1;
	union {
		struct rds_ib_fmr	fmr;
		struct rds_ib_frmr	frmr;
		struct ib_mr		*mr;
	} u;
};

@@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		    struct rds_sock *rs, u32 *key_ret,
		    struct rds_connection *conn);
		    struct rds_connection *conn, u64 start, u64 length,
		    int need_odp);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_mr_init(void);
void rds_ib_mr_exit(void);
u32 rds_ib_get_lkey(void *trans_private);

void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);
+74 −1
Original line number Diff line number Diff line
@@ -37,8 +37,15 @@

#include "rds_single_path.h"
#include "ib_mr.h"
#include "rds.h"

struct workqueue_struct *rds_ib_mr_wq;
/* Pairs a work item with a pointer to an ib_mr.
 *
 * NOTE(review): nothing in the visible part of this change references this
 * struct; presumably it was meant for deferring ib_dereg_mr() to a
 * workqueue -- confirm whether it is still needed.
 */
struct rds_ib_dereg_odp_mr {
	struct work_struct work;	/* work item to be queued */
	struct ib_mr *mr;		/* the MR associated with the work item */
};

static void rds_ib_odp_mr_worker(struct work_struct *work);

static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
@@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction)
	struct rds_ib_mr *ibmr = trans_private;
	struct rds_ib_device *rds_ibdev = ibmr->device;

	if (ibmr->odp)
		return;

	switch (direction) {
	case DMA_FROM_DEVICE:
		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate)

	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

	if (ibmr->odp) {
		/* An MR created and marked as use_once. We use delayed work
		 * because there is a chance that we are in interrupt context
		 * and can't call ib_dereg_mr() directly.
		 */
		INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
		queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
		return;
	}

	/* Return it to the pool's free list */
	if (rds_ibdev->use_fastreg)
		rds_ib_free_frmr_list(ibmr);
@@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void)
	up_read(&rds_ib_devices_lock);
}

u32 rds_ib_get_lkey(void *trans_private)
{
	struct rds_ib_mr *ibmr = trans_private;

	return ibmr->u.mr->lkey;
}

void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		    struct rds_sock *rs, u32 *key_ret,
		    struct rds_connection *conn)
		    struct rds_connection *conn,
		    u64 start, u64 length, int need_odp)
{
	struct rds_ib_device *rds_ibdev;
	struct rds_ib_mr *ibmr = NULL;
@@ -541,6 +569,42 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		goto out;
	}

	if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
		u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
		int access_flags =
			(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
			 IB_ACCESS_ON_DEMAND);
		struct ib_mr *ib_mr;

		if (!rds_ibdev->odp_capable) {
			ret = -EOPNOTSUPP;
			goto out;
		}

		ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
				       access_flags);

		if (IS_ERR(ib_mr)) {
			rdsdebug("rds_ib_get_user_mr returned %d\n",
				 IS_ERR(ib_mr));
			ret = PTR_ERR(ib_mr);
			goto out;
		}
		if (key_ret)
			*key_ret = ib_mr->rkey;

		ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
		if (!ibmr) {
			ib_dereg_mr(ib_mr);
			ret = -ENOMEM;
			goto out;
		}
		ibmr->u.mr = ib_mr;
		ibmr->odp = 1;
		return ibmr;
	}

	if (conn)
		ic = conn->c_transport_data;

@@ -629,3 +693,12 @@ void rds_ib_mr_exit(void)
{
	destroy_workqueue(rds_ib_mr_wq);
}

/* Work handler that tears down an MR whose ->odp flag is set.
 *
 * @work: the work_struct embedded in the delayed_work member of a
 *        struct rds_ib_mr (queued by rds_ib_free_mr()).
 *
 * Deregisters the ib_mr stored in u.mr and then frees the rds_ib_mr
 * itself. The return value of ib_dereg_mr() is not checked.
 */
static void rds_ib_odp_mr_worker(struct work_struct *work)
{
	struct rds_ib_mr *odp_mr =
		container_of(work, struct rds_ib_mr, work.work);

	ib_dereg_mr(odp_mr->u.mr);
	kfree(odp_mr);
}
+31 −13
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"

/*
 * Convert IB-specific error message to RDS error message and call core
@@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];

		send->s_sge[0].length = sizeof(struct rds_header);
		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;

		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
		       sizeof(struct rds_header));
@@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
			send->s_sge[1].addr = sg_dma_address(scat);
			send->s_sge[1].addr += rm->data.op_dmaoff;
			send->s_sge[1].length = len;
			send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;

			bytes_sent += len;
			rm->data.op_dmaoff += len;
@@ -858,21 +861,30 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
	int ret;
	int num_sge;
	int nr_sig = 0;
	u64 odp_addr = op->op_odp_addr;
	u32 odp_lkey = 0;

	/* map the op the first time we see it */
	if (!op->op_odp_mr) {
		if (!op->op_mapped) {
		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
					     op->op_sg, op->op_nents, (op->op_write) ?
					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
			op->op_count =
				ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
					      op->op_nents,
					      (op->op_write) ? DMA_TO_DEVICE :
							       DMA_FROM_DEVICE);
			rdsdebug("ic %p mapping op %p: %d\n", ic, op,
				 op->op_count);
			if (op->op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}

			op->op_mapped = 1;
		}
	} else {
		op->op_count = op->op_nents;
		odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
	}

	/*
	 * Instead of knowing how to return a partial rdma read/write we insist that there
@@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
		     scat != &op->op_sg[op->op_count]; j++) {
			len = sg_dma_len(scat);
			if (!op->op_odp_mr) {
				send->s_sge[j].addr = sg_dma_address(scat);
			send->s_sge[j].length = len;
				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
			} else {
				send->s_sge[j].addr = odp_addr;
				send->s_sge[j].lkey = odp_lkey;
			}
			send->s_sge[j].length = len;

			sent += len;
			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);

			remote_addr += len;
			odp_addr += len;
			scat++;
		}

Loading