Commit 614f3c96 authored by Chuck Lever's avatar Chuck Lever Committed by Anna Schumaker
Browse files

xprtrdma: Pull up sometimes



On some platforms, DMA mapping part of a page is more costly than
copying bytes. Restore the pull-up code and use that when we
think it's going to be faster. The heuristic for now is to pull-up
when the size of the RPC message body fits in the buffer underlying
the head iovec.

Indeed, not involving the I/O MMU can help the RPC/RDMA transport
scale better for tiny I/Os across more RDMA devices. This is because
interaction with the I/O MMU is eliminated, as is handling a Send
completion, for each of these small I/Os. Without the explicit
unmapping, the NIC no longer needs to do a costly internal TLB shoot
down for buffers that are just a handful of bytes.

Signed-off-by: default avatarChuck Lever <chuck.lever@oracle.com>
Signed-off-by: default avatarAnna Schumaker <Anna.Schumaker@Netapp.com>
parent d6764bbd
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -532,6 +532,8 @@ DEFINE_WRCH_EVENT(write);
DEFINE_WRCH_EVENT(reply);

TRACE_DEFINE_ENUM(rpcrdma_noch);
TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
TRACE_DEFINE_ENUM(rpcrdma_noch_mapped);
TRACE_DEFINE_ENUM(rpcrdma_readch);
TRACE_DEFINE_ENUM(rpcrdma_areadch);
TRACE_DEFINE_ENUM(rpcrdma_writech);
@@ -540,6 +542,8 @@ TRACE_DEFINE_ENUM(rpcrdma_replych);
#define xprtrdma_show_chunktype(x)					\
		__print_symbolic(x,					\
				{ rpcrdma_noch, "inline" },		\
				{ rpcrdma_noch_pullup, "pullup" },	\
				{ rpcrdma_noch_mapped, "mapped" },	\
				{ rpcrdma_readch, "read list" },	\
				{ rpcrdma_areadch, "*read list" },	\
				{ rpcrdma_writech, "write list" },	\
+1 −1
Original line number Diff line number Diff line
@@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
	*p = xdr_zero;

	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
				      &rqst->rq_snd_buf, rpcrdma_noch))
				      &rqst->rq_snd_buf, rpcrdma_noch_pullup))
		return -EIO;

	trace_xprtrdma_cb_reply(rqst);
+77 −5
Original line number Diff line number Diff line
@@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
	unsigned int pos;
	int nsegs;

	if (rtype == rpcrdma_noch)
	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
		goto done;

	pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -691,6 +691,72 @@ out_mapping_err:
	return false;
}

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned char *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len + xdr->page_len;
	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
}

/* Copy pagelist content into the head buffer.
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned int len, page_base, remaining;
	struct page **ppages;
	unsigned char *src, *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len;
	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		src = page_address(*ppages);
		src += page_base;
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		memcpy(dst, src, len);
		r_xprt->rx_stats.pullup_copy_count += len;

		ppages++;
		dst += len;
		remaining -= len;
		page_base = 0;
	}
}

/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	if (unlikely(xdr->tail[0].iov_len))
		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);

	if (unlikely(xdr->page_len))
		rpcrdma_pullup_pagelist(r_xprt, req, xdr);

	/* The whole RPC message resides in the head iovec now */
	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
}

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
@@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
		goto out_unmap;

	switch (rtype) {
	case rpcrdma_noch:
	case rpcrdma_noch_pullup:
		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_noch_mapped:
		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
			goto out_unmap;
		break;
@@ -827,6 +897,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct xdr_stream *xdr = &req->rl_stream;
	enum rpcrdma_chunktype rtype, wtype;
	struct xdr_buf *buf = &rqst->rq_snd_buf;
	bool ddp_allowed;
	__be32 *p;
	int ret;
@@ -884,8 +955,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
	 */
	if (rpcrdma_args_inline(r_xprt, rqst)) {
		*p++ = rdma_msg;
		rtype = rpcrdma_noch;
	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
		*p++ = rdma_msg;
		rtype = rpcrdma_readch;
	} else {
@@ -927,7 +999,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
		goto out_err;

	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
					&rqst->rq_snd_buf, rtype);
					buf, rtype);
	if (ret)
		goto out_err;

+1 −1
Original line number Diff line number Diff line
@@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;

		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
					 GFP_KERNEL);
		if (!req)
			goto out;
+2 −0
Original line number Diff line number Diff line
@@ -554,6 +554,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);

enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_noch_pullup,
	rpcrdma_noch_mapped,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,