Commit 40510a63 authored by Sagi Grimberg's avatar Sagi Grimberg Committed by Keith Busch
Browse files

nvme-tcp: optimize queue io_cpu assignment for multiple queue maps



Currently, queue io_cpu assignment is done sequentially for default,
read and poll queues based on queue id. This causes miss-alignment between
context of CPU initiating I/O and the I/O worker thread processing
queued requests or completions.

Change to modify queue io_cpu assignment to take into account queue
maps offset. Each queue io_cpu will start at zero for each queue map.
This essentially aligns read/poll queues to start over the same range as
default queues.

Testing performed by Mark with:
- ram device (nvmet)
- single CPU core (pinned)
- 100% 4k reads
- engine io_uring (not using sq_thread option)
- hipri flag set

Micro-benchmark results show a net gain of:
- increase of 18%-29% in IOPs
- reduction of 16%-22% in average latency
- reduction of 7%-23% in 99.99% latency

Baseline:
========
QDepth/Batch	| IOPs [k]	| Avg. Lat [us]	| 99.99% Lat [us]
-----------------------------------------------------------------
1/1 		| 32.4		| 30.11		| 50.94
32/8		| 179		| 168.20	| 371

CPU alignment:
=============
QDepth/Batch	| IOPs [k]	| Avg. Lat [us]	| 99.99% Lat [us]
-----------------------------------------------------------------
1/1 		| 38.5		|   25.18	| 39.16
32/8		| 231		|   130.75	| 343

Reported-by: default avatarMark Wunderlich <mark.wunderlich@intel.com>
Signed-off-by: default avatarSagi Grimberg <sagi@grimberg.me>
Signed-off-by: default avatarKeith Busch <kbusch@kernel.org>
parent fa059b85
Loading
Loading
Loading
Loading
+56 −6
Original line number Diff line number Diff line
@@ -1258,13 +1258,67 @@ free_icreq:
	return ret;
}

static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
{
	return nvme_tcp_queue_id(queue) == 0;
}

static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
	int qid = nvme_tcp_queue_id(queue);

	return !nvme_tcp_admin_queue(queue) &&
		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
}

static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
	int qid = nvme_tcp_queue_id(queue);

	return !nvme_tcp_admin_queue(queue) &&
		!nvme_tcp_default_queue(queue) &&
		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
			  ctrl->io_queues[HCTX_TYPE_READ];
}

static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
	int qid = nvme_tcp_queue_id(queue);

	return !nvme_tcp_admin_queue(queue) &&
		!nvme_tcp_default_queue(queue) &&
		!nvme_tcp_read_queue(queue) &&
		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
			  ctrl->io_queues[HCTX_TYPE_READ] +
			  ctrl->io_queues[HCTX_TYPE_POLL];
}

static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
	int qid = nvme_tcp_queue_id(queue);
	int n = 0;

	if (nvme_tcp_default_queue(queue))
		n = qid - 1;
	else if (nvme_tcp_read_queue(queue))
		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
	else if (nvme_tcp_poll_queue(queue))
		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
				ctrl->io_queues[HCTX_TYPE_READ] - 1;
	queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
}

static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
		int qid, size_t queue_size)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
	int ret, opt, rcv_pdu_size, n;
	int ret, opt, rcv_pdu_size;

	queue->ctrl = ctrl;
	INIT_LIST_HEAD(&queue->send_list);
@@ -1343,11 +1397,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
	}

	queue->sock->sk->sk_allocation = GFP_ATOMIC;
	if (!qid)
		n = 0;
	else
		n = (qid - 1) % num_online_cpus();
	queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
	nvme_tcp_set_queue_io_cpu(queue);
	queue->request = NULL;
	queue->data_remaining = 0;
	queue->ddgst_remaining = 0;