Commit 1b4a51b6 authored by Pavel Begunkov, committed by Jens Axboe

io_uring: drain next sqe instead of shadowing

There's an issue with the shadow drain logic in that we drop the
completion lock after deciding to defer a request, then re-grab it later
and assume that the state is still the same. In the meantime, someone
else completing a request could have found and issued it. This can cause
a stall in the queue, by having a shadow request inserted that nobody is
going to drain.

Additionally, if we fail to allocate the shadow request, we simply
ignore the drain.

Instead of using a shadow request, defer the next request/link.
This also has the following advantages:

- removes semi-duplicated code
- doesn't allocate memory for shadows
- works better if only the head is marked for drain
- doesn't need complex synchronisation

On the flip side, it removes the shadow->seq ==
last_drain_in_link->seq optimization. That shouldn't be a common
case, and it can always be added back, if needed.

Fixes: 4fe2c963 ("io_uring: add support for link with drain")
Cc: Jackie Liu <liuyun01@kylinos.cn>
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent b76da70f
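
For context, the situation this patch targets can be driven from
userspace with a linked chain whose head carries IOSQE_IO_DRAIN. A
minimal sketch using liburing follows; the nop opcodes and the 8-entry
ring are arbitrary illustration choices, not part of the patch:

#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	int i, ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return 1;

	/* some earlier request the drained link must wait for */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);

	/*
	 * Head of the link, marked for drain: it waits for everything
	 * prior, and with this patch the request queued after the link
	 * inherits the drain via ctx->drain_next.
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->flags |= IOSQE_IO_DRAIN | IOSQE_IO_LINK;

	/* tail of the link */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);

	io_uring_submit(&ring);

	/* reap all three completions */
	for (i = 0; i < 3; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0)
			break;
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}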
fs/io_uring.c  +18 −68

--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,6 +186,7 @@ struct io_ring_ctx {
 		bool			compat;
 		bool			account_mem;
 		bool			cq_overflow_flushed;
+		bool			drain_next;
 
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -346,7 +347,7 @@ struct io_kiocb {
 #define REQ_F_LINK		64	/* linked sqes */
 #define REQ_F_LINK_TIMEOUT	128	/* has linked timeout */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
-#define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_DRAIN_LINK	512	/* link should be fully drained */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
@@ -620,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 	__io_commit_cqring(ctx);
 
 	while ((req = io_get_deferred_req(ctx)) != NULL) {
-		if (req->flags & REQ_F_SHADOW_DRAIN) {
-			/* Just for drain, free it. */
-			__io_free_req(req);
-			continue;
-		}
 		req->flags |= REQ_F_IO_DRAINED;
 		io_queue_async_work(req);
 	}
@@ -2973,6 +2969,12 @@ static void io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
+	if (unlikely(req->ctx->drain_next)) {
+		req->flags |= REQ_F_IO_DRAIN;
+		req->ctx->drain_next = false;
+	}
+	req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
+
 	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
@@ -2985,56 +2987,15 @@ static void io_queue_sqe(struct io_kiocb *req)
 		__io_queue_sqe(req);
 }
 
-static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static inline void io_queue_link_head(struct io_kiocb *req)
 {
-	int ret;
-	int need_submit = false;
-	struct io_ring_ctx *ctx = req->ctx;
-
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-		ret = -ECANCELED;
-		goto err;
-	}
-	if (!shadow) {
-		io_queue_sqe(req);
-		return;
-	}
-
-	/*
-	 * Mark the first IO in link list as DRAIN, let all the following
-	 * IOs enter the defer list. all IO needs to be completed before link
-	 * list.
-	 */
-	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(req);
-	if (ret) {
-		if (ret != -EIOCBQUEUED) {
-err:
-			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
-				req->flags |= REQ_F_FAIL_LINK;
-			if (shadow)
-				__io_free_req(shadow);
-			return;
-		}
-	} else {
-		/*
-		 * If ret == 0 means that all IOs in front of link io are
-		 * running done. let's queue link head.
-		 */
-		need_submit = true;
-	}
-
-	/* Insert shadow req to defer_list, blocking next IOs */
-	spin_lock_irq(&ctx->completion_lock);
-	trace_io_uring_defer(ctx, shadow, true);
-	list_add_tail(&shadow->list, &ctx->defer_list);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	if (need_submit)
-		__io_queue_sqe(req);
+		io_cqring_add_event(req, -ECANCELED);
+		io_double_put_req(req);
+	} else
+		io_queue_sqe(req);
 }
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
@@ -3072,6 +3033,9 @@ err_req:
 		struct io_kiocb *prev = *link;
 		struct io_uring_sqe *sqe_copy;
 
+		if (s->sqe->flags & IOSQE_IO_DRAIN)
+			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
 		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
 			ret = io_timeout_setup(req);
 			/* common setup allows offset being set, we don't */
@@ -3190,7 +3154,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
-	struct io_kiocb *shadow_req = NULL;
 	int i, submitted = 0;
 	bool mm_fault = false;
 
@@ -3229,18 +3192,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		sqe_flags = req->submit.sqe->flags;
 
-		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
-			if (!shadow_req) {
-				shadow_req = io_get_req(ctx, NULL);
-				if (unlikely(!shadow_req))
-					goto out;
-				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
-				refcount_dec(&shadow_req->refs);
-			}
-			shadow_req->sequence = req->submit.sequence;
-		}
-
-out:
 		req->submit.ring_file = ring_file;
 		req->submit.ring_fd = ring_fd;
 		req->submit.has_user = *mm != NULL;
@@ -3256,14 +3207,13 @@ out:
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
 		 */
 		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-			io_queue_link_head(link, shadow_req);
+			io_queue_link_head(link);
 			link = NULL;
-			shadow_req = NULL;
 		}
 	}
 
 	if (link)
-		io_queue_link_head(link, shadow_req);
+		io_queue_link_head(link);
 	if (statep)
 		io_submit_state_end(&state);
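
Assembled from the hunks above, the post-patch drain path reads as
follows; the elided tails are unchanged from the diff:

/*
 * A drain-linked head arms ctx->drain_next; whatever request is
 * submitted next inherits REQ_F_IO_DRAIN, instead of a shadow request
 * being inserted into the defer list.
 */
static void io_queue_sqe(struct io_kiocb *req)
{
	int ret;

	if (unlikely(req->ctx->drain_next)) {
		req->flags |= REQ_F_IO_DRAIN;
		req->ctx->drain_next = false;
	}
	req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);

	ret = io_req_defer(req);
	/* ... remainder unchanged ... */
}

/* With the shadow logic gone, queueing a link head becomes trivial. */
static inline void io_queue_link_head(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
		io_cqring_add_event(req, -ECANCELED);
		io_double_put_req(req);
	} else
		io_queue_sqe(req);
}

Because drain_next is only read and written under SQE submission,
no extra locking is needed, which is what the commit message means by
"doesn't need complex synchronisation".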