Commit 74dea5d9 authored by Linus Torvalds

Merge tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - Fix for a race with IOPOLL used with SQPOLL (Xiaoguang)

 - Only show ->fdinfo if procfs is enabled (Tobias)

 - Fix for a chain with multiple personalities in the SQEs

 - Fix for a missing free of personality idr on exit

 - Removal of the spin-for-work optimization

 - Fix for next work lookup on request completion

 - Fix for non-vec read/write result propagation in case of links

 - Fix for fileset references on switch

 - Fix for recvmsg/sendmsg 32-bit compatibility mode (see the sketch below)
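
A note on the compat fix, since it is the subtlest of the lot: the kernel's
sendmsg/recvmsg core decides between the native and 32-bit msghdr/iovec
layouts from the MSG_CMSG_COMPAT flag rather than from the calling task, and
an io_uring request may in any case execute later on a worker thread, so
compat-ness has to be latched at prep time from the ring's ctx->compat. A
minimal sketch of that capture-then-replay idea (struct and function names
here are illustrative, not the kernel's; only the flag value is the
kernel-internal one):

	#include <stdbool.h>

	#define MSG_CMSG_COMPAT 0x80000000	/* kernel-internal flag value */

	struct ring_ctx { bool compat; };		/* fixed at ring setup     */
	struct msg_req  { unsigned int msg_flags; };	/* per-request state       */

	/* Prep runs in the submitting task: latch compat-ness into the request. */
	static void prep_msg(struct msg_req *req, const struct ring_ctx *ctx)
	{
		if (ctx->compat)
			req->msg_flags |= MSG_CMSG_COMPAT;
	}

	/* Issue may run later from a worker thread; it trusts only req state. */
	static int cmsg_abi_bits(const struct msg_req *req)
	{
		return (req->msg_flags & MSG_CMSG_COMPAT) ? 32 : 64;
	}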

* tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block:
  io_uring: fix 32-bit compatability with sendmsg/recvmsg
  io_uring: define and set show_fdinfo only if procfs is enabled
  io_uring: drop file set ref put/get on switch
  io_uring: import_single_range() returns 0/-ERROR
  io_uring: pick up link work on submit reference drop
  io-wq: ensure work->task_pid is cleared on init
  io-wq: remove spin-for-work optimization
  io_uring: fix poll_list race for SETUP_IOPOLL|SETUP_SQPOLL
  io_uring: fix personality idr leak
  io_uring: handle multiple personalities in link chains
parents c60c0402 d8768362
fs/io-wq.c  +0 −19
@@ -535,42 +535,23 @@ next:
 	} while (1);
 }
 
-static inline void io_worker_spin_for_work(struct io_wqe *wqe)
-{
-	int i = 0;
-
-	while (++i < 1000) {
-		if (io_wqe_run_queue(wqe))
-			break;
-		if (need_resched())
-			break;
-		cpu_relax();
-	}
-}
-
 static int io_wqe_worker(void *data)
 {
 	struct io_worker *worker = data;
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
-	bool did_work;
 
 	io_worker_start(wqe, worker);
 
-	did_work = false;
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 loop:
-		if (did_work)
-			io_worker_spin_for_work(wqe);
 		spin_lock_irq(&wqe->lock);
 		if (io_wqe_run_queue(wqe)) {
 			__set_current_state(TASK_RUNNING);
 			io_worker_handle_work(worker);
-			did_work = true;
 			goto loop;
 		}
-		did_work = false;
 		/* drops the lock on success, retry */
 		if (__io_worker_idle(wqe, worker)) {
 			__release(&wqe->lock);
fs/io-wq.h  +4 −10
@@ -81,13 +81,7 @@ struct io_wq_work {
 
 #define INIT_IO_WORK(work, _func)				\
 	do {							\
-		(work)->list.next = NULL;		\
-		(work)->func = _func;			\
-		(work)->files = NULL;			\
-		(work)->mm = NULL;			\
-		(work)->creds = NULL;			\
-		(work)->fs = NULL;			\
-		(work)->flags = 0;			\
+		*(work) = (struct io_wq_work){ .func = _func };	\
 	} while (0)						\
 
 typedef void (get_work_fn)(struct io_wq_work *);
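
The INIT_IO_WORK rewrite above trades seven field assignments for one
compound-literal assignment, which also zeroes every member that is not named
explicitly, so fields added to struct io_wq_work later (task_pid being the
motivating case, per the changelog) can never be left stale. A small
self-contained illustration of the C idiom (the struct is invented for the
example):

	#include <assert.h>

	struct work {
		void	(*func)(struct work *);
		int	task_pid;
		void	*mm;
	};

	static void do_work(struct work *w) { (void)w; }

	int main(void)
	{
		struct work w = { .task_pid = 1234, .mm = &w };	/* stale state */

		/* Compound-literal assignment: all unnamed members become zero. */
		w = (struct work){ .func = do_work };

		assert(w.func == do_work && w.task_pid == 0 && w.mm == 0);
		return 0;
	}
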
fs/io_uring.c  +70 −62
@@ -183,17 +183,12 @@ struct fixed_file_table {
 	struct file		**files;
 };
 
-enum {
-	FFD_F_ATOMIC,
-};
-
 struct fixed_file_data {
 	struct fixed_file_table		*table;
 	struct io_ring_ctx		*ctx;
 
 	struct percpu_ref		refs;
 	struct llist_head		put_llist;
-	unsigned long			state;
 	struct work_struct		ref_work;
 	struct completion		done;
 };
@@ -1483,11 +1478,11 @@ static void io_free_req(struct io_kiocb *req)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	if (refcount_dec_and_test(&req->refs)) {
-		io_req_find_next(req, nxtptr);
+	io_req_find_next(req, nxtptr);
+
+	if (refcount_dec_and_test(&req->refs))
 		__io_free_req(req);
-	}
 }
 
 static void io_put_req(struct io_kiocb *req)
 {
@@ -1821,6 +1816,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add(&req->list, &ctx->poll_list);
 	else
 		list_add_tail(&req->list, &ctx->poll_list);
+
+	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+	    wq_has_sleeper(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
@@ -2071,7 +2070,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		ssize_t ret;
 		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
 		*iovec = NULL;
-		return ret;
+		return ret < 0 ? ret : sqe_len;
 	}
 
 	if (req->io) {
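
The one-liner above exists because import_single_range() follows the
0-on-success/-ERRNO convention rather than returning a byte count, while the
link logic downstream wants "bytes imported" so it can propagate the result
of a non-vectored read/write to the next request in the chain. A hedged
sketch of bridging the two conventions (toy function names, not the
kernel's):

	#include <errno.h>
	#include <stddef.h>
	#include <sys/types.h>

	/* Mimics the import_single_range() contract: 0 or -ERRNO. */
	static int import_range(const void *buf, size_t len)
	{
		(void)len;
		return buf ? 0 : -EFAULT;
	}

	/* Callers that need a byte count must supply it on success. */
	static ssize_t import_for_link(const void *buf, size_t len)
	{
		int ret = import_range(buf, len);

		return ret < 0 ? ret : (ssize_t)len;
	}
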
@@ -3002,6 +3001,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_SEND)
 		return 0;
 	/* iovec is already imported */
@@ -3154,6 +3158,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_RECV)
 		return 0;
 	/* iovec is already imported */
@@ -4705,11 +4714,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt = NULL;
+	const struct cred *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
+	if (req->work.creds && req->work.creds != current_cred()) {
+		if (old_creds)
+			revert_creds(old_creds);
+		if (old_creds == req->work.creds)
+			old_creds = NULL; /* restored original creds */
+		else
+			old_creds = override_creds(req->work.creds);
+	}
+
 	ret = io_issue_sqe(req, sqe, &nxt, true);
 
 	/*
@@ -4735,7 +4754,7 @@ punt:
 
 err:
 	/* drop submission reference */
-	io_put_req(req);
+	io_put_req_find_next(req, &nxt);
 
 	if (linked_timeout) {
 		if (!ret)
@@ -4759,6 +4778,8 @@ done_req:
 			goto punt;
 		goto again;
 	}
+	if (old_creds)
+		revert_creds(old_creds);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
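
The old_creds bookkeeping added above is subtle because __io_queue_sqe()
walks a link chain in a loop (via "goto again"), and each link may carry its
own personality in req->work.creds: the previous override must be reverted
before the next one is installed, and the special case where the saved
old_creds equal the next link's creds means the task is back on its original
credentials. A runnable toy model of that state machine, with
override_creds()/revert_creds() replaced by stand-ins:

	#include <stdio.h>

	static const char task_cred[]   = "task";	/* submitter's own creds */
	static const char admin_cred[]  = "admin";
	static const char backup_cred[] = "backup";

	static const char *current_creds = task_cred;

	static const char *override(const char *new)
	{
		const char *old = current_creds;

		current_creds = new;
		return old;
	}

	static void revert(const char *old) { current_creds = old; }

	int main(void)
	{
		/* per-link personality, as set from sqe->personality */
		const char *chain[] = { admin_cred, task_cred, backup_cred };
		const char *old_creds = NULL;

		for (int i = 0; i < 3; i++) {
			if (chain[i] && chain[i] != current_creds) {
				if (old_creds)
					revert(old_creds);
				if (old_creds == chain[i])
					old_creds = NULL; /* restored original creds */
				else
					old_creds = override(chain[i]);
			}
			printf("link %d issued as %s\n", i, current_creds);
		}
		if (old_creds)
			revert(old_creds);	/* final cleanup, as at function exit */
		return 0;
	}

Running it prints admin, task, backup for the three links, with the task's
original credentials restored at the end.
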
@@ -4803,7 +4824,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
-	const struct cred *old_creds = NULL;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned int sqe_flags;
 	int ret, id;
@@ -4818,14 +4838,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		const struct cred *personality_creds;
-
-		personality_creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!personality_creds)) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds)) {
 			ret = -EINVAL;
 			goto err_req;
 		}
-		old_creds = override_creds(personality_creds);
+		get_cred(req->work.creds);
 	}
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4837,8 +4855,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		if (old_creds)
-			revert_creds(old_creds);
 		return false;
 	}
 
@@ -4899,8 +4915,6 @@ err_req:
 		}
 	}
 
-	if (old_creds)
-		revert_creds(old_creds);
 	return true;
 }
 
@@ -5081,9 +5095,8 @@ static int io_sq_thread(void *data)
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
-	unsigned inflight;
 	unsigned long timeout;
-	int ret;
+	int ret = 0;
 
 	complete(&ctx->completions[1]);
 
@@ -5091,39 +5104,19 @@ static int io_sq_thread(void *data)
 	set_fs(USER_DS);
 	old_cred = override_creds(ctx->creds);
 
-	ret = timeout = inflight = 0;
+	timeout = jiffies + ctx->sq_thread_idle;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
 
-		if (inflight) {
+		if (!list_empty(&ctx->poll_list)) {
 			unsigned nr_events = 0;
 
-			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * inflight is the count of the maximum possible
-				 * entries we submitted, but it can be smaller
-				 * if we dropped some of them. If we don't have
-				 * poll entries available, then we know that we
-				 * have nothing left to poll for. Reset the
-				 * inflight count to zero in that case.
-				 */
-				mutex_lock(&ctx->uring_lock);
-				if (!list_empty(&ctx->poll_list))
-					io_iopoll_getevents(ctx, &nr_events, 0);
-				else
-					inflight = 0;
-				mutex_unlock(&ctx->uring_lock);
-			} else {
-				/*
-				 * Normal IO, just pretend everything completed.
-				 * We don't have to poll completions for that.
-				 */
-				nr_events = inflight;
-			}
-
-			inflight -= nr_events;
-			if (!inflight)
-				timeout = jiffies + ctx->sq_thread_idle;
+			mutex_lock(&ctx->uring_lock);
+			if (!list_empty(&ctx->poll_list))
+				io_iopoll_getevents(ctx, &nr_events, 0);
+			mutex_unlock(&ctx->uring_lock);
 		}
 
 		to_submit = io_sqring_entries(ctx);
@@ -5152,7 +5145,7 @@ static int io_sq_thread(void *data)
 			 * more IO, we should wait for the application to
 			 * reap events and wake us up.
 			 */
-			if (inflight ||
+			if (!list_empty(&ctx->poll_list) ||
 			    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 			    !percpu_ref_is_dying(&ctx->refs))) {
 				cond_resched();
@@ -5162,6 +5155,19 @@ static int io_sq_thread(void *data)
 			prepare_to_wait(&ctx->sqo_wait, &wait,
 						TASK_INTERRUPTIBLE);
 
+			/*
+			 * While doing polled IO, before going to sleep, we need
+			 * to check if there are new reqs added to poll_list, it
+			 * is because reqs may have been punted to io worker and
+			 * will be added to poll_list later, hence check the
+			 * poll_list again.
+			 */
+			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+			    !list_empty_careful(&ctx->poll_list)) {
+				finish_wait(&ctx->sqo_wait, &wait);
+				continue;
+			}
+
 			/* Tell userspace we may need a wakeup call */
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 			/* make sure to read SQ tail after writing flags */
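
The re-check added above (paired with the new wq_has_sleeper()/wake_up() in
io_iopoll_req_issued() earlier in this diff) is the classic defense against a
missed wakeup: the waiter must advertise itself as sleeping before testing
the condition one last time, so a producer that queues work in between is
guaranteed either to be seen by the re-check or to see the sleeper and wake
it. A runnable userspace analogue of that ordering, with C11 atomics and a
semaphore standing in for prepare_to_wait() and ctx->sqo_wait (build with
-pthread):

	#include <pthread.h>
	#include <semaphore.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int queued;	/* stands in for ctx->poll_list    */
	static atomic_int sleeping;	/* the advertised waiter           */
	static sem_t wakeup;		/* stands in for ctx->sqo_wait     */

	static void *producer(void *arg)
	{
		(void)arg;
		atomic_store(&queued, 1);	/* add req to poll_list    */
		if (atomic_load(&sleeping))	/* wq_has_sleeper()        */
			sem_post(&wakeup);	/* wake_up(&ctx->sqo_wait) */
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		sem_init(&wakeup, 0, 0);
		pthread_create(&t, NULL, producer, NULL);

		atomic_store(&sleeping, 1);	/* prepare_to_wait()       */
		if (!atomic_load(&queued))	/* the re-check in the fix */
			sem_wait(&wakeup);	/* schedule()              */
		atomic_store(&sleeping, 0);	/* finish_wait()           */

		pthread_join(t, NULL);
		printf("woke with work queued: %d\n", atomic_load(&queued));
		return 0;
	}

Dropping the re-check recreates the bug: if the producer runs to completion
before "sleeping" is set, no post is ever issued and the waiter sleeps on
work that is already queued.
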
@@ -5189,8 +5195,7 @@ static int io_sq_thread(void *data)
 		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
 		mutex_unlock(&ctx->uring_lock);
-		if (ret > 0)
-			inflight += ret;
+		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
 	set_fs(old_fs);
@@ -5595,7 +5600,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
 
 	data = container_of(work, struct fixed_file_data, ref_work);
 	io_ring_file_ref_flush(data);
-	percpu_ref_get(&data->refs);
 	percpu_ref_switch_to_percpu(&data->refs);
 }
 
@@ -5771,8 +5775,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
 {
 	struct fixed_file_data *data;
 
+	/*
+	 * Juggle reference to ensure we hit zero, if needed, so we can
+	 * switch back to percpu mode
+	 */
 	data = container_of(ref, struct fixed_file_data, refs);
-	clear_bit(FFD_F_ATOMIC, &data->state);
+	percpu_ref_put(&data->refs);
+	percpu_ref_get(&data->refs);
 }
 
 static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5795,11 +5804,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
 	llist_add(&pfile->llist, &data->put_llist);
 
 	if (pfile == &pfile_stack) {
-		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-			percpu_ref_put(&data->refs);
-			percpu_ref_switch_to_atomic(&data->refs,
-							io_atomic_switch);
-		}
+		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
 		wait_for_completion(&done);
 		flush_work(&data->ref_work);
 		return false;
@@ -5873,10 +5878,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		up->offset++;
 	}
 
-	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-		percpu_ref_put(&data->refs);
+	if (ref_switch)
 		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-	}
 
 	return done ? done : err;
 }
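
Both callers above now switch to atomic mode unconditionally and leave the
reference juggling to io_atomic_switch(): a percpu_ref in percpu mode cannot
observe its counter reaching zero, so (per the new comment) the base
reference is dropped while atomic to let in-flight users drain, then re-taken
before the work item flips the ref back to percpu mode. A toy model of just
the base-reference juggle, with a plain C11 atomic in place of the percpu
machinery:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_long refs = 1;	/* base reference taken at init */

	static void get_ref(void) { atomic_fetch_add(&refs, 1); }

	static void put_ref(void)
	{
		if (atomic_fetch_sub(&refs, 1) == 1)
			printf("hit zero: users drained, switch can complete\n");
	}

	int main(void)
	{
		get_ref();	/* an in-flight user of the file table     */

		put_ref();	/* drop the base ref (quiesce)             */
		put_ref();	/* last user goes away -> zero is observed */

		get_ref();	/* re-take the base ref for percpu mode    */
		return 0;
	}
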
@@ -6334,6 +6337,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
+	idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock) {
@@ -6647,6 +6651,7 @@ out_fput:
 	return submitted ? submitted : ret;
 }
 
+#ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
 	const struct cred *cred = p;
@@ -6720,6 +6725,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		percpu_ref_put(&ctx->refs);
 	}
 }
+#endif
 
 static const struct file_operations io_uring_fops = {
 	.release	= io_uring_release,
@@ -6731,7 +6737,9 @@ static const struct file_operations io_uring_fops = {
 #endif
 	.poll		= io_uring_poll,
 	.fasync		= io_uring_fasync,
+#ifdef CONFIG_PROC_FS
 	.show_fdinfo	= io_uring_show_fdinfo,
+#endif
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,