Commit 896f8d23 authored by Linus Torvalds

Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Support for various new opcodes (fallocate, openat, close, statx,
   fadvise, madvise, openat2, non-vectored read/write, send/recv, and
   epoll_ctl)

 - Faster ring quiesce for fileset updates

 - Optimizations for overflow condition checking

 - Support for max-sized clamping

 - Support for probing what opcodes are supported (see the probe sketch
   after this list)

 - Support for io-wq backend sharing between "sibling" rings

 - Support for registering personalities

 - Lots of little fixes and improvements
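
As a minimal sketch of the probe interface, assuming a 5.6+ kernel and its
<linux/io_uring.h> (raw syscalls, error handling trimmed), userspace can ask
which opcodes the running kernel supports via IORING_REGISTER_PROBE:

/* hedged sketch, not part of this pull: query per-opcode support */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring_probe *probe;
	int i, fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 1, &p);
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	/* room for up to 256 opcode descriptors; the kernel rejects a
	 * probe buffer that is not zeroed, hence calloc() */
	probe = calloc(1, sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op));
	if (syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE,
		    probe, 256) < 0) {
		perror("IORING_REGISTER_PROBE");
		return 1;
	}

	for (i = 0; i < probe->ops_len; i++)
		printf("op %3d: %ssupported\n", probe->ops[i].op,
		       (probe->ops[i].flags & IO_URING_OP_SUPPORTED) ? "" : "not ");

	free(probe);
	close(fd);
	return 0;
}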

* tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: add support for epoll_ctl(2)
  eventpoll: support non-blocking do_epoll_ctl() calls
  eventpoll: abstract out epoll_ctl() handler
  io_uring: fix linked command file table usage
  io_uring: support using a registered personality for commands
  io_uring: allow registering credentials
  io_uring: add io-wq workqueue sharing
  io-wq: allow grabbing existing io-wq
  io_uring/io-wq: don't use static creds/mm assignments
  io-wq: make the io_wq ref counted
  io_uring: fix refcounting with batched allocations at OOM
  io_uring: add comment for drain_next
  io_uring: don't attempt to copy iovec for READ/WRITE
  io_uring: honor IOSQE_ASYNC for linked reqs
  io_uring: prep req when do IOSQE_ASYNC
  io_uring: use labeled array init in io_op_defs
  io_uring: optimise sqe-to-req flags translation
  io_uring: remove REQ_F_IO_DRAINED
  io_uring: file switch work needs to get flushed on exit
  io_uring: hide uring_fd in ctx
  ...
parents 33c84e89 3e4827b0
drivers/android/binder.c (+4 −2)
@@ -2249,11 +2249,13 @@ static void binder_deferred_fd_close(int fd)
 		return;
 	init_task_work(&twcb->twork, binder_do_fd_close);
 	__close_fd_get_file(fd, &twcb->file);
-	if (twcb->file)
+	if (twcb->file) {
+		filp_close(twcb->file, current->files);
 		task_work_add(current, &twcb->twork, true);
-	else
+	} else {
 		kfree(twcb);
+	}
 }
 
 static void binder_transaction_buffer_release(struct binder_proc *proc,
 					      struct binder_buffer *buffer,
fs/eventpoll.c (+56 −31)
@@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 	return container_of(p, struct ep_pqueue, pt)->epi;
 }
 
-/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_has_event(int op)
-{
-	return op != EPOLL_CTL_DEL;
-}
-
 /* Initialize the poll safe wake up structure */
 static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
@@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 	return do_epoll_create(0);
 }
 
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.
- */
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
-		struct epoll_event __user *, event)
+static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
+				   bool nonblock)
+{
+	if (!nonblock) {
+		mutex_lock_nested(mutex, depth);
+		return 0;
+	}
+	if (mutex_trylock(mutex))
+		return 0;
+	return -EAGAIN;
+}
+
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+		 bool nonblock)
 {
 	int error;
 	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct epoll_event epds;
 	struct eventpoll *tep = NULL;
 
-	error = -EFAULT;
-	if (ep_op_has_event(op) &&
-	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
-		goto error_return;
-
 	error = -EBADF;
 	f = fdget(epfd);
 	if (!f.file)
@@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,

 	/* Check if EPOLLWAKEUP is allowed */
 	if (ep_op_has_event(op))
-		ep_take_care_of_epollwakeup(&epds);
+		ep_take_care_of_epollwakeup(epds);
 
 	/*
 	 * We have to check that the file structure underneath the file descriptor
@@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 	 * Also, we do not currently supported nested exclusive wakeups.
 	 */
-	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
+	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
 			goto error_tgt_fput;
 		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
-				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 			goto error_tgt_fput;
 	}

@@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * deep wakeup paths from forming in parallel through multiple
 	 * EPOLL_CTL_ADD operations.
 	 */
-	mutex_lock_nested(&ep->mtx, 0);
+	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+	if (error)
+		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
 		if (!list_empty(&f.file->f_ep_links) ||
 						is_file_epoll(tf.file)) {
-			full_check = 1;
 			mutex_unlock(&ep->mtx);
-			mutex_lock(&epmutex);
+			error = epoll_mutex_lock(&epmutex, 0, nonblock);
+			if (error)
+				goto error_tgt_fput;
+			full_check = 1;
 			if (is_file_epoll(tf.file)) {
 				error = -ELOOP;
 				if (ep_loop_check(ep, tf.file) != 0) {
@@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			} else
 				list_add(&tf.file->f_tfile_llink,
 							&tfile_check_list);
-			mutex_lock_nested(&ep->mtx, 0);
+			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+			if (error) {
+out_del:
+				list_del(&tf.file->f_tfile_llink);
+				goto error_tgt_fput;
+			}
 			if (is_file_epoll(tf.file)) {
 				tep = tf.file->private_data;
-				mutex_lock_nested(&tep->mtx, 1);
+				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
+				if (error) {
+					mutex_unlock(&ep->mtx);
+					goto out_del;
+				}
 			}
 		}
 	}
@@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd, full_check);
+			epds->events |= EPOLLERR | EPOLLHUP;
+			error = ep_insert(ep, epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
 		if (full_check)
@@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_MOD:
 		if (epi) {
 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-				epds.events |= EPOLLERR | EPOLLHUP;
-				error = ep_modify(ep, epi, &epds);
+				epds->events |= EPOLLERR | EPOLLHUP;
+				error = ep_modify(ep, epi, epds);
 			}
 		} else
 			error = -ENOENT;
@@ -2231,6 +2239,23 @@ error_return:
 	return error;
 }
 
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
+{
+	struct epoll_event epds;
+
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		return -EFAULT;
+
+	return do_epoll_ctl(epfd, op, fd, &epds, false);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
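
The nonblock path above is what io_uring's new IORING_OP_EPOLL_CTL rides on:
the first attempt uses trylock semantics, and -EAGAIN punts the request to
async context for a blocking retry. A hedged userspace sketch, assuming a
liburing recent enough to provide io_uring_prep_epoll_ctl() (error handling
trimmed):

/* hedged sketch: EPOLL_CTL_ADD through io_uring instead of epoll_ctl(2) */
#include <errno.h>
#include <sys/epoll.h>
#include <liburing.h>

int epoll_add_via_uring(struct io_uring *ring, int epfd, int sockfd)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sockfd };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_epoll_ctl(sqe, epfd, sockfd, EPOLL_CTL_ADD, &ev);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* 0 on success, -errno on failure */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
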
fs/file.c (+4 −2)
@@ -642,7 +642,9 @@ out_unlock:
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
 /*
- * variant of __close_fd that gets a ref on the file for later fput
+ * variant of __close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
  */
 int __close_fd_get_file(unsigned int fd, struct file **res)
 {
@@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
 	spin_unlock(&files->file_lock);
 	get_file(file);
 	*res = file;
-	return filp_close(file, files);
+	return 0;
 
 out_unlock:
 	spin_unlock(&files->file_lock);
fs/internal.h (+8 −0)
@@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
+extern struct open_how build_open_how(int flags, umode_t mode);
+extern int build_open_flags(const struct open_how *how, struct open_flags *op);
 
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 long do_faccessat(int dfd, const char __user *filename, int mode);
@@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;

 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
+
+/*
+ * fs/stat.c:
+ */
+unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
+int cp_statx(const struct kstat *stat, struct statx __user *buffer);
fs/io-wq.c (+76 −27)
@@ -56,7 +56,8 @@ struct io_worker {

 	struct rcu_head rcu;
 	struct mm_struct *mm;
-	const struct cred *creds;
+	const struct cred *cur_creds;
+	const struct cred *saved_creds;
 	struct files_struct *restore_files;
 };

@@ -109,10 +110,10 @@ struct io_wq {

 	struct task_struct *manager;
 	struct user_struct *user;
-	const struct cred *creds;
-	struct mm_struct *mm;
 	refcount_t refs;
 	struct completion done;
+
+	refcount_t use_refs;
 };
 
 static bool io_worker_get(struct io_worker *worker)
@@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 {
 	bool dropped_lock = false;
 
-	if (worker->creds) {
-		revert_creds(worker->creds);
-		worker->creds = NULL;
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
 	}
 
 	if (current->files != worker->restore_files) {
@@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
 	return NULL;
 }
 
+static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+{
+	if (worker->mm) {
+		unuse_mm(worker->mm);
+		mmput(worker->mm);
+		worker->mm = NULL;
+	}
+	if (!work->mm) {
+		set_fs(KERNEL_DS);
+		return;
+	}
+	if (mmget_not_zero(work->mm)) {
+		use_mm(work->mm);
+		if (!worker->mm)
+			set_fs(USER_DS);
+		worker->mm = work->mm;
+		/* hang on to this mm */
+		work->mm = NULL;
+		return;
+	}
+
+	/* failed grabbing mm, ensure work gets cancelled */
+	work->flags |= IO_WQ_WORK_CANCEL;
+}
+
+static void io_wq_switch_creds(struct io_worker *worker,
+			       struct io_wq_work *work)
+{
+	const struct cred *old_creds = override_creds(work->creds);
+
+	worker->cur_creds = work->creds;
+	if (worker->saved_creds)
+		put_cred(old_creds); /* creds set by previous switch */
+	else
+		worker->saved_creds = old_creds;
+}
+
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
@@ -438,24 +476,19 @@ next:
 		if (work->flags & IO_WQ_WORK_CB)
 			work->func(&work);
 
-		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
-		    current->files != work->files) {
+		if (work->files && current->files != work->files) {
 			task_lock(current);
 			current->files = work->files;
 			task_unlock(current);
 		}
-		if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
-		    wq->mm) {
-			if (mmget_not_zero(wq->mm)) {
-				use_mm(wq->mm);
-				set_fs(USER_DS);
-				worker->mm = wq->mm;
-			} else {
-				work->flags |= IO_WQ_WORK_CANCEL;
-			}
-		}
-		if (!worker->creds)
-			worker->creds = override_creds(wq->creds);
+		if (work->mm != worker->mm)
+			io_wq_switch_mm(worker, work);
+		if (worker->cur_creds != work->creds)
+			io_wq_switch_creds(worker, work);
+		/*
+		 * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
+		 * the worker function will do the right thing.
+		 */
 		if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 			work->flags |= IO_WQ_WORK_CANCEL;
 		if (worker->mm)
@@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+	int work_flags;
 	unsigned long flags;
 
 	/*
@@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 		return;
 	}
 
+	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
 	wq_list_add_tail(&work->list, &wqe->work_list);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
-	if (!atomic_read(&acct->nr_running))
+	if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+	    !atomic_read(&acct->nr_running))
 		io_wqe_wake_worker(wqe, acct);
 }

@@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
 	 */
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
 	    data->cancel(worker->cur_work, data->caller_data)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
@@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 		return false;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	if (worker->cur_work == work) {
+	if (worker->cur_work == work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
 	}
@@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

 	/* caller must already hold a reference to this */
 	wq->user = data->user;
-	wq->creds = data->creds;
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
@@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

 	init_completion(&wq->done);
 
-	/* caller must have already done mmgrab() on this mm */
-	wq->mm = data->mm;
-
 	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
 	if (!IS_ERR(wq->manager)) {
 		wake_up_process(wq->manager);
@@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 			ret = -ENOMEM;
 			goto err;
 		}
+		refcount_set(&wq->use_refs, 1);
 		reinit_completion(&wq->done);
 		return wq;
 	}
@@ -1078,13 +1113,21 @@ err:
 	return ERR_PTR(ret);
 }
 
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
+{
+	if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+		return false;
+
+	return refcount_inc_not_zero(&wq->use_refs);
+}
+
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
 	wake_up_process(worker->task);
 	return false;
 }
 
-void io_wq_destroy(struct io_wq *wq)
+static void __io_wq_destroy(struct io_wq *wq)
 {
 	int node;
 
@@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
 	kfree(wq->wqes);
 	kfree(wq);
 }
+
+void io_wq_destroy(struct io_wq *wq)
+{
+	if (refcount_dec_and_test(&wq->use_refs))
+		__io_wq_destroy(wq);
+}
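
The use_refs counting above is the kernel half of io-wq sharing; the userspace
half is IORING_SETUP_ATTACH_WQ, where a second ring names an existing ring's
fd in params.wq_fd and, if io_wq_get() finds the backends compatible, shares
its worker pool instead of spawning another one. A hedged sketch, assuming
5.6+ headers (raw syscalls, minimal error handling):

/* hedged sketch: two "sibling" rings sharing one io-wq backend */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p1, p2;
	int fd1, fd2;

	memset(&p1, 0, sizeof(p1));
	fd1 = syscall(__NR_io_uring_setup, 8, &p1);
	if (fd1 < 0) {
		perror("first ring");
		return 1;
	}

	/* attach to the first ring's io-wq instead of creating a new pool */
	memset(&p2, 0, sizeof(p2));
	p2.flags = IORING_SETUP_ATTACH_WQ;
	p2.wq_fd = fd1;
	fd2 = syscall(__NR_io_uring_setup, 8, &p2);
	if (fd2 < 0) {
		perror("sibling ring");
		return 1;
	}

	printf("rings %d and %d share one io-wq\n", fd1, fd2);
	close(fd2);
	close(fd1);
	return 0;
}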