Commit 52ae2456 authored by Linus Torvalds

Merge tag 'for-5.2/io_uring-20190507' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Set of changes/improvements for io_uring. This contains:

   - Fix of a shadowed variable (Colin)

   - Add support for draining commands (me)

   - Add support for sync_file_range() (me)

   - Add eventfd support (me)

   - cpu_online() fix (Shenghui)

   - Removal of a redundant ->error assignment (Stefan)"

* tag 'for-5.2/io_uring-20190507' of git://git.kernel.dk/linux-block:
  io_uring: use cpu_online() to check p->sq_thread_cpu instead of cpu_possible()
  io_uring: fix shadowed variable ret return code being not checked
  req->error only used for iopoll
  io_uring: add support for eventfd notifications
  io_uring: add support for IORING_OP_SYNC_FILE_RANGE
  fs: add sync_file_range() helper
  io_uring: add support for marking commands as draining
parents 67a24222 7889f44d
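
To make the submission-side additions concrete, here is a minimal userspace sketch; assumptions: prep_sync_range_drained() is a hypothetical helper, not part of this series, and the ring setup/mmap plumbing is omitted, with `sqe` assumed to point into an already-mapped SQ entry array. It fills an SQE combining the new IOSQE_IO_DRAIN flag with the new IORING_OP_SYNC_FILE_RANGE opcode:

#include <string.h>
#include <linux/fs.h>		/* SYNC_FILE_RANGE_* flag values */
#include <linux/io_uring.h>

static void prep_sync_range_drained(struct io_uring_sqe *sqe, int fd,
				    __u64 off, __u32 len, __u64 user_data)
{
	/* the kernel prep helper rejects stray addr/ioprio/buf_index */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
	sqe->flags = IOSQE_IO_DRAIN;	/* issue only after in-flight IO completes */
	sqe->fd = fd;
	sqe->off = off;			/* byte offset to start syncing at */
	sqe->len = len;			/* byte count; 0 means "from off to EOF" */
	/* same flag values as the sync_file_range(2) syscall */
	sqe->sync_range_flags = SYNC_FILE_RANGE_WAIT_BEFORE |
				SYNC_FILE_RANGE_WRITE |
				SYNC_FILE_RANGE_WAIT_AFTER;
	sqe->user_data = user_data;	/* echoed back in the CQE */
}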
fs/io_uring.c
+188 −6
@@ -222,6 +222,8 @@ struct io_ring_ctx {
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		struct io_uring_sqe	*sq_sqes;

		struct list_head	defer_list;
	} ____cacheline_aligned_in_smp;

	/* IO offload */
@@ -239,6 +241,7 @@ struct io_ring_ctx {
		unsigned		cq_mask;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	/*
@@ -327,8 +330,11 @@ struct io_kiocb {
#define REQ_F_FIXED_FILE	4	/* ctx owns file */
#define REQ_F_SEQ_PREV		8	/* sequential with previous */
#define REQ_F_PREPPED		16	/* prep already done */
#define REQ_F_IO_DRAIN		32	/* drain existing IO first */
#define REQ_F_IO_DRAINED	64	/* drain done */
	u64			user_data;
-	u64			error;
+	u32			error;	/* iopoll result from callback */
	u32			sequence;

	struct work_struct	work;
};
@@ -356,6 +362,8 @@ struct io_submit_state {
	unsigned int		ios_left;
};

static void io_sq_wq_submit_work(struct work_struct *work);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;
@@ -407,10 +415,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	spin_lock_init(&ctx->completion_lock);
	INIT_LIST_HEAD(&ctx->poll_list);
	INIT_LIST_HEAD(&ctx->cancel_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	return ctx;
}

-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
				     struct io_kiocb *req)
{
	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
		return false;

	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
}

static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	if (list_empty(&ctx->defer_list))
		return NULL;

	req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
	if (!io_sequence_defer(ctx, req)) {
		list_del_init(&req->list);
		return req;
	}

	return NULL;
}

static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_cq_ring *ring = ctx->cq_ring;

@@ -425,6 +459,18 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
	}
}

static void io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	__io_commit_cqring(ctx);

	while ((req = io_get_deferred_req(ctx)) != NULL) {
		req->flags |= REQ_F_IO_DRAINED;
		queue_work(ctx->sqo_wq, &req->work);
	}
}

static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_cq_ring *ring = ctx->cq_ring;
@@ -471,6 +517,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
		wake_up(&ctx->wait);
	if (waitqueue_active(&ctx->sqo_wait))
		wake_up(&ctx->sqo_wait);
	if (ctx->cq_ev_fd)
		eventfd_signal(ctx->cq_ev_fd, 1);
}

static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
@@ -1222,6 +1270,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
	return 0;
}

static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret = 0;

	if (!req->file)
		return -EBADF;
	/* Prep already done (EAGAIN retry) */
	if (req->flags & REQ_F_PREPPED)
		return 0;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
		return -EINVAL;

	req->flags |= REQ_F_PREPPED;
	return ret;
}

static int io_sync_file_range(struct io_kiocb *req,
			      const struct io_uring_sqe *sqe,
			      bool force_nonblock)
{
	loff_t sqe_off;
	loff_t sqe_len;
	unsigned flags;
	int ret;

	ret = io_prep_sfr(req, sqe);
	if (ret)
		return ret;

	/* sync_file_range always requires a blocking context */
	if (force_nonblock)
		return -EAGAIN;

	sqe_off = READ_ONCE(sqe->off);
	sqe_len = READ_ONCE(sqe->len);
	flags = READ_ONCE(sqe->sync_range_flags);

	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);

	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
	io_put_req(req);
	return 0;
}

static void io_poll_remove_one(struct io_kiocb *req)
{
	struct io_poll_iocb *poll = &req->poll;
@@ -1424,7 +1520,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		spin_unlock(&poll->head->lock);
	}
	if (mask) { /* no async, we'd stolen it */
-		req->error = mangle_poll(mask);
		ipt.error = 0;
		io_poll_complete(ctx, req, mask);
	}
@@ -1437,6 +1532,34 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	return ipt.error;
}

static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
			const struct io_uring_sqe *sqe)
{
	struct io_uring_sqe *sqe_copy;

	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
		return 0;

	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
	if (!sqe_copy)
		return -EAGAIN;

	spin_lock_irq(&ctx->completion_lock);
	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
		spin_unlock_irq(&ctx->completion_lock);
		kfree(sqe_copy);
		return 0;
	}

	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
	req->submit.sqe = sqe_copy;

	INIT_WORK(&req->work, io_sq_wq_submit_work);
	list_add_tail(&req->list, &ctx->defer_list);
	spin_unlock_irq(&ctx->completion_lock);
	return -EIOCBQUEUED;
}

static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			   const struct sqe_submit *s, bool force_nonblock)
{
@@ -1476,6 +1599,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
	case IORING_OP_POLL_REMOVE:
		ret = io_poll_remove(req, s->sqe);
		break;
	case IORING_OP_SYNC_FILE_RANGE:
		ret = io_sync_file_range(req, s->sqe, force_nonblock);
		break;
	default:
		ret = -EINVAL;
		break;
@@ -1684,6 +1810,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
	flags = READ_ONCE(s->sqe->flags);
	fd = READ_ONCE(s->sqe->fd);

	if (flags & IOSQE_IO_DRAIN) {
		req->flags |= REQ_F_IO_DRAIN;
		req->sequence = ctx->cached_sq_head - 1;
	}

	if (!io_op_needs_file(s->sqe)) {
		req->file = NULL;
		return 0;
@@ -1713,7 +1844,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
	int ret;

	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
		return -EINVAL;

	req = io_get_req(ctx, state);
@@ -1724,6 +1855,13 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
	if (unlikely(ret))
		goto out;

	ret = io_req_defer(ctx, req, s->sqe);
	if (ret) {
		if (ret == -EIOCBQUEUED)
			ret = 0;
		return ret;
	}

	ret = __io_submit_sqe(ctx, req, s, true);
	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
		struct io_uring_sqe *sqe_copy;
@@ -2225,7 +2363,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
	left = ctx->nr_user_files;
	while (left) {
		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
-		int ret;

		ret = __io_sqe_files_scm(ctx, this_files, total);
		if (ret)
@@ -2334,7 +2471,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
							nr_cpu_ids);

			ret = -EINVAL;
-			if (!cpu_possible(cpu))
+			if (!cpu_online(cpu))
				goto err;

			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
@@ -2621,6 +2758,38 @@ err:
	return ret;
}

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
	__s32 __user *fds = arg;
	int fd;

	if (ctx->cq_ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ctx->cq_ev_fd)) {
		int ret = PTR_ERR(ctx->cq_ev_fd);
		ctx->cq_ev_fd = NULL;
		return ret;
	}

	return 0;
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	if (ctx->cq_ev_fd) {
		eventfd_ctx_put(ctx->cq_ev_fd);
		ctx->cq_ev_fd = NULL;
		return 0;
	}

	return -ENXIO;
}

static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_finish_async(ctx);
@@ -2630,6 +2799,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
	io_iopoll_reap_events(ctx);
	io_sqe_buffer_unregister(ctx);
	io_sqe_files_unregister(ctx);
	io_eventfd_unregister(ctx);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock)
@@ -3043,6 +3213,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	default:
		ret = -EINVAL;
		break;
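
For the registration side, here is a hedged sketch of wiring an eventfd to CQE postings via IORING_REGISTER_EVENTFD; setup_cq_eventfd() is a hypothetical helper, a raw syscall(2) invocation is assumed since glibc shipped no io_uring wrappers at the time, and headers new enough to define __NR_io_uring_register are also assumed:

#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int setup_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* nr_args must be 1: the kernel copies a single __s32 fd from arg */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_EVENTFD, &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* becomes readable whenever CQEs are posted */
}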
fs/sync.c
+71 −64
@@ -234,58 +234,10 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
	return do_fsync(fd, 1);
}

-/*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
- * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
- *
- * The flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
- * before performing the write.
- *
- * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
- * range which are not presently under writeback. Note that this may block for
- * significant periods due to exhaustion of disk request structures.
- *
- * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
- * after performing the write.
- *
- * Useful combinations of the flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
- * under writeout.  This is a start-write-for-data-integrity operation.
- *
- * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
- * are not presently under writeout.  This is an asynchronous flush-to-disk
- * operation.  Not suitable for data integrity operations.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
- * completion of writeout of all pages in the range.  This will be used after an
- * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
- * for that operation to complete and to return the result.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
- * a traditional sync() operation.  This is a write-for-data-integrity operation
- * which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
- *
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
- * I/O errors or ENOSPC conditions and will return those to the caller, after
- * clearing the EIO and ENOSPC flags in the address_space.
- *
- * It should be noted that none of these operations write out the file's
- * metadata.  So unless the application is strictly performing overwrites of
- * already-instantiated disk blocks, there are no guarantees here that the data
- * will be available after a crash.
- */
-int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
+int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
		    unsigned int flags)
{
	int ret;
-	struct fd f;
	struct address_space *mapping;
	loff_t endbyte;			/* inclusive */
	umode_t i_mode;
@@ -325,41 +277,96 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
	else
		endbyte--;		/* inclusive */

-	ret = -EBADF;
-	f = fdget(fd);
-	if (!f.file)
-		goto out;

-	i_mode = file_inode(f.file)->i_mode;
+	i_mode = file_inode(file)->i_mode;
	ret = -ESPIPE;
	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
			!S_ISLNK(i_mode))
-		goto out_put;
+		goto out;

-	mapping = f.file->f_mapping;
+	mapping = file->f_mapping;
	ret = 0;
	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);
		if (ret < 0)
-			goto out_put;
+			goto out;
	}

	if (flags & SYNC_FILE_RANGE_WRITE) {
		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
						 WB_SYNC_NONE);
		if (ret < 0)
-			goto out_put;
+			goto out;
	}

	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);

-out_put:
-	fdput(f);
out:
	return ret;
}

/*
 * sys_sync_file_range() permits finely controlled syncing over a segment of
 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 * zero then sys_sync_file_range() will operate from offset out to EOF.
 *
 * The flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 * before performing the write.
 *
 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 * range which are not presently under writeback. Note that this may block for
 * significant periods due to exhaustion of disk request structures.
 *
 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 * after performing the write.
 *
 * Useful combinations of the flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 * in the range which were dirty on entry to sys_sync_file_range() are placed
 * under writeout.  This is a start-write-for-data-integrity operation.
 *
 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 * are not presently under writeout.  This is an asynchronous flush-to-disk
 * operation.  Not suitable for data integrity operations.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 * completion of writeout of all pages in the range.  This will be used after an
 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 * for that operation to complete and to return the result.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
 * a traditional sync() operation.  This is a write-for-data-integrity operation
 * which will ensure that all pages in the range which were dirty on entry to
 * sys_sync_file_range() are committed to disk.
 *
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 * I/O errors or ENOSPC conditions and will return those to the caller, after
 * clearing the EIO and ENOSPC flags in the address_space.
 *
 * It should be noted that none of these operations write out the file's
 * metadata.  So unless the application is strictly performing overwrites of
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
			 unsigned int flags)
{
	int ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (f.file)
		ret = sync_file_range(f.file, offset, nbytes, flags);

	fdput(f);
	return ret;
}

SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
				unsigned int, flags)
{
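
As a userspace counterpart to the flag combinations documented in the comment above, here is a hedged sketch of the start-write-then-wait data-integrity pattern through the glibc sync_file_range(2) wrapper (distinct from the kernel-internal helper added here); flush_range() is a hypothetical helper:

#define _GNU_SOURCE		/* glibc exposes sync_file_range() with this */
#include <sys/types.h>
#include <fcntl.h>

/* Start writeout of a dirty range without blocking, then wait for it to
 * reach storage.  Per the comment above, this syncs data, not metadata. */
static int flush_range(int fd, off64_t offset, off64_t nbytes)
{
	/* asynchronous flush-to-disk: start writeout, do not wait */
	if (sync_file_range(fd, offset, nbytes, SYNC_FILE_RANGE_WRITE) < 0)
		return -1;

	/* ... overlap other work here ... */

	/* write-for-data-integrity: wait for that writeout to complete */
	return sync_file_range(fd, offset, nbytes,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}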
include/linux/fs.h
+3 −0
@@ -2789,6 +2789,9 @@ extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
			   int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
				unsigned int flags);

/*
 * Sync the bytes written if this was a synchronous write.  Expect ki_pos
 * to already be updated for the write, and will return either the amount
include/uapi/linux/io_uring.h
+5 −0
@@ -26,6 +26,7 @@ struct io_uring_sqe {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;
		__u32		sync_range_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	union {
@@ -38,6 +39,7 @@ struct io_uring_sqe {
 * sqe->flags
 */
#define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
#define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */

/*
 * io_uring_setup() flags
@@ -54,6 +56,7 @@ struct io_uring_sqe {
#define IORING_OP_WRITE_FIXED	5
#define IORING_OP_POLL_ADD	6
#define IORING_OP_POLL_REMOVE	7
#define IORING_OP_SYNC_FILE_RANGE	8

/*
 * sqe->fsync_flags
@@ -133,5 +136,7 @@ struct io_uring_params {
#define IORING_UNREGISTER_BUFFERS	1
#define IORING_REGISTER_FILES		2
#define IORING_UNREGISTER_FILES		3
#define IORING_REGISTER_EVENTFD		4
#define IORING_UNREGISTER_EVENTFD	5

#endif
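
Once an eventfd is registered with IORING_REGISTER_EVENTFD, completions can be awaited without busy-polling the CQ ring. A hedged sketch; wait_for_completions() is a hypothetical helper:

#include <poll.h>
#include <stdint.h>
#include <unistd.h>

/* Block until the registered eventfd fires, then return how many
 * eventfd_signal() increments accumulated since the last read.  CQEs
 * are then reaped from the mmap'ed CQ ring as usual. */
static uint64_t wait_for_completions(int efd)
{
	struct pollfd pfd = { .fd = efd, .events = POLLIN };
	uint64_t count = 0;

	if (poll(&pfd, 1, -1) == 1 &&
	    read(efd, &count, sizeof(count)) != sizeof(count))
		return 0;	/* treat a short or failed read as "nothing" */
	return count;		/* the read also resets the eventfd counter */
}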