Commit 5f1c6d28 authored by Dave Chinner

Merge branch 'iomap-4.10-directio' into for-next

parents b7b26110 acdda3aa
block/bio.c +49 −0
@@ -847,6 +847,55 @@ done:
}
EXPORT_SYMBOL(bio_add_page);

/**
 * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Pins as many pages from *iter as will fit in @bio's bvec array and appends
 * them to it.  The pages will have to be released using put_page() when done.
 */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	size_t offset, diff;
	ssize_t size;

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;
	nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;

	/*
	 * Deep magic below:  We need to walk the pinned pages backwards
	 * because we are abusing the space allocated for the bio_vecs
	 * for the page array.  Because the bio_vecs are larger than the
	 * page pointers by definition this will always work.  But it also
	 * means we can't use bio_add_page, so any changes to its semantics
	 * need to be reflected here as well.
	 */
	bio->bi_iter.bi_size += size;
	bio->bi_vcnt += nr_pages;

	diff = (nr_pages * PAGE_SIZE - offset) - size;
	while (nr_pages--) {
		bv[nr_pages].bv_page = pages[nr_pages];
		bv[nr_pages].bv_len = PAGE_SIZE;
		bv[nr_pages].bv_offset = 0;
	}

	bv[0].bv_offset += offset;
	bv[0].bv_len -= offset;
	if (diff)
		bv[bio->bi_vcnt - 1].bv_len -= diff;

	iov_iter_advance(iter, size);
	return 0;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

struct submit_bio_ret {
	struct completion event;
	int error;
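
A note on the "deep magic" above: iov_iter_get_pages() deposits its page
pointers into the same memory that will later hold the bio_vecs, and because
sizeof(struct bio_vec) is larger than sizeof(struct page *), filling the array
back to front only ever overwrites pointer slots that have already been
consumed.  A minimal sketch of a caller follows, modeled on the iomap direct
I/O code added later in this merge; the example_* name and the elided
->bi_end_io setup are illustrative, not part of this commit:

static int example_submit_dio_bio(struct block_device *bdev, sector_t sector,
		struct iov_iter *iter)
{
	/* size the bio by the number of pages the iterator spans */
	int nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
	struct bio *bio;
	int ret;

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	/* a real caller must set bio->bi_end_io to put_page() each page */

	ret = bio_iov_iter_get_pages(bio, iter);
	if (ret) {
		bio_put(bio);
		return ret;
	}
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	submit_bio(bio);
	return 0;
}
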
fs/direct-io.c +1 −1
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 * filesystems that don't need it, which also allows us to create the workqueue
 * late enough so that we can include s_id in the name of the workqueue.
 */
static int sb_init_dio_done_wq(struct super_block *sb)
int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
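
The rest of the body falls outside this hunk, but the `old` local above hints
at its shape: the workqueue is published with a cmpxchg so that racing
callers (now that the function is usable outside direct-io.c) neither leak
nor clobber each other's allocation.  A hedged sketch of that publish-once
pattern, assuming the body continues along these lines:

	if (!wq)
		return -ENOMEM;
	/* racing DIO submitters may both allocate; only one pointer wins */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
	if (old)
		destroy_workqueue(wq);	/* lost the race; keep the winner's */
	return 0;
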
fs/internal.h +3 −0
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap_ops *ops, void *data,
		iomap_actor_t actor);

/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
fs/iomap.c +373 −0
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"

@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (iocb->ki_pos + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - iocb->ki_pos;
		iocb->ki_pos += ret;
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;
	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	ssize_t ret;

	ret = iomap_dio_complete(dio);
	if (is_write && ret > 0)
		ret = generic_write_sync(iocb, ret);
	iocb->ki_complete(iocb, ret, 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_error)
		iomap_dio_set_error(dio, bio->bi_error);

	if (atomic_dec_and_test(&dio->ref)) {
		if (is_sync_kiocb(dio->iocb)) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			wake_up_process(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}

static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio->bi_bdev = iomap->bdev;
	bio->bi_iter.bi_sector =
		iomap->blkno + ((pos - iomap->offset) >> 9);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	if (bio_add_page(bio, page, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
	unsigned align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE)) {
			iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			need_zeroout = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		if (dio->error)
			return 0;

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio->bi_bdev = iomap->bdev;
		bio->bi_iter.bi_sector =
			iomap->blkno + ((pos - iomap->offset) >> 9);
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			bio_put(bio);
			return ret;
		}

		if (dio->flags & IOMAP_DIO_WRITE) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
			task_io_account_write(bio->bi_iter.bi_size);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (need_zeroout) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}

	iov_iter_advance(dio->submit.iter, length);
	return length;
}

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
		iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out_free_dio;

		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
			!inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			iomap_dio_set_error(dio, ret);
	}

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
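
For context, a hypothetical consumer of the new entry point (the real one is
the XFS read/write path in fs/xfs/xfs_file.c, collapsed out of this view; the
myfs_* names are illustrative only).  Note that iomap_dio_rw()
lockdep-asserts i_rwsem, so the caller must take it first:

static ssize_t myfs_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);	/* iomap_dio_rw() asserts i_rwsem */
	ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);
	inode_unlock_shared(inode);
	return ret;
}
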
fs/xfs/xfs_aops.c +10 −288
@@ -37,11 +37,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
#define XFS_DIO_FLAG_APPEND	(1 << 1)
#define XFS_DIO_FLAG_COW	(1 << 2)

/*
 * structure owned by writepages passed to individual writepage calls
 */
@@ -1175,45 +1170,6 @@ xfs_vm_releasepage(
	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to pass flags to
 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO is
 * extending the file size. We won't know for sure until IO completion is run
 * and the actual max write offset is communicated to the IO completion
 * routine.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	bool			is_cow)
{
	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
	xfs_off_t		size = bh_result->b_size;

	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
		XFS_IO_OVERWRITE, imap);

	if (ISUNWRITTEN(imap)) {
		*flags |= XFS_DIO_FLAG_UNWRITTEN;
		set_buffer_defer_completion(bh_result);
	} else if (is_cow) {
		*flags |= XFS_DIO_FLAG_COW;
		set_buffer_defer_completion(bh_result);
	}
	if (offset + size > i_size_read(inode) || offset + size < 0) {
		*flags |= XFS_DIO_FLAG_APPEND;
		set_buffer_defer_completion(bh_result);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling tell them how large the mapping
 * is, so that we can avoid repeated get_blocks calls.
@@ -1254,51 +1210,12 @@ xfs_map_trim_size(
	bh_result->b_size = mapping_size;
}

/* Bounce unaligned directio writes to the page cache. */
static int
xfs_bounce_unaligned_dio_write(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_bmbt_irec	irec;
	xfs_fileoff_t		delta;
	bool			shared;
	bool			x;
	int			error;

	irec = *imap;
	if (offset_fsb > irec.br_startoff) {
		delta = offset_fsb - irec.br_startoff;
		irec.br_blockcount -= delta;
		irec.br_startblock += delta;
		irec.br_startoff = offset_fsb;
	}
	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
	if (error)
		return error;

	/*
	 * We're here because we're trying to do a directio write to a
	 * region that isn't aligned to a filesystem block.  If any part
	 * of the extent is shared, fall back to buffered mode to handle
	 * the RMW.  This is done by returning -EREMCHG ("remote addr
	 * changed"), which is caught further up the call stack.
	 */
	if (shared) {
		trace_xfs_reflink_bounce_dio_write(ip, imap);
		return -EREMCHG;
	}
	return 0;
}

STATIC int
__xfs_get_blocks(
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct)
	int			create)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
@@ -1309,10 +1226,8 @@ __xfs_get_blocks(
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;
	bool			is_cow = false;

	BUG_ON(create && !direct);
	BUG_ON(create);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;
@@ -1321,7 +1236,7 @@ __xfs_get_blocks(
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && offset >= i_size_read(inode))
	if (offset >= i_size_read(inode))
		return 0;

	/*
@@ -1336,73 +1251,12 @@ __xfs_get_blocks(
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	if (create && direct && xfs_is_reflink_inode(ip)) {
		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
		ASSERT(!is_cow || !isnullstartblock(imap.br_startblock));
	}

	if (!is_cow) {
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
		/*
		 * Truncate an overwrite extent if there's a pending CoW
		 * reservation before the end of this extent.  This
		 * forces us to come back to get_blocks to take care of
		 * the CoW.
		 */
		if (create && direct && nimaps &&
		    imap.br_startblock != HOLESTARTBLOCK &&
		    imap.br_startblock != DELAYSTARTBLOCK &&
		    !ISUNWRITTEN(&imap))
			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
					&imap);
	}
	if (error)
		goto out_unlock;

	/*
	 * The only time we can ever safely find delalloc blocks on direct I/O
	 * is a dio write to post-eof speculative preallocation. All other
	 * scenarios are indicative of a problem or misuse (such as mixing
	 * direct and mapped I/O).
	 *
	 * The file may be unmapped by the time we get here so we cannot
	 * reliably fail the I/O based on mapping. Instead, fail the I/O if this
	 * is a read or a write within eof. Otherwise, carry on but warn as a
	 * precaution if the file happens to be mapped.
	 */
	if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
		if (!create || offset < i_size_read(VFS_I(ip))) {
			WARN_ON_ONCE(1);
			error = -EIO;
			goto out_unlock;
		}
		WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
	}

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		/*
		 * xfs_iomap_write_direct() expects the shared lock. It
		 * is unlocked on return.
		 */
		if (lockmode == XFS_ILOCK_EXCL)
			xfs_ilock_demote(ip, lockmode);

		error = xfs_iomap_write_direct(ip, offset, size,
					       &imap, nimaps);
		if (error)
			return error;
		new = 1;

		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
	if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
@@ -1412,12 +1266,6 @@ __xfs_get_blocks(
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

@@ -1427,43 +1275,14 @@ __xfs_get_blocks(
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		if (create && direct && !is_cow) {
			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
					&imap);
			if (error)
				return error;
		}

	    !ISUNWRITTEN(&imap))
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create)
			xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
	}

	/*
	 * If this is a realtime file, data may be on a different device.
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer gets
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	return 0;

out_unlock:
@@ -1471,100 +1290,6 @@ out_unlock:
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}

/*
 * Complete a direct I/O write request.
 *
 * xfs_map_direct passes us some flags in the private data to tell us what to
 * do.  If no flags are set, then the write IO is an overwrite wholly within
 * the existing allocated file size and so there is nothing for us to do.
 *
 * Note that in this case the completion can be called in interrupt context,
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
int
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	uintptr_t		flags = (uintptr_t)private;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * The flags tell us whether we are doing unwritten extent conversions
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 */
	if (flags == 0) {
		ASSERT(offset + size <= i_size_read(inode));
		return 0;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	if (flags & XFS_DIO_FLAG_COW)
		error = xfs_reflink_end_cow(ip, offset, size);
	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);

		error = xfs_iomap_write_unwritten(ip, offset, size);
	}
	if (flags & XFS_DIO_FLAG_APPEND) {
		trace_xfs_end_io_direct_write_append(ip, offset, size);

		error = xfs_setfilesize(ip, offset, size);
	}

	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
@@ -1585,7 +1310,6 @@ xfs_vm_bmap(
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1593,12 +1317,10 @@ xfs_vm_bmap(
	 * that on reflinks inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error..
	 */
	if (xfs_is_reflink_inode(ip)) {
		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (xfs_is_reflink_inode(ip))
		return 0;
	}

	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}
