Commit 5f1c6d28 authored by Dave Chinner

Merge branch 'iomap-4.10-directio' into for-next

parents b7b26110 acdda3aa
block/bio.c +49 −0
@@ -847,6 +847,55 @@ done:
}
EXPORT_SYMBOL(bio_add_page);

/**
 * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Pins as many pages from *iter as will fit in @bio's bvec array and appends
 * them to it.  The pages will have to be released using put_page() when done.
 */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	size_t offset, diff;
	ssize_t size;

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;
	nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;

	/*
	 * Deep magic below:  We need to walk the pinned pages backwards
	 * because we are abusing the space allocated for the bio_vecs
	 * for the page array.  Because the bio_vecs are larger than the
	 * page pointers by definition this will always work.  But it also
	 * means we can't use bio_add_page, so any changes to its semantics
	 * need to be reflected here as well.
	 */
	bio->bi_iter.bi_size += size;
	bio->bi_vcnt += nr_pages;

	diff = (nr_pages * PAGE_SIZE - offset) - size;
	while (nr_pages--) {
		bv[nr_pages].bv_page = pages[nr_pages];
		bv[nr_pages].bv_len = PAGE_SIZE;
		bv[nr_pages].bv_offset = 0;
	}

	bv[0].bv_offset += offset;
	bv[0].bv_len -= offset;
	if (diff)
		bv[bio->bi_vcnt - 1].bv_len -= diff;

	iov_iter_advance(iter, size);
	return 0;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

struct submit_bio_ret {
	struct completion event;
	int error;
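
A note on the "deep magic" above: iov_iter_get_pages() deposits its page
pointers into the same memory that will later hold the bio_vecs, and because
sizeof(struct bio_vec) is larger than sizeof(struct page *), filling the array
back to front only ever overwrites pointer slots that have already been
consumed.  A minimal sketch of a caller follows, modeled on the iomap direct
I/O code added later in this merge; the example_* name and the elided
->bi_end_io setup are illustrative, not part of this commit:

static int example_submit_dio_bio(struct block_device *bdev, sector_t sector,
		struct iov_iter *iter)
{
	/* size the bio by the number of pages the iterator spans */
	int nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
	struct bio *bio;
	int ret;

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	/* a real caller must set bio->bi_end_io to put_page() each page */

	ret = bio_iov_iter_get_pages(bio, iter);
	if (ret) {
		bio_put(bio);
		return ret;
	}
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	submit_bio(bio);
	return 0;
}
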
fs/direct-io.c +1 −1
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 * filesystems that don't need it, which also allows us to create the workqueue
 * late enough so that we can include s_id in the name of the workqueue.
 */
static int sb_init_dio_done_wq(struct super_block *sb)
int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
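
The rest of the body falls outside this hunk, but the `old` local above hints
at its shape: the workqueue is published with a cmpxchg so that racing
callers (now that the function is usable outside direct-io.c) neither leak
nor clobber each other's allocation.  A hedged sketch of that publish-once
pattern, assuming the body continues along these lines:

	if (!wq)
		return -ENOMEM;
	/* racing DIO submitters may both allocate; only one pointer wins */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
	if (old)
		destroy_workqueue(wq);	/* lost the race; keep the winner's */
	return 0;
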
fs/internal.h +3 −0
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap_ops *ops, void *data,
		iomap_actor_t actor);

/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
fs/iomap.c +373 −0
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"

@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (iocb->ki_pos + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - iocb->ki_pos;
		iocb->ki_pos += ret;
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;
	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	ssize_t ret;

	ret = iomap_dio_complete(dio);
	if (is_write && ret > 0)
		ret = generic_write_sync(iocb, ret);
	iocb->ki_complete(iocb, ret, 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_error)
		iomap_dio_set_error(dio, bio->bi_error);

	if (atomic_dec_and_test(&dio->ref)) {
		if (is_sync_kiocb(dio->iocb)) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			wake_up_process(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}

static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio->bi_bdev = iomap->bdev;
	bio->bi_iter.bi_sector =
		iomap->blkno + ((pos - iomap->offset) >> 9);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	if (bio_add_page(bio, page, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
	unsigned align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE)) {
			iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			need_zeroout = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		if (dio->error)
			return 0;

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio->bi_bdev = iomap->bdev;
		bio->bi_iter.bi_sector =
			iomap->blkno + ((pos - iomap->offset) >> 9);
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			bio_put(bio);
			return ret;
		}

		if (dio->flags & IOMAP_DIO_WRITE) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
			task_io_account_write(bio->bi_iter.bi_size);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (need_zeroout) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}

	iov_iter_advance(dio->submit.iter, length);
	return length;
}

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
		iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out_free_dio;

		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
			!inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			iomap_dio_set_error(dio, ret);
	}

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
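
For context, a hypothetical consumer of the new entry point (the real one is
the XFS read/write path in fs/xfs/xfs_file.c, collapsed out of this view; the
myfs_* names are illustrative only).  Note that iomap_dio_rw()
lockdep-asserts i_rwsem, so the caller must take it first:

static ssize_t myfs_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);	/* iomap_dio_rw() asserts i_rwsem */
	ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);
	inode_unlock_shared(inode);
	return ret;
}
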
fs/xfs/xfs_aops.c +10 −288
@@ -37,11 +37,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
#define XFS_DIO_FLAG_APPEND	(1 << 1)
#define XFS_DIO_FLAG_COW	(1 << 2)

/*
 * structure owned by writepages passed to individual writepage calls
 */
@@ -1175,45 +1170,6 @@ xfs_vm_releasepage(
	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to pass flags to
 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO is
 * extending the file size. We won't know for sure until IO completion is run
 * and the actual max write offset is communicated to the IO completion
 * routine.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	bool			is_cow)
{
	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
	xfs_off_t		size = bh_result->b_size;

	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
		XFS_IO_OVERWRITE, imap);

	if (ISUNWRITTEN(imap)) {
		*flags |= XFS_DIO_FLAG_UNWRITTEN;
		set_buffer_defer_completion(bh_result);
	} else if (is_cow) {
		*flags |= XFS_DIO_FLAG_COW;
		set_buffer_defer_completion(bh_result);
	}
	if (offset + size > i_size_read(inode) || offset + size < 0) {
		*flags |= XFS_DIO_FLAG_APPEND;
		set_buffer_defer_completion(bh_result);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling tell them how large the mapping
 * is, so that we can avoid repeated get_blocks calls.
@@ -1254,51 +1210,12 @@ xfs_map_trim_size(
	bh_result->b_size = mapping_size;
}

/* Bounce unaligned directio writes to the page cache. */
static int
xfs_bounce_unaligned_dio_write(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_bmbt_irec	irec;
	xfs_fileoff_t		delta;
	bool			shared;
	bool			x;
	int			error;

	irec = *imap;
	if (offset_fsb > irec.br_startoff) {
		delta = offset_fsb - irec.br_startoff;
		irec.br_blockcount -= delta;
		irec.br_startblock += delta;
		irec.br_startoff = offset_fsb;
	}
	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
	if (error)
		return error;

	/*
	 * We're here because we're trying to do a directio write to a
	 * region that isn't aligned to a filesystem block.  If any part
	 * of the extent is shared, fall back to buffered mode to handle
	 * the RMW.  This is done by returning -EREMCHG ("remote addr
	 * changed"), which is caught further up the call stack.
	 */
	if (shared) {
		trace_xfs_reflink_bounce_dio_write(ip, imap);
		return -EREMCHG;
	}
	return 0;
}

STATIC int
__xfs_get_blocks(
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct)
	int			create)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
@@ -1309,10 +1226,8 @@ __xfs_get_blocks(
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;
	bool			is_cow = false;

	BUG_ON(create && !direct);
	BUG_ON(create);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;
@@ -1321,7 +1236,7 @@ __xfs_get_blocks(
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && offset >= i_size_read(inode))
	if (offset >= i_size_read(inode))
		return 0;

	/*
@@ -1336,73 +1251,12 @@ __xfs_get_blocks(
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	if (create && direct && xfs_is_reflink_inode(ip)) {
		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
		ASSERT(!is_cow || !isnullstartblock(imap.br_startblock));
	}

	if (!is_cow) {
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
		/*
		 * Truncate an overwrite extent if there's a pending CoW
		 * reservation before the end of this extent.  This
		 * forces us to come back to get_blocks to take care of
		 * the CoW.
		 */
		if (create && direct && nimaps &&
		    imap.br_startblock != HOLESTARTBLOCK &&
		    imap.br_startblock != DELAYSTARTBLOCK &&
		    !ISUNWRITTEN(&imap))
			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
					&imap);
	}
	if (error)
		goto out_unlock;

	/*
	 * The only time we can ever safely find delalloc blocks on direct I/O
	 * is a dio write to post-eof speculative preallocation. All other
	 * scenarios are indicative of a problem or misuse (such as mixing
	 * direct and mapped I/O).
	 *
	 * The file may be unmapped by the time we get here so we cannot
	 * reliably fail the I/O based on mapping. Instead, fail the I/O if this
	 * is a read or a write within eof. Otherwise, carry on but warn as a
	 * precaution if the file happens to be mapped.
	 */
	if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
		if (!create || offset < i_size_read(VFS_I(ip))) {
			WARN_ON_ONCE(1);
			error = -EIO;
			goto out_unlock;
		}
		WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
	}

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		/*
		 * xfs_iomap_write_direct() expects the shared lock. It
		 * is unlocked on return.
		 */
		if (lockmode == XFS_ILOCK_EXCL)
			xfs_ilock_demote(ip, lockmode);

		error = xfs_iomap_write_direct(ip, offset, size,
					       &imap, nimaps);
		if (error)
			return error;
		new = 1;

		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
	if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
@@ -1412,12 +1266,6 @@ __xfs_get_blocks(
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

@@ -1427,43 +1275,14 @@ __xfs_get_blocks(
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		if (create && direct && !is_cow) {
			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
					&imap);
			if (error)
				return error;
		}

	    !ISUNWRITTEN(&imap))
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create)
			xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
	}

	/*
	 * If this is a realtime file, data may be on a different device.
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer gets
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	return 0;

out_unlock:
@@ -1471,100 +1290,6 @@ out_unlock:
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}

/*
 * Complete a direct I/O write request.
 *
 * xfs_map_direct passes us some flags in the private data to tell us what to
 * do.  If no flags are set, then the write IO is an overwrite wholly within
 * the existing allocated file size and so there is nothing for us to do.
 *
 * Note that in this case the completion can be called in interrupt context,
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
int
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	uintptr_t		flags = (uintptr_t)private;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * The flags tell us whether we are doing unwritten extent conversions
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 */
	if (flags == 0) {
		ASSERT(offset + size <= i_size_read(inode));
		return 0;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	if (flags & XFS_DIO_FLAG_COW)
		error = xfs_reflink_end_cow(ip, offset, size);
	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);

		error = xfs_iomap_write_unwritten(ip, offset, size);
	}
	if (flags & XFS_DIO_FLAG_APPEND) {
		trace_xfs_end_io_direct_write_append(ip, offset, size);

		error = xfs_setfilesize(ip, offset, size);
	}

	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
@@ -1585,7 +1310,6 @@ xfs_vm_bmap(
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1593,12 +1317,10 @@ xfs_vm_bmap(
	 * that on reflinks inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error..
	 */
	if (xfs_is_reflink_inode(ip)) {
		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (xfs_is_reflink_inode(ip))
		return 0;
	}

	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}
