Merge branch 'mb/dio' into master (8d0d47ea) · Commits · 戴 / test

fs/dax.c

+8 −5

Original line number	Diff line number	Diff line
		@@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);

		static loff_t
		dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
		struct iomap *iomap, struct iomap *srcmap)
		{
		struct block_device *bdev = iomap->bdev;
		struct dax_device *dax_dev = iomap->dax_dev;
		@@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
		struct inode *inode = mapping->host;
		unsigned long vaddr = vmf->address;
		loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
		struct iomap iomap = { 0 };
		struct iomap iomap = { .type = IOMAP_HOLE };
		struct iomap srcmap = { .type = IOMAP_HOLE };
		unsigned flags = IOMAP_FAULT;
		int error, major = 0;
		bool write = vmf->flags & FAULT_FLAG_WRITE;
		@@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
		* the file system block size to be equal the page size, which means
		* that we never have to deal with more than a single extent here.
		*/
		error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
		error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
		if (iomap_errp)
		*iomap_errp = error;
		if (error) {
		@@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
		unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
		struct inode *inode = mapping->host;
		vm_fault_t result = VM_FAULT_FALLBACK;
		struct iomap iomap = { 0 };
		struct iomap iomap = { .type = IOMAP_HOLE };
		struct iomap srcmap = { .type = IOMAP_HOLE };
		pgoff_t max_pgoff;
		void *entry;
		loff_t pos;
		@@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
		* to look up our filesystem block.
		*/
		pos = (loff_t)xas.xa_index << PAGE_SHIFT;
		error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
		error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
		&srcmap);
		if (error)
		goto unlock_entry;

fs/ext2/inode.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,

		#ifdef CONFIG_FS_DAX
		static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned flags, struct iomap *iomap)
		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
		{
		unsigned int blkbits = inode->i_blkbits;
		unsigned long first_block = offset >> blkbits;

fs/ext4/ext4.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -1584,7 +1584,6 @@ enum {
		EXT4_STATE_NO_EXPAND, /* No space for expansion */
		EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
		EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
		EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
		EXT4_STATE_NEWENTRY, /* File just added to dir */
		EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
		EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
		@@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create);
		int ext4_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create);
		int ext4_dio_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create);
		int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create);
		int ext4_walk_page_buffers(handle_t *handle,
		@@ -3391,6 +3388,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
		}

		extern const struct iomap_ops ext4_iomap_ops;
		extern const struct iomap_ops ext4_iomap_report_ops;

		static inline int ext4_buffer_uptodate(struct buffer_head *bh)
		{

fs/ext4/extents.c

+2 −9

Original line number	Diff line number	Diff line
		@@ -1765,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
		*/
		if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
		return 0;
		/*
		* The check for IO to unwritten extent is somewhat racy as we
		* increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
		* dropping i_data_sem. But reserved blocks should save us in that
		* case.
		*/

		if (ext4_ext_is_unwritten(ex1) &&
		(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
		atomic_read(&EXT4_I(inode)->i_unwritten) ||
		(ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
		ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
		return 0;
		#ifdef AGGRESSIVE_TEST
		if (ext1_ee_len >= 4)

fs/ext4/file.c

+338 −74

Original line number	Diff line number	Diff line
		@@ -29,10 +29,58 @@
		#include <linux/pagevec.h>
		#include <linux/uio.h>
		#include <linux/mman.h>
		#include <linux/backing-dev.h>
		#include "ext4.h"
		#include "ext4_jbd2.h"
		#include "xattr.h"
		#include "acl.h"
		#include "truncate.h"

		/*
		 * Report whether direct I/O may be used on @inode.
		 *
		 * Direct I/O is refused when the inode is encrypted (only checked when
		 * CONFIG_FS_ENCRYPTION is enabled), has fs-verity active, journals its
		 * data, or stores its data inline. Returns true in every other case.
		 */
		static bool ext4_dio_supported(struct inode *inode)
		{
			bool encrypted = IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
					 IS_ENCRYPTED(inode);

			/* Short-circuit order matches the individual checks, top to bottom. */
			return !(encrypted ||
				 fsverity_active(inode) ||
				 ext4_should_journal_data(inode) ||
				 ext4_has_inline_data(inode));
		}

		static ssize_t ext4_dio_read_iter(struct kiocb iocb, struct iov_iter to)
		{
		ssize_t ret;
		struct inode *inode = file_inode(iocb->ki_filp);

		if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
		return -EAGAIN;
		} else {
		inode_lock_shared(inode);
		}

		if (!ext4_dio_supported(inode)) {
		inode_unlock_shared(inode);
		/*
		* Fallback to buffered I/O if the operation being performed on
		* the inode is not supported by direct I/O. The IOCB_DIRECT
		* flag needs to be cleared here in order to ensure that the
		* direct I/O path within generic_file_read_iter() is not
		* taken.
		*/
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
		}

		ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
		is_sync_kiocb(iocb));
		inode_unlock_shared(inode);

		file_accessed(iocb->ki_filp);
		return ret;
		}

		#ifdef CONFIG_FS_DAX
		static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
		@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)

		static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
		{
		if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
		struct inode *inode = file_inode(iocb->ki_filp);

		if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

		if (!iov_iter_count(to))
		return 0; /* skip atime */

		#ifdef CONFIG_FS_DAX
		if (IS_DAX(file_inode(iocb->ki_filp)))
		if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
		#endif
		if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

		return generic_file_read_iter(iocb, to);
		}

		@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
		return 0;
		}

		static void ext4_unwritten_wait(struct inode *inode)
		{
		wait_queue_head_t *wq = ext4_ioend_wq(inode);

		wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
		}

		/*
		* This tests whether the IO in question is block-aligned or not.
		* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
		@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

		ret = generic_write_checks(iocb, from);
		if (ret <= 0)
		return ret;

		if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

		/*
		* If we have encountered a bitmap-format file, the size limit
		* is smaller than s_maxbytes, which is for extent-mapped files.
		@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
		return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
		}

		ret = file_modified(iocb->ki_filp);
		if (ret)
		return ret;

		return iov_iter_count(from);
		}

		#ifdef CONFIG_FS_DAX
		static ssize_t
		ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
		static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
		struct iov_iter *from)
		{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;
		struct inode *inode = file_inode(iocb->ki_filp);

		if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;
		return -EOPNOTSUPP;

		inode_lock(inode);
		}
		ret = ext4_write_checks(iocb, from);
		if (ret <= 0)
		goto out;
		ret = file_remove_privs(iocb->ki_filp);
		if (ret)
		goto out;
		ret = file_update_time(iocb->ki_filp);
		if (ret)
		goto out;

		ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
		current->backing_dev_info = inode_to_bdi(inode);
		ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
		current->backing_dev_info = NULL;

		out:
		inode_unlock(inode);
		if (ret > 0)
		if (likely(ret > 0)) {
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
		}

		return ret;
		}
		#endif

		static ssize_t
		ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
		/*
		 * Finish a write that was expected to extend the on-disk inode size.
		 *
		 * @inode:   inode that was written to
		 * @offset:  file position at which the write started
		 * @written: bytes actually written, or a negative error code
		 * @count:   bytes the write was asked to transfer
		 *
		 * Updates the inode size when the write succeeded, removes the inode
		 * from the orphan list once it is no longer needed there, and truncates
		 * blocks left beyond the written range after a short or failed write.
		 *
		 * Returns @written, or a negative error code if starting a journal
		 * handle failed.
		 */
		static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
							   ssize_t written, size_t count)
		{
			handle_t *handle;
			bool truncate = false;
			u8 blkbits = inode->i_blkbits;
			/*
			 * Block-aligned byte offsets. These are file positions, not logical
			 * block numbers, so they are kept in loff_t; narrowing them to
			 * ext4_lblk_t could drop high bits and invert the comparison below.
			 */
			loff_t written_blk, end_blk;

			/*
			 * Note that EXT4_I(inode)->i_disksize can get extended up to
			 * inode->i_size while the I/O was running due to writeback of delalloc
			 * blocks. But, the code in ext4_iomap_alloc() is careful to use
			 * zeroed/unwritten extents if this is possible; thus we won't leave
			 * uninitialized blocks in a file even if we didn't succeed in writing
			 * as much as we intended.
			 */
			WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
			if (offset + count <= EXT4_I(inode)->i_disksize) {
				/*
				 * We need to ensure that the inode is removed from the orphan
				 * list if it has been added prematurely, due to writeback of
				 * delalloc blocks.
				 */
				if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
					handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

					if (IS_ERR(handle)) {
						ext4_orphan_del(NULL, inode);
						return PTR_ERR(handle);
					}

					ext4_orphan_del(handle, inode);
					ext4_journal_stop(handle);
				}

				return written;
			}

			if (written < 0)
				goto truncate;

			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
			if (IS_ERR(handle)) {
				written = PTR_ERR(handle);
				goto truncate;
			}

			if (ext4_update_inode_size(inode, offset + written))
				ext4_mark_inode_dirty(handle, inode);

			/*
			 * We may need to truncate allocated but not written blocks beyond EOF.
			 */
			written_blk = ALIGN(offset + written, 1 << blkbits);
			end_blk = ALIGN(offset + count, 1 << blkbits);
			if (written_blk < end_blk && ext4_can_truncate(inode))
				truncate = true;

			/*
			 * Remove the inode from the orphan list if it has been extended and
			 * everything went OK.
			 */
			if (!truncate && inode->i_nlink)
				ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);

			if (truncate) {
		truncate:
				ext4_truncate_failed_write(inode);
				/*
				 * If the truncate operation failed early, then the inode may
				 * still be on the orphan list. In that case, we need to try
				 * remove the inode from the in-memory linked list.
				 */
				if (inode->i_nlink)
					ext4_orphan_del(NULL, inode);
			}

			return written;
		}

		/*
		 * Completion callback for direct I/O writes (installed as .end_io in
		 * ext4_dio_write_ops).
		 *
		 * @iocb:  I/O control block of the completed write
		 * @size:  number of bytes covered by the completed I/O
		 * @error: error reported for the I/O, 0 on success
		 * @flags: IOMAP_DIO_* flags describing the completed I/O
		 *
		 * Returns @error unchanged when it is set. Otherwise, when bytes were
		 * transferred and IOMAP_DIO_UNWRITTEN is set, returns the result of
		 * converting the unwritten extents in the written range; else 0.
		 */
		static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
						 int error, unsigned int flags)
		{
			loff_t offset = iocb->ki_pos;
			struct inode *inode = file_inode(iocb->ki_filp);

			if (error)
				return error;

			if (size && (flags & IOMAP_DIO_UNWRITTEN))
				return ext4_convert_unwritten_extents(NULL, inode,
								      offset, size);

			return 0;
		}

		/*
		 * iomap direct I/O callbacks used for writes: only the completion hook
		 * is set, pointing at ext4_dio_write_end_io().
		 */
		static const struct iomap_dio_ops ext4_dio_write_ops = {
		.end_io = ext4_dio_write_end_io,
		};

		static ssize_t ext4_dio_write_iter(struct kiocb iocb, struct iov_iter from)
		{
		ssize_t ret;
		size_t count;
		loff_t offset;
		handle_t *handle;
		struct inode *inode = file_inode(iocb->ki_filp);
		bool extend = false, overwrite = false, unaligned_aio = false;

		if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;
		if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
		return -EAGAIN;
		} else {
		inode_lock(inode);
		}

		if (!ext4_dio_supported(inode)) {
		inode_unlock(inode);
		/*
		* Fallback to buffered I/O if the inode does not support
		* direct I/O.
		*/
		return ext4_buffered_write_iter(iocb, from);
		}

		ret = ext4_write_checks(iocb, from);
		if (ret <= 0) {
		inode_unlock(inode);
		return ret;
		}

		/*
		* Unaligned asynchronous direct I/O must be serialized among each
		* other as the zeroing of partial blocks of two competing unaligned
		* asynchronous direct I/O writes can result in data corruption.
		*/
		offset = iocb->ki_pos;
		count = iov_iter_count(from);
		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
		!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
		unaligned_aio = true;
		inode_dio_wait(inode);
		}

		/*
		* Determine whether the I/O will overwrite allocated and initialized
		* blocks. If so, check to see whether it is possible to take the
		* dioread_nolock path.
		*/
		if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
		ext4_should_dioread_nolock(inode)) {
		overwrite = true;
		downgrade_write(&inode->i_rwsem);
		}

		if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
		ext4_journal_stop(handle);
		goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
		}

		ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
		is_sync_kiocb(iocb) \|\| unaligned_aio \|\| extend);

		if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);

		out:
		if (overwrite)
		inode_unlock_shared(inode);
		else
		inode_unlock(inode);

		if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
		return err;

		/*
		* We need to ensure that the pages within the page cache for
		* the range covered by this I/O are written to disk and
		* invalidated. This is in attempt to preserve the expected
		* direct I/O semantics in the case we fallback to buffered I/O
		* to complete off the I/O request.
		*/
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
		offset, endbyte);
		if (!err)
		invalidate_mapping_pages(iocb->ki_filp->f_mapping,
		offset >> PAGE_SHIFT,
		endbyte >> PAGE_SHIFT);
		}

		return ret;
		}

		#ifdef CONFIG_FS_DAX
		if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
		#endif
		static ssize_t
		ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
		{
		ssize_t ret;
		size_t count;
		loff_t offset;
		handle_t *handle;
		bool extend = false;
		struct inode *inode = file_inode(iocb->ki_filp);

		if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
		@@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
		if (ret <= 0)
		goto out;

		/*
		* Unaligned direct AIO must be serialized among each other as zeroing
		* of partial blocks of two competing unaligned AIOs can result in data
		* corruption.
		*/
		if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
		!is_sync_kiocb(iocb) &&
		ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
		unaligned_aio = 1;
		ext4_unwritten_wait(inode);
		}

		iocb->private = &overwrite;
		/* Check whether we do a DIO overwrite or not */
		if (o_direct && !unaligned_aio) {
		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
		if (ext4_should_dioread_nolock(inode))
		overwrite = 1;
		} else if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		offset = iocb->ki_pos;
		count = iov_iter_count(from);

		if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
		}
		}

		ret = __generic_file_write_iter(iocb, from);
		/*
		* Unaligned direct AIO must be the only IO in flight. Otherwise
		* overlapping aligned IO after unaligned might result in data
		* corruption.
		*/
		if (ret == -EIOCBQUEUED && unaligned_aio)
		ext4_unwritten_wait(inode);
		inode_unlock(inode);
		ret = ext4_orphan_add(handle, inode);
		if (ret) {
		ext4_journal_stop(handle);
		goto out;
		}

		if (ret > 0)
		ret = generic_write_sync(iocb, ret);
		extend = true;
		ext4_journal_stop(handle);
		}

		return ret;
		ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

		if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);
		out:
		inode_unlock(inode);
		if (ret > 0)
		ret = generic_write_sync(iocb, ret);
		return ret;
		}
		#endif

		static ssize_t
		ext4_file_write_iter(struct kiocb iocb, struct iov_iter from)
		{
		struct inode *inode = file_inode(iocb->ki_filp);

		if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

		#ifdef CONFIG_FS_DAX
		if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
		#endif
		if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);

		return ext4_buffered_write_iter(iocb, from);
		}

		#ifdef CONFIG_FS_DAX
		static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
		@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
		maxbytes, i_size_read(inode));
		case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
		offset = iomap_seek_hole(inode, offset,
		&ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
		case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
		offset = iomap_seek_data(inode, offset,
		&ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
		}

Admin message