Commit afb585a9 authored by Mauricio Faria de Oliveira's avatar Mauricio Faria de Oliveira Committed by Theodore Ts'o
Browse files

ext4: data=journal: write-protect pages on j_submit_inode_data_buffers()



This implements journal callbacks j_submit|finish_inode_data_buffers()
with different behavior for data=journal: to write-protect pages under
commit, preventing changes to buffers writeably mapped to userspace.

If a buffer's content changes between commit's checksum calculation
and write-out to disk, it can cause journal recovery/mount failures
upon a kernel crash or power loss.

    [   27.334874] EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!
    [   27.339492] JBD2: Invalid checksum recovering data block 8705 in log
    [   27.342716] JBD2: recovery failed
    [   27.343316] EXT4-fs (loop0): error loading journal
    mount: /ext4: can't read superblock on /dev/loop0.

In j_submit_inode_data_buffers() we write-protect the inode's pages
with write_cache_pages() and redirty w/ writepage callback if needed.

In j_finish_inode_data_buffers() there is nothing do to.

And in order to use the callbacks, inodes are added to the inode list
in transaction in __ext4_journalled_writepage() and ext4_page_mkwrite().

In ext4_page_mkwrite() we must make sure that the buffers are attached
to the transaction as jbddirty with write_end_fn(), as already done in
__ext4_journalled_writepage().

Signed-off-by: default avatarMauricio Faria de Oliveira <mfo@canonical.com>
Reported-by: default avatarDann Frazier <dann.frazier@canonical.com>
Reported-by: kernel test robot <lkp@intel.com> # wbc.nr_to_write
Suggested-by: default avatarJan Kara <jack@suse.cz>
Reviewed-by: default avatarJan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20201006004841.600488-5-mfo@canonical.com


Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 64a9f144
Loading
Loading
Loading
Loading
+16 −9
Original line number Diff line number Diff line
@@ -1911,6 +1911,9 @@ static int __ext4_journalled_writepage(struct page *page,
		err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
					     write_end_fn);
	}
	if (ret == 0)
		ret = err;
	err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
	if (ret == 0)
		ret = err;
	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -6062,10 +6065,8 @@ retry_alloc:
		size = i_size_read(inode);
		/* Page got truncated from under us? */
		if (page->mapping != mapping || page_offset(page) > size) {
			unlock_page(page);
			ret = VM_FAULT_NOPAGE;
			ext4_journal_stop(handle);
			goto out;
			goto out_error;
		}

		if (page->index == size >> PAGE_SHIFT)
@@ -6075,13 +6076,15 @@ retry_alloc:

		err = __block_write_begin(page, 0, len, ext4_get_block);
		if (!err) {
			if (ext4_walk_page_buffers(handle, page_buffers(page),
					0, len, NULL, do_journal_get_write_access)) {
				unlock_page(page);
			ret = VM_FAULT_SIGBUS;
				ext4_journal_stop(handle);
				goto out;
			}
			if (ext4_walk_page_buffers(handle, page_buffers(page),
					0, len, NULL, do_journal_get_write_access))
				goto out_error;
			if (ext4_walk_page_buffers(handle, page_buffers(page),
					0, len, NULL, write_end_fn))
				goto out_error;
			if (ext4_jbd2_inode_add_write(handle, inode, 0, len))
				goto out_error;
			ext4_set_inode_state(inode, EXT4_STATE_JDATA);
		} else {
			unlock_page(page);
@@ -6096,6 +6099,10 @@ out:
	up_read(&EXT4_I(inode)->i_mmap_sem);
	sb_end_pagefault(inode->i_sb);
	return ret;
out_error:
	unlock_page(page);
	ext4_journal_stop(handle);
	goto out;
}

vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
+85 −2
Original line number Diff line number Diff line
@@ -571,6 +571,89 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
	spin_unlock(&sbi->s_md_lock);
}

/*
 * This writepage callback for write_cache_pages()
 * takes care of a few cases after page cleaning.
 *
 * write_cache_pages() already checks for dirty pages
 * and calls clear_page_dirty_for_io(), which we want,
 * to write protect the pages.
 *
 * However, we may have to redirty a page (see below.)
 */
static int ext4_journalled_writepage_callback(struct page *page,
					      struct writeback_control *wbc,
					      void *data)
{
	transaction_t *transaction = (transaction_t *) data;
	struct buffer_head *bh, *head;
	struct journal_head *jh;

	bh = head = page_buffers(page);
	do {
		/*
		 * We have to redirty a page in these cases:
		 * 1) If buffer is dirty, it means the page was dirty because it
		 * contains a buffer that needs checkpointing. So the dirty bit
		 * needs to be preserved so that checkpointing writes the buffer
		 * properly.
		 * 2) If buffer is not part of the committing transaction
		 * (we may have just accidentally come across this buffer because
		 * inode range tracking is not exact) or if the currently running
		 * transaction already contains this buffer as well, dirty bit
		 * needs to be preserved so that the buffer gets writeprotected
		 * properly on running transaction's commit.
		 */
		jh = bh2jh(bh);
		if (buffer_dirty(bh) ||
		    (jh && (jh->b_transaction != transaction ||
			    jh->b_next_transaction))) {
			redirty_page_for_writepage(wbc, page);
			goto out;
		}
	} while ((bh = bh->b_this_page) != head);

out:
	return AOP_WRITEPAGE_ACTIVATE;
}

static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
        };

	return write_cache_pages(mapping, &wbc,
				 ext4_journalled_writepage_callback,
				 jinode->i_transaction);
}

static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	int ret;

	if (ext4_should_journal_data(jinode->i_vfs_inode))
		ret = ext4_journalled_submit_inode_data_buffers(jinode);
	else
		ret = jbd2_journal_submit_inode_data_buffers(jinode);

	return ret;
}

static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	int ret = 0;

	if (!ext4_should_journal_data(jinode->i_vfs_inode))
		ret = jbd2_journal_finish_inode_data_buffers(jinode);

	return ret;
}

static bool system_going_down(void)
{
	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
@@ -4753,9 +4836,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)

	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
	sbi->s_journal->j_submit_inode_data_buffers =
		jbd2_journal_submit_inode_data_buffers;
		ext4_journal_submit_inode_data_buffers;
	sbi->s_journal->j_finish_inode_data_buffers =
		jbd2_journal_finish_inode_data_buffers;
		ext4_journal_finish_inode_data_buffers;

no_journal:
	if (!test_opt(sb, NO_MBCACHE)) {