Commit 663dfbb0 authored by Filipe Manana's avatar Filipe Manana Committed by Chris Mason
Browse files

Btrfs: deal with convert_extent_bit errors to avoid fs corruption



When committing a transaction or a log, we look for btree extents that
need to be durably persisted by searching for ranges in a io tree that
have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear
those bits and set the EXTENT_NEED_WAIT bit, with calls to the function
convert_extent_bit, and then start writeback for the extents.

That function however can return an error (at the moment only -ENOMEM
is possible, specially when it does GFP_ATOMIC allocation requests
through alloc_extent_state_atomic) - that means the ranges didn't got
the EXTENT_NEED_WAIT bit set (or at least not for the whole range),
which in turn means a call to btrfs_wait_marked_extents() won't find
those ranges for which we started writeback, causing a transaction
commit or a log commit to persist a new superblock without waiting
for the writeback of extents in that range to finish first.

Therefore if a crash happens after persisting the new superblock and
before writeback finishes, we have a superblock pointing to roots that
weren't fully persisted or roots that point to nodes or leafs that weren't
fully persisted, causing all sorts of unexpected/bad behaviour as we endup
reading garbage from disk or the content of some node/leaf from a past
generation that got cowed or deleted and is no longer valid (for this later
case we end up getting error messages like "parent transid verify failed on
X wanted Y found Z" when reading btree nodes/leafs from disk).

Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent 2fc9f6ba
Loading
Loading
Loading
Loading
+76 −16
Original line number Diff line number Diff line
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
	}
}

static void clear_btree_io_tree(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);
		if (need_resched()) {
			spin_unlock(&tree->lock);
			cond_resched();
			spin_lock(&tree->lock);
		}
	}
	spin_unlock(&tree->lock);
}

static noinline void switch_commit_roots(struct btrfs_transaction *trans,
					 struct btrfs_fs_info *fs_info)
{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
		root->commit_root = btrfs_root_node(root);
		if (is_fstree(root->objectid))
			btrfs_unpin_free_ino(root);
		clear_btree_io_tree(&root->dirty_log_pages);
	}
	up_write(&fs_info->commit_root_sem);
}
@@ -828,17 +855,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root,

	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark, &cached_state)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state, GFP_NOFS);
		cached_state = NULL;
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to btrfs_wait_marked_extents() would not know that writeback
		 * for this range started and therefore wouldn't wait for it to
		 * finish - we don't want to commit a superblock that points to
		 * btree nodes/leafs for which writeback hasn't finished yet
		 * (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through clear_btree_io_tree()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		else if (wait_writeback)
			werr = filemap_fdatawait_range(mapping, start, end);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	if (err)
		werr = err;
	return werr;
}

@@ -862,8 +910,20 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,

	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT, &cached_state)) {
		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
		/*
		 * Ignore -ENOMEM errors returned by clear_extent_bit().
		 * When committing the transaction, we'll remove any entries
		 * left in the io tree. For a log commit, we don't remove them
		 * after committing the log because the tree can be accessed
		 * concurrently - we do it only at transaction commit time when
		 * it's safe to do it (through clear_btree_io_tree()).
		 */
		err = clear_extent_bit(dirty_pages, start, end,
				       EXTENT_NEED_WAIT,
				       0, 0, &cached_state, GFP_NOFS);
		if (err == -ENOMEM)
			err = 0;
		if (!err)
			err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
@@ -919,17 +979,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
	return 0;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
	int ret;

	ret = btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
	clear_btree_io_tree(&trans->transaction->dirty_pages);

	return ret;
}

/*
+0 −2
Original line number Diff line number Diff line
@@ -145,8 +145,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
					struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root);

void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);