Commit 312b3a93 authored by Linus Torvalds
Pull btrfs fixes from David Sterba:

 - regression fix: transaction commit can run away due to delayed ref
   waiting heuristic, this is not necessary now because of the proper
   reservation mechanism introduced in 5.0

 - regression fix: potential crash due to use-before-check of an ERR_PTR
   return value

 - fix for transaction abort during transaction commit that needs to
   properly clean up pending block groups

 - fix deadlock during b-tree node/leaf splitting, when this happens on
   some of the fundamental trees, we must prevent new tree block
   allocation to re-enter indirectly via the block group flushing path

 - potential memory leak after errors during mount

* tag 'for-5.0-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: On error always free subvol_name in btrfs_mount
  btrfs: clean up pending block groups when transaction commit aborts
  btrfs: fix potential oops in device_list_add
  btrfs: don't end the transaction for delayed refs in throttle
  Btrfs: fix deadlock when allocating tree block during leaf/node split
parents 12491ed3 532b618b
Loading
Loading
Loading
Loading
+50 −28
Original line number Original line Diff line number Diff line
@@ -968,6 +968,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
	return 0;
	return 0;
}
}


static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *eb;
	bool is_fundamental_tree;

	/*
	 * Allocating a tree block may allocate a new chunk, and finishing
	 * chunk allocation can trigger btrfs_create_pending_block_groups(),
	 * which in turn modifies the extent, chunk, device and free space
	 * trees. When the COW that led us here is for a node/leaf of one of
	 * those same trees, we hold a lock on an extent buffer that the
	 * pending block group creation may need to COW — a self-deadlock.
	 * The same applies when splitting a leaf/node of those trees (we hold
	 * write locks on it and its parent) or when inserting a new root node
	 * for them. So suppress pending block group flushing for the duration
	 * of the allocation in those cases.
	 */
	is_fundamental_tree = root == fs_info->extent_root ||
			      root == fs_info->chunk_root ||
			      root == fs_info->dev_root ||
			      root == fs_info->free_space_root;
	if (is_fundamental_tree)
		trans->can_flush_pending_bgs = false;

	eb = btrfs_alloc_tree_block(trans, root, parent_start,
				    root->root_key.objectid, disk_key, level,
				    hint, empty_size);
	/* Flag defaults to true, so restore it unconditionally. */
	trans->can_flush_pending_bgs = true;

	return eb;
}

/*
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -1015,28 +1057,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;
		parent_start = parent->start;


	/*
	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
	 * If we are COWing a node/leaf from the extent, chunk, device or free
					   level, search_start, empty_size);
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
	if (IS_ERR(cow))
		return PTR_ERR(cow);
		return PTR_ERR(cow);


@@ -3345,8 +3367,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
	else
	else
		btrfs_node_key(lower, &lower_key, 0);
		btrfs_node_key(lower, &lower_key, 0);


	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
				   &lower_key, level, root->node->start, 0);
					 root->node->start, 0);
	if (IS_ERR(c))
	if (IS_ERR(c))
		return PTR_ERR(c);
		return PTR_ERR(c);


@@ -3475,8 +3497,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
	mid = (c_nritems + 1) / 2;
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);
	btrfs_node_key(c, &disk_key, mid);


	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
			&disk_key, level, c->start, 0);
					     c->start, 0);
	if (IS_ERR(split))
	if (IS_ERR(split))
		return PTR_ERR(split);
		return PTR_ERR(split);


@@ -4260,8 +4282,8 @@ again:
	else
	else
		btrfs_item_key(l, &disk_key, mid);
		btrfs_item_key(l, &disk_key, mid);


	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
			&disk_key, 0, l->start, 0);
					     l->start, 0);
	if (IS_ERR(right))
	if (IS_ERR(right))
		return PTR_ERR(right);
		return PTR_ERR(right);


+3 −0
Original line number Original line Diff line number Diff line
@@ -1621,6 +1621,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
				flags | SB_RDONLY, device_name, data);
				flags | SB_RDONLY, device_name, data);
			if (IS_ERR(mnt_root)) {
			if (IS_ERR(mnt_root)) {
				root = ERR_CAST(mnt_root);
				root = ERR_CAST(mnt_root);
				kfree(subvol_name);
				goto out;
				goto out;
			}
			}


@@ -1630,12 +1631,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
			if (error < 0) {
			if (error < 0) {
				root = ERR_PTR(error);
				root = ERR_PTR(error);
				mntput(mnt_root);
				mntput(mnt_root);
				kfree(subvol_name);
				goto out;
				goto out;
			}
			}
		}
		}
	}
	}
	if (IS_ERR(mnt_root)) {
	if (IS_ERR(mnt_root)) {
		root = ERR_CAST(mnt_root);
		root = ERR_CAST(mnt_root);
		kfree(subvol_name);
		goto out;
		goto out;
	}
	}


+16 −8
Original line number Original line Diff line number Diff line
@@ -850,14 +850,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,


	btrfs_trans_release_chunk_metadata(trans);
	btrfs_trans_release_chunk_metadata(trans);


	if (lock && should_end_transaction(trans) &&
	    READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
		spin_lock(&info->trans_lock);
		if (cur_trans->state == TRANS_STATE_RUNNING)
			cur_trans->state = TRANS_STATE_BLOCKED;
		spin_unlock(&info->trans_lock);
	}

	if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
	if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
		if (throttle)
		if (throttle)
			return btrfs_commit_transaction(trans);
			return btrfs_commit_transaction(trans);
@@ -1879,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
}


/*
 * Release reserved delayed ref space of all pending block groups of the
 * transaction and remove them from the list, used when a transaction commit
 * aborts and the pending block groups would otherwise never be flushed.
 */
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *block_group, *tmp;

	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
		/* One unit of delayed refs rsv was reserved per pending bg. */
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		list_del_init(&block_group->bg_list);
	}
}

static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
{
	/*
	/*
@@ -2270,6 +2277,7 @@ scrub_continue:
	btrfs_scrub_continue(fs_info);
	btrfs_scrub_continue(fs_info);
cleanup_transaction:
cleanup_transaction:
	btrfs_trans_release_metadata(trans);
	btrfs_trans_release_metadata(trans);
	btrfs_cleanup_pending_block_groups(trans);
	btrfs_trans_release_chunk_metadata(trans);
	btrfs_trans_release_chunk_metadata(trans);
	trans->block_rsv = NULL;
	trans->block_rsv = NULL;
	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
+2 −2
Original line number Original line Diff line number Diff line
@@ -957,11 +957,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
		else
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);


		fs_devices->fsid_change = fsid_change_in_progress;

		if (IS_ERR(fs_devices))
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);
			return ERR_CAST(fs_devices);


		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);
		list_add(&fs_devices->fs_list, &fs_uuids);