Commit 77745c05 authored by Josef Bacik, committed by David Sterba

btrfs: migrate the dirty bg writeout code



This can be easily migrated over now.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
parent 26ce2095
+516 −0
@@ -12,6 +12,7 @@
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
@@ -2003,3 +2004,518 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
}

static int write_one_cache_group(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_root *extent_root = fs_info->extent_root;
	unsigned long bi;
	struct extent_buffer *leaf;

	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto fail;
	}

	leaf = path->nodes[0];
	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
	btrfs_mark_buffer_dirty(leaf);
fail:
	btrfs_release_path(path);
	return ret;
}

static int cache_save_setup(struct btrfs_block_group_cache *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode = NULL;
	struct extent_changeset *data_reserved = NULL;
	u64 alloc_hint = 0;
	int dcs = BTRFS_DC_ERROR;
	u64 num_pages = 0;
	int retries = 0;
	int ret = 0;

	/*
	 * If this block group is smaller than 100 megs, don't bother caching
	 * the block group.
	 */
	if (block_group->key.offset < (100 * SZ_1M)) {
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		return 0;
	}

	if (trans->aborted)
		return 0;
again:
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
		ret = PTR_ERR(inode);
		btrfs_release_path(path);
		goto out;
	}

	if (IS_ERR(inode)) {
		BUG_ON(retries);
		retries++;

		if (block_group->ro)
			goto out_free;

		ret = create_free_space_inode(trans, block_group, path);
		if (ret)
			goto out_free;
		goto again;
	}

	/*
	 * We want to set the generation to 0 so that if anything goes wrong
	 * from here on out we know not to trust this cache when we load it up
	 * next time.
	 */
	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		/*
		 * So theoretically we could recover from this by simply setting
		 * the super cache generation to 0 so we know to invalidate the
		 * cache, but then we'd have to keep track of the block groups
		 * that fail this way so we know we _have_ to reset this cache
		 * before the next commit or risk reading a stale cache.  So to
		 * limit our exposure to horrible edge cases, let's just abort
		 * the transaction; this only happens in really bad situations
		 * anyway.
		 */
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}
	WARN_ON(ret);

	/* We've already set up this transaction, go ahead and exit */
	if (block_group->cache_generation == trans->transid &&
	    i_size_read(inode)) {
		dcs = BTRFS_DC_SETUP;
		goto out_put;
	}

	if (i_size_read(inode) > 0) {
		ret = btrfs_check_trunc_cache_free_space(fs_info,
					&fs_info->global_block_rsv);
		if (ret)
			goto out_put;

		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
		if (ret)
			goto out_put;
	}

	spin_lock(&block_group->lock);
	if (block_group->cached != BTRFS_CACHE_FINISHED ||
	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
		/*
		 * Don't bother trying to write stuff out _if_:
		 * a) we're not cached,
		 * b) we're mounted with the nospace_cache option,
		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
		 */
		dcs = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		goto out_put;
	}
	spin_unlock(&block_group->lock);

	/*
	 * We hit an ENOSPC when setting up the cache in this transaction, so
	 * just skip doing the setup; we've already cleared the cache, so we're
	 * safe.
	 */
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
		ret = -ENOSPC;
		goto out_put;
	}

	/*
	 * Try to preallocate enough space based on how big the block group is.
	 * Keep in mind this has to include any pinned space which could end up
	 * taking up quite a bit since it's not folded into the other space
	 * cache.
	 */
	num_pages = div_u64(block_group->key.offset, SZ_256M);
	if (!num_pages)
		num_pages = 1;

	num_pages *= 16;
	num_pages *= PAGE_SIZE;

	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
					      num_pages, num_pages,
					      &alloc_hint);
	/*
	 * Our cache requires contiguous chunks so that we don't modify a bunch
	 * of metadata or split extents when writing the cache out, which means
	 * we can hit ENOSPC if we are heavily fragmented, in addition to the
	 * normal out-of-space conditions.  So if we hit this, just skip setting
	 * up any other block groups for this transaction; maybe we'll unpin
	 * enough space the next time around.
	 */
	if (!ret)
		dcs = BTRFS_DC_SETUP;
	else if (ret == -ENOSPC)
		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

out_put:
	iput(inode);
out_free:
	btrfs_release_path(path);
out:
	spin_lock(&block_group->lock);
	if (!ret && dcs == BTRFS_DC_SETUP)
		block_group->cache_generation = trans->transid;
	block_group->disk_cache_state = dcs;
	spin_unlock(&block_group->lock);

	extent_changeset_free(data_reserved);
	return ret;
}

int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *cache, *tmp;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_path *path;

	if (list_empty(&cur_trans->dirty_bgs) ||
	    !btrfs_test_opt(fs_info, SPACE_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Could add new block groups, use _safe just in case */
	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				 dirty_list) {
		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
			cache_save_setup(cache, trans, path);
	}

	btrfs_free_path(path);
	return 0;
}

/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS.  This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path = NULL;
	LIST_HEAD(dirty);
	struct list_head *io = &cur_trans->io_bgs;
	int num_started = 0;
	int loops = 0;

	spin_lock(&cur_trans->dirty_bgs_lock);
	if (list_empty(&cur_trans->dirty_bgs)) {
		spin_unlock(&cur_trans->dirty_bgs_lock);
		return 0;
	}
	list_splice_init(&cur_trans->dirty_bgs, &dirty);
	spin_unlock(&cur_trans->dirty_bgs_lock);

again:
	/* Make sure all the block groups on our dirty list actually exist */
	btrfs_create_pending_block_groups(trans);

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

	/*
	 * The cache_write_mutex is here only to save us from a balance or
	 * automatic removal of empty block groups deleting this block group
	 * while we are writing out the cache.
	 */
	mutex_lock(&trans->transaction->cache_write_mutex);
	while (!list_empty(&dirty)) {
		bool drop_reserve = true;

		cache = list_first_entry(&dirty,
					 struct btrfs_block_group_cache,
					 dirty_list);
		/*
		 * This can happen if something re-dirties a block group that
		 * is already under IO.  Just wait for it to finish and then do
		 * it all again
		 */
		if (!list_empty(&cache->io_list)) {
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
		}

		/*
		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
		 * it should update the cache_state.  Don't delete until after
		 * we wait.
		 *
		 * Since we're not running in the commit critical section
		 * we need the dirty_bgs_lock to protect from update_block_group
		 */
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);

		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;

				/*
				 * The cache_write_mutex is protecting the
				 * io_list; also refer to the definition of
				 * btrfs_transaction::io_bgs for more details.
				 */
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, path, cache);
			/*
			 * Our block group might still be attached to the list
			 * of new block groups in the transaction handle of some
			 * other task (struct btrfs_trans_handle->new_bgs). This
			 * means its block group item isn't yet in the extent
			 * tree. If this happens ignore the error, as we will
			 * try again later in the critical section of the
			 * transaction commit.
			 */
			if (ret == -ENOENT) {
				ret = 0;
				spin_lock(&cur_trans->dirty_bgs_lock);
				if (list_empty(&cache->dirty_list)) {
					list_add_tail(&cache->dirty_list,
						      &cur_trans->dirty_bgs);
					btrfs_get_block_group(cache);
					drop_reserve = false;
				}
				spin_unlock(&cur_trans->dirty_bgs_lock);
			} else if (ret) {
				btrfs_abort_transaction(trans, ret);
			}
		}

		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		if (drop_reserve)
			btrfs_delayed_refs_rsv_release(fs_info, 1);

		if (ret)
			break;

		/*
		 * Avoid blocking other tasks for too long. It might even save
		 * us from writing caches for block groups that are going to be
		 * removed.
		 */
		mutex_unlock(&trans->transaction->cache_write_mutex);
		mutex_lock(&trans->transaction->cache_write_mutex);
	}
	mutex_unlock(&trans->transaction->cache_write_mutex);

	/*
	 * Go through delayed refs for all the stuff we've just kicked off
	 * and then loop back (just once)
	 */
	ret = btrfs_run_delayed_refs(trans, 0);
	if (!ret && loops == 0) {
		loops++;
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&cur_trans->dirty_bgs, &dirty);
		/*
		 * dirty_bgs_lock protects us from concurrent block group
		 * deletes too (not just cache_write_mutex).
		 */
		if (!list_empty(&dirty)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			goto again;
		}
		spin_unlock(&cur_trans->dirty_bgs_lock);
	} else if (ret < 0) {
		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	}

	btrfs_free_path(path);
	return ret;
}

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path;
	struct list_head *io = &cur_trans->io_bgs;
	int num_started = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Even though we are in the critical section of the transaction commit,
	 * we can still have concurrent tasks adding elements to this
	 * transaction's list of dirty block groups. These tasks correspond to
	 * endio free space workers started when writeback finishes for a
	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
	 * allocate new block groups as a result of COWing nodes of the root
	 * tree when updating the free space inode. The writeback for the space
	 * caches is triggered by an earlier call to
	 * btrfs_start_dirty_block_groups() and iterations of the following
	 * loop.
	 * Also we want to do the cache_save_setup first and then run the
	 * delayed refs to make sure we have the best chance at doing this all
	 * in one shot.
	 */
	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group_cache,
					 dirty_list);

		/*
		 * This can happen if cache_save_setup re-dirties a block group
		 * that is already under IO.  Just wait for it to finish and
		 * then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * Don't remove from the dirty list until after we've waited on
		 * any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans,
						     (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry.  This is a
			 * very rare case, so there's no need for a more
			 * efficient and complex approach.
			 */
			if (ret == -ENOENT) {
				wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
				ret = write_one_cache_group(trans, path, cache);
			}
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}

		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of the io_bgs member for details on why it's
	 * safe to use it without any locking.
	 */
	while (!list_empty(io)) {
		cache = list_first_entry(io, struct btrfs_block_group_cache,
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(trans, cache, path);
		btrfs_put_block_group(cache);
	}

	btrfs_free_path(path);
	return ret;
}
+3 −0
@@ -189,6 +189,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);

static inline int btrfs_block_group_cache_done(
		struct btrfs_block_group_cache *cache)
+0 −3
@@ -2523,9 +2523,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref);

int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+0 −518

Changes to this file collapsed (preview size limit exceeded).