Commit 81a046b1 authored by Linus Torvalds
Pull btrfs updates from David Sterba:
 "Features, highlights:

   - async discard
       - "mount -o discard=async" to enable it
       - freed extents are not discarded immediately, but grouped
         together and trimmed later, with IO rate limiting
       - the "sync" mode submits short extents that could have been
         ignored completely by the device, for SATA prior to 3.1 the
         requests are unqueued and have a big impact on performance
       - the actual discard IO requests have been moved out of
         transaction commit to a worker thread, improving commit latency
       - IO rate and request size can be tuned by sysfs files, for now
         enabled only with CONFIG_BTRFS_DEBUG as we might need to
         add/delete the files and don't have a stable-ish ABI for
         general use, defaults are conservative
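
     As a minimal userspace sketch (the device and mount point below are
     made up; only the "discard=async" option string comes from this
     pull), enabling the mode via mount(2) could look like:

         #include <stdio.h>
         #include <sys/mount.h>

         int main(void)
         {
                 /* discard=async is passed as an fs-specific option */
                 if (mount("/dev/sdb1", "/mnt/btrfs", "btrfs", 0,
                           "discard=async") != 0) {
                         perror("mount");
                         return 1;
                 }
                 return 0;
         }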

   - export device state info in sysfs, eg. missing, writeable

   - no discard of extents known to be untouched on disk (eg. after
     reservation)

   - device stats reset is logged with process name and PID that called
     the ioctl
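
     For context, a hedged sketch of the ioctl whose reset path now logs
     the caller; struct and flag names are from linux/btrfs.h, while the
     mount point and device id are assumptions:

         #include <fcntl.h>
         #include <stdio.h>
         #include <string.h>
         #include <sys/ioctl.h>
         #include <linux/btrfs.h>

         int main(void)
         {
                 struct btrfs_ioctl_get_dev_stats stats;
                 int fd = open("/mnt/btrfs", O_RDONLY);

                 if (fd < 0) {
                         perror("open");
                         return 1;
                 }
                 memset(&stats, 0, sizeof(stats));
                 stats.devid = 1;                  /* assumed devid */
                 stats.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
                 /* this reset is what the kernel now logs with PID */
                 stats.flags = BTRFS_DEV_STATS_RESET;
                 if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats) != 0)
                         perror("BTRFS_IOC_GET_DEV_STATS");
                 return 0;
         }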

  Fixes:

   - fix missing hole after hole punching and fsync when using NO_HOLES

   - writeback: range cyclic mode could miss some dirty pages and lead
     to OOM

   - two more corner cases for metadata_uuid change after power loss
     during the change

   - fix infinite loop during fsync after mix of rename operations

  Core changes:

   - qgroup assign returns ENOTCONN when quotas are not enabled; it used
     to return EINVAL, which was confusing
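
     A hedged userspace sketch of the visible change; the ioctl and
     struct are from linux/btrfs.h, the qgroup ids and path are
     assumptions:

         #include <errno.h>
         #include <fcntl.h>
         #include <stdio.h>
         #include <string.h>
         #include <sys/ioctl.h>
         #include <linux/btrfs.h>

         int main(void)
         {
                 struct btrfs_ioctl_qgroup_assign_args args;
                 int fd = open("/mnt/btrfs", O_RDONLY);

                 if (fd < 0) {
                         perror("open");
                         return 1;
                 }
                 memset(&args, 0, sizeof(args));
                 args.assign = 1;                /* 1 = assign, 0 = remove */
                 args.src = 257;                 /* assumed child qgroup */
                 args.dst = (1ULL << 48) | 1;    /* assumed parent 1/1 */
                 if (ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &args) != 0) {
                         if (errno == ENOTCONN) /* formerly EINVAL */
                                 fprintf(stderr, "quotas not enabled\n");
                         else
                                 perror("BTRFS_IOC_QGROUP_ASSIGN");
                 }
                 return 0;
         }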

   - device closing does not need to allocate memory anymore

   - snapshot-aware defrag code was removed; it had been disabled for
     years due to performance problems, and a reimplementation will allow
     selecting whether defrag breaks COW on shared extents

   - tree-checker:
       - check leaf chunk item size, cross check against number of
         stripes
       - verify location keys for DIR_ITEM, DIR_INDEX and XATTR items
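
     The chunk item size check relies on the fixed layout relation
     between a chunk item and its trailing stripes; a small sketch of
     that relation (types from the UAPI header, helper name made up):

         #include <stdio.h>
         #include <linux/btrfs_tree.h> /* btrfs_chunk, btrfs_stripe */

         /* one stripe is embedded in btrfs_chunk, the rest follow it */
         static unsigned long chunk_item_size(int num_stripes)
         {
                 return sizeof(struct btrfs_chunk) +
                        sizeof(struct btrfs_stripe) * (num_stripes - 1);
         }

         int main(void)
         {
                 /* a leaf item whose size disagrees with its stripe
                    count can now be rejected by the tree-checker */
                 printf("2 stripes -> %lu bytes\n", chunk_item_size(2));
                 return 0;
         }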

   - new self test for physical -> logical mapping code, used for super
     block range exclusion

   - assertion helpers/macros updated to avoid objtool "unreachable
     code" reports on older compilers or config option combinations"

* tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (84 commits)
  btrfs: free block groups after free'ing fs trees
  btrfs: Fix split-brain handling when changing FSID to metadata uuid
  btrfs: Handle another split brain scenario with metadata uuid feature
  btrfs: Factor out metadata_uuid code from find_fsid.
  btrfs: Call find_fsid from find_fsid_inprogress
  Btrfs: fix infinite loop during fsync after rename operations
  btrfs: set trans->drity in btrfs_commit_transaction
  btrfs: drop log root for dropped roots
  btrfs: sysfs, add devid/dev_state kobject and device attributes
  btrfs: Refactor btrfs_rmap_block to improve readability
  btrfs: Add self-tests for btrfs_rmap_block
  btrfs: selftests: Add support for dummy devices
  btrfs: Move and unexport btrfs_rmap_block
  btrfs: separate definition of assertion failure handlers
  btrfs: device stats, log when stats are zeroed
  btrfs: fix improper setting of scanned for range cyclic write cache pages
  btrfs: safely advance counter when looking up bio csums
  btrfs: remove unused member btrfs_device::work
  btrfs: remove unnecessary wrapper get_alloc_profile
  btrfs: add correction to handle -1 edge case in async discard
  ...
parents 511fdb78 4e19443d
fs/btrfs/Makefile +1 −1
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
-	   block-rsv.o delalloc-space.o block-group.o
+	   block-rsv.o delalloc-space.o block-group.o discard.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
fs/btrfs/block-group.c +181 −31
@@ -14,6 +14,8 @@
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
	return extended_to_chunk(flags | allowed);
}

-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
	return btrfs_reduce_alloc_profile(fs_info, flags);
}

-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
-	return get_alloc_profile(fs_info, orig_flags);
-}
-
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	atomic_inc(&cache->count);
@@ -131,6 +128,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

+		/*
+		 * A block_group shouldn't be on the discard_list anymore.
+		 * Remove the block_group from the discard_list to prevent us
+		 * from causing a panic due to NULL pointer dereference.
+		 */
+		if (WARN_ON(!list_empty(&cache->discard_list)))
+			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+						  cache);
+
		/*
		 * If not empty, someone is still holding mutex of
		 * full_stripe_lock, which can only be released by caller.
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
-			ret = btrfs_add_free_space(block_group, start,
-						   size);
+			ret = btrfs_add_free_space_async_trimmed(block_group,
+								 start, size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
	if (start < end) {
		size = end - start;
		total_added += size;
-		ret = btrfs_add_free_space(block_group, start, size);
+		ret = btrfs_add_free_space_async_trimmed(block_group, start,
+							 size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

@@ -1185,21 +1192,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	u64 sinfo_used;
-	u64 min_allocable_bytes;
	int ret = -ENOSPC;

-	/*
-	 * We need some metadata space and system metadata space for
-	 * allocating chunks in some corner cases until we force to set
-	 * it to be readonly.
-	 */
-	if ((sinfo->flags &
-	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
-	    !force)
-		min_allocable_bytes = SZ_1M;
-	else
-		min_allocable_bytes = 0;
-
	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

@@ -1217,10 +1211,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
	 *
	 * Here we make sure if we mark this bg RO, we still have enough
-	 * free space as buffer (if min_allocable_bytes is not 0).
+	 * free space as buffer.
	 */
-	if (sinfo_used + num_bytes + min_allocable_bytes <=
-	    sinfo->total_bytes) {
+	if (sinfo_used + num_bytes <= sinfo->total_bytes) {
		sinfo->bytes_readonly += num_bytes;
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -1233,8 +1226,8 @@ out:
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_info(cache->fs_info,
			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
			sinfo_used, num_bytes, min_allocable_bytes);
			"sinfo_used=%llu bg_num_bytes=%llu",
			sinfo_used, num_bytes);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
@@ -1249,6 +1242,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
+	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1266,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		}
		spin_unlock(&fs_info->unused_bgs_lock);

+		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
		mutex_lock(&fs_info->delete_unused_bgs_mutex);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

+		/*
+		 * Async discard moves the final block group discard to be prior
+		 * to the unused_bgs code path.  Therefore, if it's not fully
+		 * trimmed, punt it back to the async discard lists.
+		 */
+		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+		    !btrfs_is_free_space_trimmed(block_group)) {
+			trace_btrfs_skip_unused_block_group(block_group);
+			up_write(&space_info->groups_sem);
+			/* Requeue if we failed because of async discard */
+			btrfs_discard_queue_work(&fs_info->discard_ctl,
+						 block_group);
+			goto next;
+		}
+
		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
@@ -1347,6 +1359,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

+		/*
+		 * At this point, the block_group is read only and should fail
+		 * new allocations.  However, btrfs_finish_extent_commit() can
+		 * cause this block_group to be placed back on the discard
+		 * lists because now the block_group isn't fully discarded.
+		 * Bail here and try again later after discarding everything.
+		 */
+		spin_lock(&fs_info->discard_ctl.lock);
+		if (!list_empty(&block_group->discard_list)) {
+			spin_unlock(&fs_info->discard_ctl.lock);
+			btrfs_dec_block_group_ro(block_group);
+			btrfs_discard_queue_work(&fs_info->discard_ctl,
+						 block_group);
+			goto end_trans;
+		}
+		spin_unlock(&fs_info->discard_ctl.lock);
+
		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
@@ -1362,8 +1391,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

+		/*
+		 * The normal path here is an unused block group is passed here,
+		 * then trimming is handled in the transaction commit path.
+		 * Async discard interposes before this to do the trimming
+		 * before coming down the unused block group path as trimming
+		 * will no longer be done later in the transaction commit path.
+		 */
+		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+			goto flip_async;
+
		/* DISCARD can flip during remount */
-		trimming = btrfs_test_opt(fs_info, DISCARD);
+		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);

		/* Implicit trim during transaction commit. */
		if (trimming)
@@ -1406,6 +1445,13 @@ next:
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
+	return;
+
+flip_async:
+	btrfs_end_transaction(trans);
+	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+	btrfs_put_block_group(block_group);
+	btrfs_discard_punt_unused_bgs_list(fs_info);
}

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1562,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
	write_sequnlock(&fs_info->profiles_lock);
}

+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start:   logical address of block group
+ * @physical:	   physical address to map to logical addresses
+ * @logical:	   return array of logical addresses which map to @physical
+ * @naddrs:	   length of @logical
+ * @stripe_len:    size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 data_stripe_length;
+	u64 io_stripe_size;
+	int i, nr = 0;
+	int ret = 0;
+
+	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+	if (IS_ERR(em))
+		return -EIO;
+
+	map = em->map_lookup;
+	data_stripe_length = em->len;
+	io_stripe_size = map->stripe_len;
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		data_stripe_length = div_u64(data_stripe_length,
+					     map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		data_stripe_length = div_u64(data_stripe_length,
+					     nr_data_stripes(map));
+		io_stripe_size = map->stripe_len * nr_data_stripes(map);
+	}
+
+	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		bool already_inserted = false;
+		u64 stripe_nr;
+		int j;
+
+		if (!in_range(physical, map->stripes[i].physical,
+			      data_stripe_length))
+			continue;
+
+		stripe_nr = physical - map->stripes[i].physical;
+		stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		}
+		/*
+		 * The remaining case would be for RAID56, multiply by
+		 * nr_data_stripes().  Alternatively, just use rmap_len below
+		 * instead of map->stripe_len
+		 */
+
+		bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+		/* Ensure we don't add duplicate addresses */
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr) {
+				already_inserted = true;
+				break;
+			}
+		}
+
+		if (!already_inserted)
+			buf[nr++] = bytenr;
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = io_stripe_size;
+out:
+	free_extent_map(em);
+	return ret;
+}
+
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,6 +1752,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
	set_free_space_tree_thresholds(cache);

+	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	init_rwsem(&cache->data_rwsem);
@@ -1617,6 +1761,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->bg_list);
	INIT_LIST_HEAD(&cache->ro_list);
+	INIT_LIST_HEAD(&cache->discard_list);
	INIT_LIST_HEAD(&cache->dirty_list);
	INIT_LIST_HEAD(&cache->io_list);
	btrfs_init_free_space_ctl(cache);
@@ -1775,6 +1920,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
		inc_block_group_ro(cache, 1);
	} else if (cache->used == 0) {
		ASSERT(list_empty(&cache->bg_list));
+		if (btrfs_test_opt(info, DISCARD_ASYNC))
+			btrfs_discard_queue_work(&info->discard_ctl, cache);
+		else
			btrfs_mark_bg_unused(cache);
	}
	return 0;
@@ -2738,8 +2886,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
		 * dirty list to avoid races between cleaner kthread and space
		 * cache writeout.
		 */
-		if (!alloc && old_val == 0)
+		if (!alloc && old_val == 0) {
+			if (!btrfs_test_opt(info, DISCARD_ASYNC))
				btrfs_mark_bg_unused(cache);
+		}

		btrfs_put_block_group(cache);
		total -= num_bytes;
fs/btrfs/block-group.h +40 −0
@@ -12,6 +12,19 @@ enum btrfs_disk_cache_state {
	BTRFS_DC_SETUP,
};

+/*
+ * This describes the state of the block_group for async discard.  This is due
+ * to the two pass nature of it where extent discarding is prioritized over
+ * bitmap discarding.  BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
+ * between lists to prevent contention for discard state variables
+ * (eg. discard_cursor).
+ */
+enum btrfs_discard_state {
+	BTRFS_DISCARD_EXTENTS,
+	BTRFS_DISCARD_BITMAPS,
+	BTRFS_DISCARD_RESET_CURSOR,
+};
+
/*
 * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
 * only allocate a chunk if we really need one.
@@ -116,7 +129,13 @@ struct btrfs_block_group {
	/* For read-only block groups */
	struct list_head ro_list;

+	/* For discard operations */
	atomic_t trimming;
+	struct list_head discard_list;
+	int discard_index;
+	u64 discard_eligible_time;
+	u64 discard_cursor;
+	enum btrfs_discard_state discard_state;

	/* For dirty block groups */
	struct list_head dirty_list;
@@ -158,6 +177,22 @@ struct btrfs_block_group {
	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};

+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+	return (block_group->start + block_group->length);
+}
+
+static inline bool btrfs_is_block_group_data_only(
+					struct btrfs_block_group *block_group)
+{
+	/*
+	 * In mixed mode the fragmentation is expected to be high, lowering the
+	 * efficiency, so only proper data block groups are considered.
+	 */
+	return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+	       !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
		struct btrfs_block_group *block_group)
@@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
		cache->cached == BTRFS_CACHE_ERROR;
}

+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+		     u64 physical, u64 **logical, int *naddrs, int *stripe_len);
+#endif
+
#endif /* BTRFS_BLOCK_GROUP_H */
fs/btrfs/check-integrity.c +1 −3
@@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
static int btrfsic_process_superblock(struct btrfsic_state *state,
				      struct btrfs_fs_devices *fs_devices)
{
-	struct btrfs_fs_info *fs_info = state->fs_info;
	struct btrfs_super_block *selected_super;
	struct list_head *dev_head = &fs_devices->devices;
	struct btrfs_device *device;
@@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
	int ret = 0;
	int pass;

-	BUG_ON(NULL == state);
	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
	if (NULL == selected_super) {
		pr_info("btrfsic: error, kmalloc failed!\n");
@@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
			break;
		}

-		num_copies = btrfs_num_copies(fs_info, next_bytenr,
+		num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
					      state->metablock_size);
		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			pr_info("num_copies(log_bytenr=%llu) = %d\n",
fs/btrfs/compression.c +2 −2
@@ -763,7 +763,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
				ret = btrfs_lookup_bio_sums(inode, comp_bio,
-							    sums);
+							    (u64)-1, sums);
				BUG_ON(ret); /* -ENOMEM */
			}

@@ -791,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	BUG_ON(ret); /* -ENOMEM */

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-		ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+		ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
		BUG_ON(ret); /* -ENOMEM */
	}
