Commit 7fe1e641 authored by Li Zefan's avatar Li Zefan
Browse files

Btrfs: rewrite btrfs_trim_block_group()



There are various bugs in block group trimming:

- It may trim from offset smaller than user-specified offset.
- It may trim beyond user-specified range.
- It may leak free space for extents smaller than specified minlen.
- It may truncate the last trimmed extent thus leak free space.
- With mixed extents+bitmaps, some extents may not be trimmed.
- With mixed extents+bitmaps, some bitmaps may not be trimmed (even
none will be trimmed). Even for those trimmed, not all the free space
in the bitmaps will be trimmed.

I rewrite btrfs_trim_block_group() and break it into two functions.
One is to trim extents only, and the other is to trim bitmaps only.

Before patching:

	# fstrim -v /mnt/
	/mnt/: 1496465408 bytes were trimmed

After patching:

	# fstrim -v /mnt/
	/mnt/: 2193768448 bytes were trimmed

And this matches the total free space:

	# btrfs fi df /mnt
	Data: total=3.58GB, used=1.79GB
	System, DUP: total=8.00MB, used=4.00KB
	System: total=4.00MB, used=0.00
	Metadata, DUP: total=205.12MB, used=97.14MB
	Metadata: total=8.00MB, used=0.00

Signed-off-by: default avatarLi Zefan <lizf@cn.fujitsu.com>
parent ec9ef7a1
Loading
Loading
Loading
Loading
+164 −71
Original line number Diff line number Diff line
@@ -2594,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
	cluster->block_group = NULL;
}

int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
static int do_trimming(struct btrfs_block_group_cache *block_group,
		       u64 *total_trimmed, u64 start, u64 bytes,
		       u64 reserved_start, u64 reserved_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 bytes = 0;
	u64 actually_trimmed;
	int ret = 0;
	int ret;
	int update = 0;
	u64 trimmed = 0;

	*trimmed = 0;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	if (!block_group->ro) {
		block_group->reserved += reserved_bytes;
		space_info->bytes_reserved += reserved_bytes;
		update = 1;
	}
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = btrfs_error_discard_extent(fs_info->extent_root,
					 start, bytes, &trimmed);
	if (!ret)
		*total_trimmed += trimmed;

	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);

	if (update) {
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->ro)
			space_info->bytes_readonly += reserved_bytes;
		block_group->reserved -= reserved_bytes;
		space_info->bytes_reserved -= reserved_bytes;
		spin_unlock(&space_info->lock);
		spin_unlock(&block_group->lock);
	}

	return ret;
}

static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = 0;
	u64 extent_start;
	u64 extent_bytes;
	u64 bytes;

	while (start < end) {
		spin_lock(&ctl->tree_lock);
@@ -2615,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
		}

		entry = tree_search_offset(ctl, start, 0, 1);
		if (!entry)
			entry = tree_search_offset(ctl,
						   offset_to_bitmap(ctl, start),
						   1, 1);

		if (!entry || entry->offset >= end) {
		if (!entry) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		if (entry->bitmap) {
			ret = search_bitmap(ctl, entry, &start, &bytes);
			if (!ret) {
				if (start >= end) {
		/* skip bitmaps */
		while (entry->bitmap) {
			node = rb_next(&entry->offset_index);
			if (!node) {
				spin_unlock(&ctl->tree_lock);
				goto out;
			}
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
		}

		if (entry->offset >= end) {
			spin_unlock(&ctl->tree_lock);
			break;
		}
				bytes = min(bytes, end - start);
				bitmap_clear_bits(ctl, entry, start, bytes);
				if (entry->bytes == 0)
					free_bitmap(ctl, entry);
			} else {
				start = entry->offset + BITS_PER_BITMAP *
					block_group->sectorsize;

		extent_start = entry->offset;
		extent_bytes = entry->bytes;
		start = max(start, extent_start);
		bytes = min(extent_start + extent_bytes, end) - start;
		if (bytes < minlen) {
			spin_unlock(&ctl->tree_lock);
				ret = 0;
				continue;
			goto next;
		}
		} else {
			start = entry->offset;
			bytes = min(entry->bytes, end - start);

		unlink_free_space(ctl, entry);
		kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);

		if (bytes >= minlen) {
			struct btrfs_space_info *space_info;
			int update = 0;
		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  extent_start, extent_bytes);
		if (ret)
			break;
next:
		start += bytes;

			space_info = block_group->space_info;
			spin_lock(&space_info->lock);
			spin_lock(&block_group->lock);
			if (!block_group->ro) {
				block_group->reserved += bytes;
				space_info->bytes_reserved += bytes;
				update = 1;
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);

			ret = btrfs_error_discard_extent(fs_info->extent_root,
							 start,
							 bytes,
							 &actually_trimmed);
		cond_resched();
	}
out:
	return ret;
}

			btrfs_add_free_space(block_group, start, bytes);
			if (update) {
				spin_lock(&space_info->lock);
				spin_lock(&block_group->lock);
				if (block_group->ro)
					space_info->bytes_readonly += bytes;
				block_group->reserved -= bytes;
				space_info->bytes_reserved -= bytes;
				spin_unlock(&space_info->lock);
				spin_unlock(&block_group->lock);
static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);

	while (offset < end) {
		bool next_bitmap = false;

		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		if (!entry) {
			spin_unlock(&ctl->tree_lock);
			next_bitmap = true;
			goto next;
		}

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes);
		if (ret2 || start >= end) {
			spin_unlock(&ctl->tree_lock);
			next_bitmap = true;
			goto next;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen) {
			spin_unlock(&ctl->tree_lock);
			goto next;
		}

		bitmap_clear_bits(ctl, entry, start, bytes);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes);
		if (ret)
			break;
			*trimmed += actually_trimmed;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
		} else {
			start += bytes;
		bytes = 0;
			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
				offset += BITS_PER_BITMAP * ctl->unit;
		}

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
@@ -2702,6 +2779,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	int ret;

	*trimmed = 0;

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
	if (ret)
		return ret;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);

	return ret;
}

/*
 * Find the left-most item in the cache tree, and then return the
 * smallest inode number in the item.