Commit 15da849c authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.5/dm-fixes' of...

Merge tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - Fix DM multipath by restoring full path selector functionality for
   bio-based configurations that don't have a SCSI device handler.

 - Fix dm-btree removal to ensure non-root btree nodes have at least
   (max_entries / 3) entries. This resolves userspace thin_check
   utility's report of "too few entries in btree_node".

 - Fix both the DM thin-provisioning and dm-clone targets to properly
   flush the data device prior to metadata commit. This resolves the
   potential for inconsistency across a power loss event when the data
   device has a volatile writeback cache.

 - Small documentation fixes to dm-clone and dm-integrity.

* tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  docs: dm-integrity: remove reference to ARC4
  dm thin: Flush data device before committing metadata
  dm thin metadata: Add support for a pre-commit callback
  dm clone: Flush destination device before committing metadata
  dm clone metadata: Use a two phase commit
  dm clone metadata: Track exact changes per transaction
  dm btree: increase rebalance threshold in __rebalance2()
  dm: add dm-clone to the documentation index
  dm mpath: remove harmful bio-based optimization
parents 22ff311a 7fc979f8
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -144,7 +144,7 @@ journal_crypt:algorithm(:key) (the key is optional)
	Encrypt the journal using given algorithm to make sure that the
	attacker can't read the journal. You can use a block cipher here
	(such as "cbc(aes)") or a stream cipher (for example "chacha20",
	"salsa20", "ctr(aes)" or "ecb(arc4)").
	"salsa20" or "ctr(aes)").

	The journal contains history of last writes to the block device,
	an attacker reading the journal could see the last sector numbers
+1 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ Device Mapper
    cache-policies
    cache
    delay
    dm-clone
    dm-crypt
    dm-dust
    dm-flakey
+99 −37
Original line number Diff line number Diff line
@@ -67,23 +67,34 @@ struct superblock_disk {
 * To save constantly doing look ups on disk we keep an in core copy of the
 * on-disk bitmap, the region_map.
 *
 * To further reduce metadata I/O overhead we use a second bitmap, the dmap
 * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
 * In order to track which regions are hydrated during a metadata transaction,
 * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
 * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
 * tracks the regions that got hydrated during the current metadata
 * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
 * the dirty_regions bitmap.
 *
 * This allows us to precisely track the regions that were hydrated during the
 * current metadata transaction and update the metadata accordingly, when we
 * commit the current transaction. This is important because dm-clone should
 * only commit the metadata of regions that were properly flushed to the
 * destination device beforehand. Otherwise, in case of a crash, we could end
 * up with a corrupted dm-clone device.
 *
 * When a region finishes hydrating dm-clone calls
 * dm_clone_set_region_hydrated(), or for discard requests
 * dm_clone_cond_set_range(), which sets the corresponding bits in region_map
 * and dmap.
 *
 * During a metadata commit we scan the dmap for dirty region_map words (longs)
 * and update accordingly the on-disk metadata. Thus, we don't have to flush to
 * disk the whole region_map. We can just flush the dirty region_map words.
 * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
 * and update the on-disk metadata accordingly. Thus, we don't have to flush to
 * disk the whole region_map. We can just flush the dirty region_map bits.
 *
 * We use a dirty bitmap, which is smaller than the original region_map, to
 * reduce the amount of memory accesses during a metadata commit. As dm-bitset
 * accesses the on-disk bitmap in 64-bit word granularity, there is no
 * significant benefit in tracking the dirty region_map bits with a smaller
 * granularity.
 * We use the helper dmap->dirty_words bitmap, which is smaller than the
 * original region_map, to reduce the amount of memory accesses during a
 * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
 * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
 * accesses.
 *
 * We could update directly the on-disk bitmap, when dm-clone calls either
 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this
@@ -92,12 +103,13 @@ struct superblock_disk {
 * e.g., in a hooked overwrite bio's completion routine, and further reduce the
 * I/O completion latency.
 *
 * We maintain two dirty bitmaps. During a metadata commit we atomically swap
 * the currently used dmap with the unused one. This allows the metadata update
 * functions to run concurrently with an ongoing commit.
 * We maintain two dirty bitmap sets. During a metadata commit we atomically
 * swap the currently used dmap with the unused one. This allows the metadata
 * update functions to run concurrently with an ongoing commit.
 */
struct dirty_map {
	/* Dirty words (longs) of the region_map; scanned during commit. */
	unsigned long *dirty_words;
	/* Regions hydrated during the current metadata transaction. */
	unsigned long *dirty_regions;
	/* Non-zero once any bit has been set in this transaction. */
	unsigned int changed;
};

@@ -115,6 +127,9 @@ struct dm_clone_metadata {
	struct dirty_map dmap[2];
	struct dirty_map *current_dmap;

	/* Protected by lock */
	struct dirty_map *committing_dmap;

	/*
	 * In core copy of the on-disk bitmap to save constantly doing look ups
	 * on disk.
@@ -461,34 +476,53 @@ static size_t bitmap_size(unsigned long nr_bits)
	return BITS_TO_LONGS(nr_bits) * sizeof(long);
}

static int dirty_map_init(struct dm_clone_metadata *cmd)
static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
			    unsigned long nr_regions)
{
	cmd->dmap[0].changed = 0;
	cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
	dmap->changed = 0;

	if (!cmd->dmap[0].dirty_words) {
		DMERR("Failed to allocate dirty bitmap");
	dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
	if (!dmap->dirty_words)
		return -ENOMEM;

	dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
	if (!dmap->dirty_regions) {
		kvfree(dmap->dirty_words);
		return -ENOMEM;
	}

	return 0;
}

/* Free the bitmaps owned by a dirty_map (kvfree(NULL) is a no-op). */
static void __dirty_map_exit(struct dirty_map *dmap)
{
	kvfree(dmap->dirty_regions);
	kvfree(dmap->dirty_words);
}

	cmd->dmap[1].changed = 0;
	cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
static int dirty_map_init(struct dm_clone_metadata *cmd)
{
	if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
		DMERR("Failed to allocate dirty bitmap");
		return -ENOMEM;
	}

	if (!cmd->dmap[1].dirty_words) {
	if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
		DMERR("Failed to allocate dirty bitmap");
		kvfree(cmd->dmap[0].dirty_words);
		__dirty_map_exit(&cmd->dmap[0]);
		return -ENOMEM;
	}

	cmd->current_dmap = &cmd->dmap[0];
	cmd->committing_dmap = NULL;

	return 0;
}

static void dirty_map_exit(struct dm_clone_metadata *cmd)
{
	kvfree(cmd->dmap[0].dirty_words);
	kvfree(cmd->dmap[1].dirty_words);
	__dirty_map_exit(&cmd->dmap[0]);
	__dirty_map_exit(&cmd->dmap[1]);
}

static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
@@ -633,21 +667,23 @@ unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd
	return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
}

static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
static int __update_metadata_word(struct dm_clone_metadata *cmd,
				  unsigned long *dirty_regions,
				  unsigned long word)
{
	int r;
	unsigned long index = word * BITS_PER_LONG;
	unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);

	while (index < max_index) {
		if (test_bit(index, cmd->region_map)) {
		if (test_bit(index, dirty_regions)) {
			r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
					      index, &cmd->bitset_root);

			if (r) {
				DMERR("dm_bitset_set_bit failed");
				return r;
			}
			__clear_bit(index, dirty_regions);
		}
		index++;
	}
@@ -721,7 +757,7 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
		if (word == cmd->nr_words)
			break;

		r = __update_metadata_word(cmd, word);
		r = __update_metadata_word(cmd, dmap->dirty_regions, word);

		if (r)
			return r;
@@ -743,15 +779,17 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
	return 0;
}

int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
{
	int r = -EPERM;
	int r = 0;
	struct dirty_map *dmap, *next_dmap;

	down_write(&cmd->lock);

	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
		r = -EPERM;
		goto out;
	}

	/* Get current dirty bitmap */
	dmap = cmd->current_dmap;
@@ -763,7 +801,7 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
	 * The last commit failed, so we don't have a clean dirty-bitmap to
	 * use.
	 */
	if (WARN_ON(next_dmap->changed)) {
	if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
		r = -EINVAL;
		goto out;
	}
@@ -773,11 +811,33 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
	cmd->current_dmap = next_dmap;
	spin_unlock_irq(&cmd->bitmap_lock);

	/*
	 * No one is accessing the old dirty bitmap anymore, so we can flush
	 * it.
	 */
	r = __flush_dmap(cmd, dmap);
	/* Set old dirty bitmap as currently committing */
	cmd->committing_dmap = dmap;
out:
	up_write(&cmd->lock);

	return r;
}

int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
{
	int r = -EPERM;

	down_write(&cmd->lock);

	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
		goto out;

	if (WARN_ON(!cmd->committing_dmap)) {
		r = -EINVAL;
		goto out;
	}

	r = __flush_dmap(cmd, cmd->committing_dmap);
	if (!r) {
		/* Clear committing dmap */
		cmd->committing_dmap = NULL;
	}
out:
	up_write(&cmd->lock);

@@ -802,6 +862,7 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
	dmap = cmd->current_dmap;

	__set_bit(word, dmap->dirty_words);
	__set_bit(region_nr, dmap->dirty_regions);
	__set_bit(region_nr, cmd->region_map);
	dmap->changed = 1;

@@ -830,6 +891,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
		if (!test_bit(region_nr, cmd->region_map)) {
			word = region_nr / BITS_PER_LONG;
			__set_bit(word, dmap->dirty_words);
			__set_bit(region_nr, dmap->dirty_regions);
			__set_bit(region_nr, cmd->region_map);
			dmap->changed = 1;
		}
+17 −0
Original line number Diff line number Diff line
@@ -75,7 +75,23 @@ void dm_clone_metadata_close(struct dm_clone_metadata *cmd);

/*
 * Commit dm-clone metadata to disk.
 *
 * We use a two phase commit:
 *
 * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
 *    committing. After this is called, all subsequent metadata updates, done
 *    through either dm_clone_set_region_hydrated() or
 *    dm_clone_cond_set_range(), will be part of the **next** transaction.
 *
 * 2. dm_clone_metadata_commit(): Actually commit the current transaction to
 *    disk and start a new transaction.
 *
 * This allows dm-clone to flush the destination device after step (1) to
 * ensure that all freshly hydrated regions, for which we are updating the
 * metadata, are properly written to non-volatile storage and won't be lost in
 * case of a crash.
 */
int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);

/*
@@ -112,6 +128,7 @@ int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
 * Switches metadata to a read only mode. Once read-only mode has been entered
 * the following functions will return -EPERM:
 *
 *   dm_clone_metadata_pre_commit()
 *   dm_clone_metadata_commit()
 *   dm_clone_set_region_hydrated()
 *   dm_clone_cond_set_range()
+46 −7
Original line number Diff line number Diff line
@@ -86,6 +86,12 @@ struct clone {

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

@@ -1108,10 +1114,13 @@ static bool need_commit_due_to_time(struct clone *clone)
/*
 * A non-zero return indicates read-only or fail mode.
 */
static int commit_metadata(struct clone *clone)
static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
	int r = 0;

	if (dest_dev_flushed)
		*dest_dev_flushed = false;

	mutex_lock(&clone->commit_lock);

	if (!dm_clone_changed_this_transaction(clone->cmd))
@@ -1122,8 +1131,26 @@ static int commit_metadata(struct clone *clone)
		goto out;
	}

	r = dm_clone_metadata_commit(clone->cmd);
	r = dm_clone_metadata_pre_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
		goto out;
	}

	bio_reset(&clone->flush_bio);
	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

	r = submit_bio_wait(&clone->flush_bio);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "flush destination device", r);
		goto out;
	}

	if (dest_dev_flushed)
		*dest_dev_flushed = true;

	r = dm_clone_metadata_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
		goto out;
@@ -1194,6 +1221,7 @@ static void process_deferred_bios(struct clone *clone)
static void process_deferred_flush_bios(struct clone *clone)
{
	struct bio *bio;
	bool dest_dev_flushed;
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio_list bio_completions = BIO_EMPTY_LIST;

@@ -1213,7 +1241,7 @@ static void process_deferred_flush_bios(struct clone *clone)
	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
		return;

	if (commit_metadata(clone)) {
	if (commit_metadata(clone, &dest_dev_flushed)) {
		bio_list_merge(&bios, &bio_completions);

		while ((bio = bio_list_pop(&bios)))
@@ -1227,9 +1255,18 @@ static void process_deferred_flush_bios(struct clone *clone)
	while ((bio = bio_list_pop(&bio_completions)))
		bio_endio(bio);

	while ((bio = bio_list_pop(&bios)))
	while ((bio = bio_list_pop(&bios))) {
		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
			/* We just flushed the destination device as part of
			 * the metadata commit, so there is no reason to send
			 * another flush.
			 */
			bio_endio(bio);
		} else {
			generic_make_request(bio);
		}
	}
}

static void do_worker(struct work_struct *work)
{
@@ -1400,7 +1437,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_metadata(clone);
			(void) commit_metadata(clone, NULL);

		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);

@@ -1834,6 +1871,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	bio_list_init(&clone->deferred_flush_completions);
	clone->hydration_offset = 0;
	atomic_set(&clone->hydrations_in_flight, 0);
	bio_init(&clone->flush_bio, NULL, 0);

	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
	if (!clone->wq) {
@@ -1907,6 +1945,7 @@ static void clone_dtr(struct dm_target *ti)
	struct clone *clone = ti->private;

	mutex_destroy(&clone->commit_lock);
	bio_uninit(&clone->flush_bio);

	for (i = 0; i < clone->nr_ctr_args; i++)
		kfree(clone->ctr_args[i]);
@@ -1961,7 +2000,7 @@ static void clone_postsuspend(struct dm_target *ti)
	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
	flush_workqueue(clone->wq);

	(void) commit_metadata(clone);
	(void) commit_metadata(clone, NULL);
}

static void clone_resume(struct dm_target *ti)
Loading