Commit a0efc03b authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-4.19/dm-fixes' of...

Merge tag 'for-4.19/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - DM verity fix for crash due to using vmalloc'd buffers with the
   asynchronous crypto hash API.

 - Fix to both DM crypt and DM integrity targets to discontinue using
   CRYPTO_TFM_REQ_MAY_SLEEP because its use of GFP_KERNEL can lead to
   deadlock by recursing back into a filesystem.

 - Various DM raid fixes related to reshape and rebuild races.

 - Fix for DM thin-provisioning to avoid data corruption that was a
   side-effect of needing to abort DM thin metadata transaction due to
   running out of metadata space. Fix is to reserve a small amount of
   metadata space so that once it is used the DM thin-pool can finish
   its active transaction before switching to read-only mode.

* tag 'for-4.19/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm thin metadata: try to avoid ever aborting transactions
  dm raid: bump target version, update comments and documentation
  dm raid: fix RAID leg rebuild errors
  dm raid: fix rebuild of specific devices by updating superblock
  dm raid: fix stripe adding reshape deadlock
  dm raid: fix reshape race on small devices
  dm: disable CRYPTO_TFM_REQ_MAY_SLEEP to fix a GFP_KERNEL recursion deadlock
  dm verity: fix crash on bufio buffer that was allocated with vmalloc
parents 0f9aeeac 3ab91828
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -348,3 +348,7 @@ Version History
1.13.1  Fix deadlock caused by early md_stop_writes().  Also fix size and
	state races.
1.13.2  Fix raid redundancy validation and avoid keeping raid set frozen
1.14.0  Fix reshape race on small devices.  Fix stripe adding reshape
	deadlock/potential data corruption.  Update superblock when
	specific devices are requested via rebuild.  Fix RAID leg
	rebuild errors.
+5 −5
Original line number Diff line number Diff line
@@ -332,7 +332,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
	int err;

	desc->tfm = essiv->hash_tfm;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	desc->flags = 0;

	err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
	shash_desc_zero(desc);
@@ -606,7 +606,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
	int i, r;

	desc->tfm = lmk->hash_tfm;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	desc->flags = 0;

	r = crypto_shash_init(desc);
	if (r)
@@ -768,7 +768,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,

	/* calculate crc32 for every 32bit part and xor it */
	desc->tfm = tcw->crc32_tfm;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	desc->flags = 0;
	for (i = 0; i < 4; i++) {
		r = crypto_shash_init(desc);
		if (r)
@@ -1251,7 +1251,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
	 * requests if driver request queue is full.
	 */
	skcipher_request_set_callback(ctx->r.req,
	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
	    CRYPTO_TFM_REQ_MAY_BACKLOG,
	    kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
}

@@ -1268,7 +1268,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
	 * requests if driver request queue is full.
	 */
	aead_request_set_callback(ctx->r.req_aead,
	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
	    CRYPTO_TFM_REQ_MAY_BACKLOG,
	    kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
}

+2 −2
Original line number Diff line number Diff line
@@ -532,7 +532,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
	unsigned j, size;

	desc->tfm = ic->journal_mac;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	desc->flags = 0;

	r = crypto_shash_init(desc);
	if (unlikely(r)) {
@@ -676,7 +676,7 @@ static void complete_journal_encrypt(struct crypto_async_request *req, int err)
static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
{
	int r;
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				      complete_journal_encrypt, comp);
	if (likely(encrypt))
		r = crypto_skcipher_encrypt(req);
+61 −93
Original line number Diff line number Diff line
/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
 * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
@@ -29,9 +29,6 @@
 */
#define	MIN_RAID456_JOURNAL_SPACE (4*2048)

/* Global list of all raid sets */
static LIST_HEAD(raid_sets);

static bool devices_handle_discard_safely = false;

/*
@@ -227,7 +224,6 @@ struct rs_layout {

struct raid_set {
	struct dm_target *ti;
	struct list_head list;

	uint32_t stripe_cache_entries;
	unsigned long ctr_flags;
@@ -273,19 +269,6 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
	mddev->new_chunk_sectors = l->new_chunk_sectors;
}

/* Find any raid_set in active slot for @rs on global list */
static struct raid_set *rs_find_active(struct raid_set *rs)
{
	struct raid_set *r;
	struct mapped_device *md = dm_table_get_md(rs->ti->table);

	list_for_each_entry(r, &raid_sets, list)
		if (r != rs && dm_table_get_md(r->ti->table) == md)
			return r;

	return NULL;
}

/* raid10 algorithms (i.e. formats) */
#define	ALGORITHM_RAID10_DEFAULT	0
#define	ALGORITHM_RAID10_NEAR		1
@@ -764,7 +747,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r

	mddev_init(&rs->md);

	INIT_LIST_HEAD(&rs->list);
	rs->raid_disks = raid_devs;
	rs->delta_disks = 0;

@@ -782,9 +764,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/* Add @rs to global list. */
	list_add(&rs->list, &raid_sets);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
@@ -797,7 +776,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
	return rs;
}

/* Free all @rs allocations and remove it from global list. */
/* Free all @rs allocations */
static void raid_set_free(struct raid_set *rs)
{
	int i;
@@ -815,8 +794,6 @@ static void raid_set_free(struct raid_set *rs)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	list_del(&rs->list);

	kfree(rs);
}

@@ -2649,7 +2626,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
		return 0;
	}

	/* HM FIXME: get InSync raid_dev? */
	/* HM FIXME: get In_Sync raid_dev? */
	rdev = &rs->dev[0].rdev;

	if (rs->delta_disks < 0) {
@@ -3149,6 +3126,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
		rs_set_new(rs);
	} else if (rs_is_recovering(rs)) {
		/* Rebuild particular devices */
		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
			rs_setup_recovery(rs, MaxSector);
		}
		/* A recovering raid set may be resized */
		; /* skip setup rs */
	} else if (rs_is_reshaping(rs)) {
@@ -3242,6 +3224,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	/* Start raid set read-only and assumed clean to change in raid_resume() */
	rs->md.ro = 1;
	rs->md.in_sync = 1;

	/* Keep array frozen */
	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);

	/* Has to be held on running the array */
@@ -3265,7 +3249,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	/* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */
	/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
	if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
		r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
		if (r) {
@@ -3350,32 +3334,53 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
	return DM_MAPIO_SUBMITTED;
}

/* Return string describing the current sync action of @mddev */
static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery)
/* Sync states as derived by decipher_sync_action() */
enum sync_state { st_frozen, st_reshape, st_resync, st_check, st_repair, st_recover, st_idle };

/*
 * Return the sync state string for @state, or "undef" if @state lies
 * outside the range covered by the string table.
 */
static const char *sync_str(enum sync_state state)
{
	/*
	 * Indexed by enum sync_state via designated initializers, so the
	 * table stays correct even if the enum order is changed.
	 */
	static const char * const sync_strs[] = {
		[st_frozen]  = "frozen",
		[st_reshape] = "reshape",
		[st_resync]  = "resync",
		[st_check]   = "check",
		[st_repair]  = "repair",
		[st_recover] = "recover",
		[st_idle]    = "idle",
	};

	return __within_range(state, 0, ARRAY_SIZE(sync_strs) - 1) ? sync_strs[state] : "undef";
}

/* Return enum sync_state for @mddev derived from @recovery flags */
static const enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery)
{
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
		return "frozen";
		return st_frozen;

	/* The MD sync thread can be done with io but still be running */
	/* The MD sync thread can be done with io or be interrupted but still be running */
	if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
	    (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	     (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
			return "reshape";
			return st_reshape;

		if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
				return "resync";
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
				return "check";
			return "repair";
				return st_resync;
			if (test_bit(MD_RECOVERY_CHECK, &recovery))
				return st_check;
			return st_repair;
		}

		if (test_bit(MD_RECOVERY_RECOVER, &recovery))
			return "recover";
			return st_recover;

		if (mddev->reshape_position != MaxSector)
			return st_reshape;
	}

	return "idle";
	return st_idle;
}

/*
@@ -3409,6 +3414,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
				sector_t resync_max_sectors)
{
	sector_t r;
	enum sync_state state;
	struct mddev *mddev = &rs->md;

	clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
@@ -3419,20 +3425,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
		set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);

	} else {
		if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) &&
		    !test_bit(MD_RECOVERY_INTR, &recovery) &&
		    (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
		     test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
		     test_bit(MD_RECOVERY_RUNNING, &recovery)))
			r = mddev->curr_resync_completed;
		else
		state = decipher_sync_action(mddev, recovery);

		if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
			r = mddev->recovery_cp;
		else
			r = mddev->curr_resync_completed;

		if (r >= resync_max_sectors &&
		    (!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
		     (!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
		      !test_bit(MD_RECOVERY_NEEDED, &recovery) &&
		      !test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
		if (state == st_idle && r >= resync_max_sectors) {
			/*
			 * Sync complete.
			 */
@@ -3440,24 +3440,20 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
			if (test_bit(MD_RECOVERY_RECOVER, &recovery))
				set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);

		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) {
		} else if (state == st_recover)
			/*
			 * In case we are recovering, the array is not in sync
			 * and health chars should show the recovering legs.
			 */
			;

		} else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
		else if (state == st_resync)
			/*
			 * If "resync" is occurring, the raid set
			 * is or may be out of sync hence the health
			 * characters shall be 'a'.
			 */
			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);

		} else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
		else if (state == st_reshape)
			/*
			 * If "reshape" is occurring, the raid set
			 * is or may be out of sync hence the health
@@ -3465,7 +3461,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
			 */
			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);

		} else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
		else if (state == st_check || state == st_repair)
			/*
			 * If "check" or "repair" is occurring, the raid set has
			 * undergone an initial sync and the health characters
@@ -3473,12 +3469,12 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
			 */
			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);

		} else {
		else {
			struct md_rdev *rdev;

			/*
			 * We are idle and recovery is needed, prevent 'A' chars race
			 * caused by components still set to in-sync by constrcuctor.
			 * caused by components still set to in-sync by constructor.
			 */
			if (test_bit(MD_RECOVERY_NEEDED, &recovery))
				set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
@@ -3542,7 +3538,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
		progress = rs_get_progress(rs, recovery, resync_max_sectors);
		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
				    atomic64_read(&mddev->resync_mismatches) : 0;
		sync_action = decipher_sync_action(&rs->md, recovery);
		sync_action = sync_str(decipher_sync_action(&rs->md, recovery));

		/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
		for (i = 0; i < rs->raid_disks; i++)
@@ -3892,14 +3888,13 @@ static int rs_start_reshape(struct raid_set *rs)
	struct mddev *mddev = &rs->md;
	struct md_personality *pers = mddev->pers;

	/* Don't allow the sync thread to work until the table gets reloaded. */
	set_bit(MD_RECOVERY_WAIT, &mddev->recovery);

	r = rs_setup_reshape(rs);
	if (r)
		return r;

	/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
		mddev_resume(mddev);

	/*
	 * Check any reshape constraints enforced by the personality
	 *
@@ -3923,10 +3918,6 @@ static int rs_start_reshape(struct raid_set *rs)
		}
	}

	/* Suspend because a resume will happen in raid_resume() */
	set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
	mddev_suspend(mddev);

	/*
	 * Now reshape got set up, update superblocks to
	 * reflect the fact so that a table reload will
@@ -3947,29 +3938,6 @@ static int raid_preresume(struct dm_target *ti)
	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
		return 0;

	if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
		struct raid_set *rs_active = rs_find_active(rs);

		if (rs_active) {
			/*
			 * In case no rebuilds have been requested
			 * and an active table slot exists, copy
			 * current resynchonization completed and
			 * reshape position pointers across from
			 * suspended raid set in the active slot.
			 *
			 * This resumes the new mapping at current
			 * offsets to continue recover/reshape without
			 * necessarily redoing a raid set partially or
			 * causing data corruption in case of a reshape.
			 */
			if (rs_active->md.curr_resync_completed != MaxSector)
				mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
			if (rs_active->md.reshape_position != MaxSector)
				mddev->reshape_position = rs_active->md.reshape_position;
		}
	}

	/*
	 * The superblocks need to be updated on disk if the
	 * array is new or new devices got added (thus zeroed
@@ -4046,7 +4014,7 @@ static void raid_resume(struct dm_target *ti)

static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 13, 2},
	.version = {1, 14, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
+35 −1
Original line number Diff line number Diff line
@@ -188,6 +188,12 @@ struct dm_pool_metadata {
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
@@ -816,6 +822,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
	return dm_tm_commit(pmd->tm, sblock);
}

/*
 * Recompute pmd->metadata_reserve from the current size of the metadata
 * space map: one tenth of the total number of metadata blocks, capped at
 * max_blocks.  If the size cannot be read, an error is logged and the cap
 * itself is used as the reserve.
 */
static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096; /* 16M */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else {
		/*
		 * @total is a dm_block_t, not a sector_t.  sector_div() is
		 * only meant for sector_t operands and does not do a proper
		 * 64-bit division where sector_t is narrower, so use
		 * div_u64() instead.
		 * NOTE(review): div_u64() is declared in <linux/math64.h>;
		 * confirm it is reachable from this file's includes.
		 */
		total = div_u64(total, 10);
		pmd->metadata_reserve = min(max_blocks, total);
	}
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
@@ -849,6 +871,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

@@ -1820,6 +1844,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
@@ -1932,8 +1963,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
	int r = -EINVAL;

	down_write(&pmd->root_lock);
	if (!pmd->fail_io)
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	up_write(&pmd->root_lock);

	return r;
Loading