Commit 8d54094e authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge branch 'md-next' of https://github.com/liu-song-6/linux into for-5.3/block

Pull MD changes from Song.

* 'md-next' of https://github.com/liu-song-6/linux:
  md: add bitmap_abort label in md_run
  md-bitmap: create and destroy wb_info_pool with the change of bitmap
  md-bitmap: create and destroy wb_info_pool with the change of backlog
  md: introduce mddev_create/destroy_wb_pool for the change of member device
  md/raid1: fix potential data inconsistency issue with write behind device
parents 0ce35379 d494549a
Loading
Loading
Loading
Loading
+20 −0
Original line number Diff line number Diff line
@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
		return;

	md_bitmap_wait_behind_writes(mddev);
	mempool_destroy(mddev->wb_info_pool);
	mddev->wb_info_pool = NULL;

	mutex_lock(&mddev->bitmap_info.mutex);
	spin_lock(&mddev->lock);
@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
	sector_t start = 0;
	sector_t sector = 0;
	struct bitmap *bitmap = mddev->bitmap;
	struct md_rdev *rdev;

	if (!bitmap)
		goto out;

	rdev_for_each(rdev, mddev)
		mddev_create_wb_pool(mddev, rdev, true);

	if (mddev_is_clustered(mddev))
		md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);

@@ -2462,12 +2468,26 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
	/*
	 * Parse a new max_write_behind value from sysfs and keep
	 * wb_info_pool's existence in sync with it.
	 */
	unsigned long prev = mddev->bitmap_info.max_write_behind;
	unsigned long backlog;
	int ret;

	ret = kstrtoul(buf, 10, &backlog);
	if (ret)
		return ret;
	if (backlog > COUNTER_MAX)
		return -EINVAL;

	mddev->bitmap_info.max_write_behind = backlog;
	if (backlog == 0 && mddev->wb_info_pool != NULL) {
		/* A zero backlog makes wb_info_pool unnecessary. */
		mempool_destroy(mddev->wb_info_pool);
		mddev->wb_info_pool = NULL;
	} else if (backlog != 0 && mddev->wb_info_pool == NULL) {
		/* A non-zero backlog needs the pool to exist. */
		struct md_rdev *rdev;

		rdev_for_each(rdev, mddev)
			mddev_create_wb_pool(mddev, rdev, false);
	}

	if (prev != backlog)
		md_bitmap_update_sb(mddev->bitmap);

	return len;
}

+108 −8
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev)
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

/*
 * Arm write-behind collision tracking on @rdev.
 * Returns 1 when tracking was set up, 0 when the device is single-queue
 * (only multi-queue devices get the WBCollisionCheck treatment).
 */
static int rdev_init_wb(struct md_rdev *rdev)
{
	if (rdev->bdev->bd_queue->nr_hw_queues == 1)
		return 0;

	INIT_LIST_HEAD(&rdev->wb_list);
	spin_lock_init(&rdev->wb_list_lock);
	init_waitqueue_head(&rdev->wb_io_wait);
	set_bit(WBCollisionCheck, &rdev->flags);

	return 1;
}

/*
 * Create wb_info_pool if @rdev is the first multi-queue device flagged
 * writemostly while write-behind mode is enabled.
 */
void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
			  bool is_suspend)
{
	unsigned int noio_flag;

	/* Write-behind disabled: nothing to track. */
	if (!mddev->bitmap_info.max_write_behind)
		return;

	/* Only write-mostly rdevs on multi-queue devices need the pool. */
	if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
		return;

	if (mddev->wb_info_pool)
		return;

	/* Caller may already hold the array suspended (is_suspend). */
	if (!is_suspend)
		mddev_suspend(mddev);
	noio_flag = memalloc_noio_save();
	mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
						sizeof(struct wb_info));
	memalloc_noio_restore(noio_flag);
	if (!mddev->wb_info_pool)
		pr_err("can't alloc memory pool for writemostly\n");
	if (!is_suspend)
		mddev_resume(mddev);
}
EXPORT_SYMBOL_GPL(mddev_create_wb_pool);

/*
 * Destroy wb_info_pool if @rdev is the last device flagged with
 * WBCollisionCheck.
 */
static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
		return;

	if (mddev->wb_info_pool) {
		struct md_rdev *temp;
		int num = 0;

		/* Check whether any other rdev still needs wb_info_pool. */
		rdev_for_each(temp, mddev)
			if (temp != rdev &&
			    test_bit(WBCollisionCheck, &temp->flags))
				num++;
		if (!num) {
			/*
			 * Fix: use the @mddev parameter consistently instead
			 * of mixing it with rdev->mddev (both callers pass
			 * rdev->mddev, so the object is the same). Suspend
			 * around teardown, mirroring mddev_create_wb_pool —
			 * presumably to quiesce in-flight IO.
			 */
			mddev_suspend(mddev);
			mempool_destroy(mddev->wb_info_pool);
			mddev->wb_info_pool = NULL;
			mddev_resume(mddev);
		}
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
	rdev->mddev = mddev;
	pr_debug("md: bind<%s>\n", b);

	if (mddev->raid_disks)
		mddev_create_wb_pool(mddev, rdev, false);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	mddev_destroy_wb_pool(rdev->mddev, rdev);
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		mddev_create_wb_pool(rdev->mddev, rdev, false);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		mddev_destroy_wb_pool(rdev->mddev, rdev);
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
@@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev)
			mddev->bitmap = bitmap;

	}
	if (err) {
		mddev_detach(mddev);
		if (mddev->private)
			pers->free(mddev, mddev->private);
		mddev->private = NULL;
		module_put(pers->owner);
		md_bitmap_destroy(mddev);
		goto abort;
	if (err)
		goto bitmap_abort;

	if (mddev->bitmap_info.max_write_behind > 0) {
		bool creat_pool = false;

		rdev_for_each(rdev, mddev) {
			if (test_bit(WriteMostly, &rdev->flags) &&
			    rdev_init_wb(rdev))
				creat_pool = true;
		}
		if (creat_pool && mddev->wb_info_pool == NULL) {
			mddev->wb_info_pool =
				mempool_create_kmalloc_pool(NR_WB_INFOS,
						    sizeof(struct wb_info));
			if (!mddev->wb_info_pool) {
				err = -ENOMEM;
				goto bitmap_abort;
			}
		}
	}

	if (mddev->queue) {
		bool nonrot = true;

@@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev)
	sysfs_notify(&mddev->kobj, NULL, "degraded");
	return 0;

bitmap_abort:
	mddev_detach(mddev);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
	md_bitmap_destroy(mddev);
abort:
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
@@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev)
			mddev->in_sync = 1;
		md_update_sb(mddev, 1);
	}
	mempool_destroy(mddev->wb_info_pool);
	mddev->wb_info_pool = NULL;
}

void md_stop_writes(struct mddev *mddev)
+23 −0
Original line number Diff line number Diff line
@@ -109,6 +109,14 @@ struct md_rdev {
					   * for reporting to userspace and storing
					   * in superblock.
					   */

	/*
	 * The members for check collision of write behind IOs.
	 */
	struct list_head wb_list;
	spinlock_t wb_list_lock;
	wait_queue_head_t wb_io_wait;

	struct work_struct del_work;	/* used for delayed sysfs removal */

	struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -193,6 +201,10 @@ enum flag_bits {
				 * it didn't fail, so don't use FailFast
				 * any more for metadata
				 */
	WBCollisionCheck,	/*
				 * multiqueue device should check if there
				 * is collision between write behind bios.
				 */
};

static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -245,6 +257,14 @@ enum mddev_sb_flags {
	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
};

#define NR_WB_INFOS	8	/* min nr of preallocated entries in wb_info_pool */
/* record current range of write behind IOs */
struct wb_info {
	sector_t lo;		/* start sector of an in-flight write-behind IO */
	sector_t hi;		/* end of the range: start sector + sector count */
	struct list_head list;	/* linked on md_rdev->wb_list */
};

struct mddev {
	void				*private;
	struct md_personality		*pers;
@@ -461,6 +481,7 @@ struct mddev {
					  */
	struct work_struct flush_work;
	struct work_struct event_work;	/* used by dm to report failure event */
	mempool_t *wb_info_pool;
	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
	struct md_cluster_info		*cluster_info;
	unsigned int			good_device_nr;	/* good device num within cluster raid */
@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
				 bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

+67 −1
Original line number Diff line number Diff line
@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);

#include "raid1-10.c"

/*
 * Try to record the range [lo, hi) of a write-behind IO on @rdev.
 * Returns 0 on success, or -EBUSY if the range overlaps an IO that is
 * already in flight (a collision); the caller is expected to retry.
 */
static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct mddev *mddev = rdev->mddev;
	struct wb_info *wi_new, *iter;
	unsigned long flags;
	int err = 0;

	/* Allocate before taking the spinlock; GFP_NOIO as we are on the IO path. */
	wi_new = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);

	spin_lock_irqsave(&rdev->wb_list_lock, flags);
	list_for_each_entry(iter, &rdev->wb_list, list) {
		if (hi > iter->lo && lo < iter->hi) {
			/* Overlapping ranges: collision happened. */
			err = -EBUSY;
			break;
		}
	}

	if (err) {
		mempool_free(wi_new, mddev->wb_info_pool);
	} else {
		wi_new->lo = lo;
		wi_new->hi = hi;
		list_add(&wi_new->list, &rdev->wb_list);
	}
	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);

	return err;
}

/*
 * Drop the record of a completed write-behind IO covering [lo, hi) from
 * @rdev's list and wake any writer waiting in check_and_add_wb().
 */
static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct wb_info *wi;
	unsigned long flags;
	int found = 0;
	struct mddev *mddev = rdev->mddev;

	spin_lock_irqsave(&rdev->wb_list_lock, flags);
	list_for_each_entry(wi, &rdev->wb_list, list)
		if (hi == wi->hi && lo == wi->lo) {
			list_del(&wi->list);
			mempool_free(wi, mddev->wb_info_pool);
			found = 1;
			break;
		}

	/*
	 * Fix: WARN_ON("string") treats the string pointer as the condition,
	 * so it always fires yet never prints the message. WARN(1, ...) both
	 * warns and prints the diagnostic.
	 */
	if (!found)
		WARN(1, "The write behind IO is not recorded\n");
	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
	wake_up(&rdev->wb_io_wait);
}

/*
 * for resync bio, r1bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
	}

	if (behind) {
		if (test_bit(WBCollisionCheck, &rdev->flags)) {
			sector_t lo = r1_bio->sector;
			sector_t hi = r1_bio->sector + r1_bio->sectors;

			remove_wb(rdev, lo, hi);
		}
		if (test_bit(WriteMostly, &rdev->flags))
			atomic_dec(&r1_bio->behind_remaining);

@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);

		if (r1_bio->behind_master_bio) {
			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
			struct md_rdev *rdev = conf->mirrors[i].rdev;

			if (test_bit(WBCollisionCheck, &rdev->flags)) {
				sector_t lo = r1_bio->sector;
				sector_t hi = r1_bio->sector + r1_bio->sectors;

				wait_event(rdev->wb_io_wait,
					   check_and_add_wb(rdev, lo, hi) == 0);
			}
			if (test_bit(WriteMostly, &rdev->flags))
				atomic_inc(&r1_bio->behind_remaining);
		}