Commit f6b6ec5c authored by Shaohua Li's avatar Shaohua Li Committed by NeilBrown
Browse files

raid5-cache: add journal hot add/remove support



Add support for journal disk hot add/remove. Mostly trival checks in md
part. The raid5 part is a little tricky. For hot-remove, we can't wait
pending write as it's called from raid5d. The wait will cause deadlock.
We simplily fail the hot-remove. A hot-remove retry can success
eventually since if journal disk is faulty all pending write will be
failed and finish. For hot-add, since an array supporting journal but
without journal disk will be marked read-only, we are safe to hot add
journal without stopping IO (should be read IO, while journal only
handles write IO).

Signed-off-by: default avatarShaohua Li <shli@fb.com>
Signed-off-by: default avatarNeilBrown <neilb@suse.com>
parent 9ebc6ef1
Loading
Loading
Loading
Loading
+30 −12
Original line number Diff line number Diff line
@@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
		return -EEXIST;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (rdev->sectors && (mddev->dev_sectors == 0 ||
			rdev->sectors < mddev->dev_sectors)) {
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
@@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
		}
	}
	rcu_read_unlock();
	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
		       mdname(mddev), mddev->max_disks);
		return -EBUSY;
@@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
		else
			clear_bit(WriteMostly, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL))
		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
			if (has_journal) {
				export_rdev(rdev);
				return -EBUSY;
			}
			set_bit(Journal, &rdev->flags);
		}
		/*
		 * check whether the device shows up in other nodes
		 */
@@ -8181,18 +8198,19 @@ static int remove_and_add_spares(struct mddev *mddev,
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (test_bit(Journal, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+12 −4
Original line number Diff line number Diff line
@@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state)

bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;
	/* don't allow write if journal disk is missing */
	if (!conf->log)
		return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return test_bit(Faulty, &conf->log->rdev->flags);
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

struct r5l_recovery_ctx {
@@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	rcu_assign_pointer(conf->log, log);
	return 0;
error:
	md_unregister_thread(&log->reclaim_thread);
+26 −8
Original line number Diff line number Diff line
@@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
	struct disk_info *p = conf->disks + number;

	print_raid5_conf(conf);
	if (test_bit(Journal, &rdev->flags)) {
	if (test_bit(Journal, &rdev->flags) && conf->log) {
		struct r5l_log *log;
		/*
		 * journal disk is not removable, but we need give a chance to
		 * update superblock of other disks. Otherwise journal disk
		 * will be considered as 'fresh'
		 * we can't wait pending write here, as this is called in
		 * raid5d, wait will deadlock.
		 */
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		return -EINVAL;
		if (atomic_read(&mddev->writes_pending))
			return -EBUSY;
		log = conf->log;
		conf->log = NULL;
		synchronize_rcu();
		r5l_exit_log(log);
		return 0;
	}
	if (rdev == p->rdev)
		rdevp = &p->rdev;
@@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
	int first = 0;
	int last = conf->raid_disks - 1;

	if (test_bit(Journal, &rdev->flags))
		return -EINVAL;
	if (test_bit(Journal, &rdev->flags)) {
		char b[BDEVNAME_SIZE];
		if (conf->log)
			return -EBUSY;

		rdev->raid_disk = 0;
		/*
		 * The array is in readonly mode if journal is missing, so no
		 * write requests running. We should be safe
		 */
		r5l_init_log(conf, rdev);
		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
		       mdname(mddev), bdevname(rdev->bdev, b));
		return 0;
	}
	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;