Commit c492e2d4 authored by Linus Torvalds

Merge tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md

Pull more md bugfixes from Neil Brown:
 "Assorted fixes for new RAID5 stripe-batching functionality.

  Unfortunately this functionality was merged a little prematurely.  The
  necessary testing and code review is now complete (or as complete as
  it can be) and the code passes a variety of tests and looks quite
  sensible.

  Also a fix for some recent locking changes - a race was introduced
  which causes a reshape request to sometimes fail.  No data safety
  issues"

* tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md:
  md: fix race when unfreezing sync_action
  md/raid5: break stripe-batches when the array has failed.
  md/raid5: call break_stripe_batch_list from handle_stripe_clean_event
  md/raid5: be more selective about distributing flags across batch.
  md/raid5: add handle_flags arg to break_stripe_batch_list.
  md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list
  md/raid5: remove condition test from check_break_stripe_batch_list.
  md/raid5: Ensure a batch member is not handled prematurely.
  md/raid5: close race between STRIPE_BIT_DELAY and batching.
  md/raid5: ensure whole batch is delayed for all required bitmap updates.
parents ace6a22a 56ccc112
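
A rough illustration of the reshape race mentioned above, pieced together from the md.c hunk below (simplified, not verbatim kernel code; the function name is made up for the sketch): action_store() used to clear MD_RECOVERY_FROZEN for every command before dispatching it, so between that clear and taking mddev_lock() for a "reshape" request the md thread could start an ordinary resync, and the subsequent start_reshape() would then fail. The fix clears the flag per command, and for "reshape" only once the lock is held:

	/* Sketch only, using the same helpers that appear in the hunk below. */
	static int reshape_store_fixed(struct mddev *mddev)
	{
		int err;

		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
		err = mddev_lock(mddev);
		if (!err) {
			/* FROZEN is cleared only under the lock, so the md thread
			 * cannot start a resync before start_reshape() runs. */
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			err = mddev->pers->start_reshape(mddev);
			mddev_unlock(mddev);
		}
		return err;
	}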
drivers/md/md.c  +8 −6
@@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;


	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
		flush_workqueue(md_misc_wq);
		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return -EBUSY;
	else if (cmd_match(page, "resync"))
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	else if (cmd_match(page, "recover")) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
		err = mddev_lock(mddev);
		if (!err) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			err = mddev->pers->start_reshape(mddev);
			mddev_unlock(mddev);
		}
@@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		else if (!cmd_match(page, "repair"))
			return -EINVAL;
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
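
The raid5.c changes that follow all guard the stripe-batching code against inconsistent per-stripe state. The key interlock, described in the add_stripe_bio() comment further down, is STRIPE_BITMAP_PENDING: a stripe may not join a batch until bitmap_startwrite() has completed and bm_seq is recorded, and once it does join, the batch head must carry the largest bm_seq of any member so the whole batch waits for every required bitmap update. A condensed view of that bm_seq propagation, taken from the stripe_add_to_batch_list() hunk below with explanatory comments added (not a drop-in replacement):

	/* When a stripe still waiting on a bitmap flush (STRIPE_BIT_DELAY) is
	 * added to a batch, push its bm_seq up to the batch head so the whole
	 * batch is delayed until the latest required bitmap update completes. */
	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;

		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;	/* keep the larger sequence */
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}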
drivers/md/raid5.c  +86 −60
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static bool stripe_can_batch(struct stripe_head *sh)
{
	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
		is_full_stripe_write(sh);
}

@@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);
unlock_out:
	unlock_two_stripes(head, sh);
@@ -2987,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
		(unsigned long long)(*bip)->bi_iter.bi_sector,
		(unsigned long long)sh->sector, dd_idx);
	spin_unlock_irq(&sh->stripe_lock);

	if (conf->mddev->bitmap && firstwrite) {
		/* Cannot hold spinlock over bitmap_startwrite,
		 * but must ensure this isn't added to a batch until
		 * we have added to the bitmap and set bm_seq.
		 * So set STRIPE_BITMAP_PENDING to prevent
		 * batching.
		 * If multiple add_stripe_bio() calls race here they
		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
		 * to complete "bitmap_startwrite" gets to set
		 * STRIPE_BIT_DELAY.  This is important as once a stripe
		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
		 * any more.
		 */
		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
		spin_unlock_irq(&sh->stripe_lock);
		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
				  STRIPE_SECTORS, 0);
		spin_lock_irq(&sh->stripe_lock);
		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
		if (!sh->batch_head) {
			sh->bm_seq = conf->seq_flush+1;
			set_bit(STRIPE_BIT_DELAY, &sh->state);
		}
	}
	spin_unlock_irq(&sh->stripe_lock);

	if (stripe_can_batch(sh))
		stripe_add_to_batch_list(conf, sh);
@@ -3392,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
	set_bit(STRIPE_HANDLE, &sh->state);
}

static void break_stripe_batch_list(struct stripe_head *head_sh,
				    unsigned long handle_flags);
/* handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3405,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
	int discard_pending = 0;
	struct stripe_head *head_sh = sh;
	bool do_endio = false;
	int wakeup_nr = 0;

	for (i = disks; i--; )
		if (sh->dev[i].written) {
@@ -3494,44 +3523,8 @@ unhash:
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (!head_sh->batch_head || !do_endio)
		return;
	for (i = 0; i < head_sh->disks; i++) {
		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
			wakeup_nr++;
	}
	while (!list_empty(&head_sh->batch_list)) {
		int i;
		sh = list_first_entry(&head_sh->batch_list,
				      struct stripe_head, batch_list);
		list_del_init(&sh->batch_list);

		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
						 (1 << STRIPE_PREREAD_ACTIVE) |
						 STRIPE_EXPAND_SYNC_FLAG));
		sh->check_state = head_sh->check_state;
		sh->reconstruct_state = head_sh->reconstruct_state;
		for (i = 0; i < sh->disks; i++) {
			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				wakeup_nr++;
			sh->dev[i].flags = head_sh->dev[i].flags;
		}

		spin_lock_irq(&sh->stripe_lock);
		sh->batch_head = NULL;
		spin_unlock_irq(&sh->stripe_lock);
		if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
			set_bit(STRIPE_HANDLE, &sh->state);
		release_stripe(sh);
	}

	spin_lock_irq(&head_sh->stripe_lock);
	head_sh->batch_head = NULL;
	spin_unlock_irq(&head_sh->stripe_lock);
	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
		set_bit(STRIPE_HANDLE, &head_sh->state);
	if (head_sh->batch_head && do_endio)
		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}

static void handle_stripe_dirtying(struct r5conf *conf,
@@ -4172,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)

static int clear_batch_ready(struct stripe_head *sh)
{
	/* Return '1' if this is a member of batch, or
	 * '0' if it is a lone stripe or a head which can now be
	 * handled.
	 */
	struct stripe_head *tmp;
	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
		return 0;
		return (sh->batch_head && sh->batch_head != sh);
	spin_lock(&sh->stripe_lock);
	if (!sh->batch_head) {
		spin_unlock(&sh->stripe_lock);
@@ -4202,38 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh)
	return 0;
}

static void check_break_stripe_batch_list(struct stripe_head *sh)
static void break_stripe_batch_list(struct stripe_head *head_sh,
				    unsigned long handle_flags)
{
	struct stripe_head *head_sh, *next;
	struct stripe_head *sh, *next;
	int i;

	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
		return;

	head_sh = sh;
	int do_wakeup = 0;

	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {

		list_del_init(&sh->batch_list);

		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
					  (1 << STRIPE_SYNCING) |
					  (1 << STRIPE_REPLACED) |
					  (1 << STRIPE_PREREAD_ACTIVE) |
						 (1 << STRIPE_DEGRADED) |
						 STRIPE_EXPAND_SYNC_FLAG));
					  (1 << STRIPE_DELAYED) |
					  (1 << STRIPE_BIT_DELAY) |
					  (1 << STRIPE_FULL_WRITE) |
					  (1 << STRIPE_BIOFILL_RUN) |
					  (1 << STRIPE_COMPUTE_RUN)  |
					  (1 << STRIPE_OPS_REQ_PENDING) |
					  (1 << STRIPE_DISCARD) |
					  (1 << STRIPE_BATCH_READY) |
					  (1 << STRIPE_BATCH_ERR) |
					  (1 << STRIPE_BITMAP_PENDING)));
		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
					      (1 << STRIPE_REPLACED)));

		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
					    (1 << STRIPE_DEGRADED)),
			      head_sh->state & (1 << STRIPE_INSYNC));

		sh->check_state = head_sh->check_state;
		sh->reconstruct_state = head_sh->reconstruct_state;
		for (i = 0; i < sh->disks; i++)
		for (i = 0; i < sh->disks; i++) {
			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				do_wakeup = 1;
			sh->dev[i].flags = head_sh->dev[i].flags &
				(~((1 << R5_WriteError) | (1 << R5_Overlap)));

		}
		spin_lock_irq(&sh->stripe_lock);
		sh->batch_head = NULL;
		spin_unlock_irq(&sh->stripe_lock);

		if (handle_flags == 0 ||
		    sh->state & handle_flags)
			set_bit(STRIPE_HANDLE, &sh->state);
		release_stripe(sh);
	}
	spin_lock_irq(&head_sh->stripe_lock);
	head_sh->batch_head = NULL;
	spin_unlock_irq(&head_sh->stripe_lock);
	for (i = 0; i < head_sh->disks; i++)
		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
			do_wakeup = 1;
	if (head_sh->state & handle_flags)
		set_bit(STRIPE_HANDLE, &head_sh->state);

	if (do_wakeup)
		wake_up(&head_sh->raid_conf->wait_for_overlap);
}

static void handle_stripe(struct stripe_head *sh)
@@ -4258,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh)
		return;
	}

	check_break_stripe_batch_list(sh);
	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
		break_stripe_batch_list(sh, 0);

	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
		spin_lock(&sh->stripe_lock);
@@ -4312,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
	if (s.failed > conf->max_degraded) {
		sh->check_state = 0;
		sh->reconstruct_state = 0;
		break_stripe_batch_list(sh, 0);
		if (s.to_read+s.to_write+s.written)
			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
		if (s.syncing + s.replacing)
drivers/md/raid5.h  +4 −1
@@ -337,9 +337,12 @@ enum {
	STRIPE_ON_RELEASE_LIST,
	STRIPE_BATCH_READY,
	STRIPE_BATCH_ERR,
	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
				 * to batch yet.
				 */
};

#define STRIPE_EXPAND_SYNC_FLAG \
#define STRIPE_EXPAND_SYNC_FLAGS \
	((1 << STRIPE_EXPAND_SOURCE) |\
	(1 << STRIPE_EXPAND_READY) |\
	(1 << STRIPE_EXPANDING) |\