Commit c4c1663b authored by NeilBrown's avatar NeilBrown
Browse files

md/raid5: replace sh->lock with an 'active' flag.



sh->lock is now mainly used to ensure that two threads aren't running
in the locked part of handle_stripe[56] at the same time.

That can more neatly be achieved with an 'active' flag which we set
while running handle_stripe.  If we find the flag is set, we simply
requeue the stripe for later by setting STRIPE_HANDLE.

For safety we take ->device_lock while examining the state of the
stripe and creating a summary in 'stripe_head_state / r6_state'.
This possibly isn't needed but as shared fields like ->toread,
->towrite are checked it is safer for now at least.

We leave the label after the old 'unlock' called "unlock" because it
will disappear in a few patches, so renaming seems pointless.

This leaves the stripe 'locked' for longer as we clear STRIPE_ACTIVE
later, but that is not a problem.

Signed-off-by: default avatarNeilBrown <neilb@suse.de>
Reviewed-by: default avatarNamhyung Kim <namhyung@gmail.com>
parent cbe47ec5
Loading
Loading
Loading
Loading
+13 −13
Original line number Diff line number Diff line
@@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock(&sh->lock);
			spin_lock_irq(&sh->raid_conf->device_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->raid_conf->device_lock);
			spin_unlock(&sh->lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
@@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf)
		return 0;

	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);
	#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
	#endif
@@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
			break;

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->lock);
		#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
		#endif
@@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
		(unsigned long long)sh->sector);


	spin_lock(&sh->lock);
	spin_lock_irq(&conf->device_lock);
	if (forwrite) {
		bip = &sh->dev[dd_idx].towrite;
@@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
	}
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);

	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
		(unsigned long long)(*bip)->bi_sector,
@@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 overlap:
	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);
	return 0;
}

@@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh)
		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
		 sh->reconstruct_state);

	spin_lock(&sh->lock);
	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
		set_bit(STRIPE_SYNCING, &sh->state);
		clear_bit(STRIPE_INSYNC, &sh->state);
	}
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh)

	/* Now to look around and see what can be done */
	rcu_read_lock();
	spin_lock_irq(&conf->device_lock);
	for (i=disks; i--; ) {
		mdk_rdev_t *rdev;

@@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh)
			s.failed_num = i;
		}
	}
	spin_unlock_irq(&conf->device_lock);
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
@@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh)
		handle_stripe_expansion(conf, sh, NULL);

 unlock:
	spin_unlock(&sh->lock);

	/* wait for this device to become unblocked */
	if (unlikely(blocked_rdev))
@@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh)
	       sh->check_state, sh->reconstruct_state);
	memset(&s, 0, sizeof(s));

	spin_lock(&sh->lock);
	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
		set_bit(STRIPE_SYNCING, &sh->state);
		clear_bit(STRIPE_INSYNC, &sh->state);
	}
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh)
	/* Now to look around and see what can be done */

	rcu_read_lock();
	spin_lock_irq(&conf->device_lock);
	for (i=disks; i--; ) {
		mdk_rdev_t *rdev;
		dev = &sh->dev[i];
@@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh)
			s.failed++;
		}
	}
	spin_unlock_irq(&conf->device_lock);
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
@@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh)
		handle_stripe_expansion(conf, sh, &r6s);

 unlock:
	spin_unlock(&sh->lock);

	/* wait for this device to become unblocked */
	if (unlikely(blocked_rdev))
@@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh)

static void handle_stripe(struct stripe_head *sh)
{
	clear_bit(STRIPE_HANDLE, &sh->state);
	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled, ensure it gets handled
		 * again when current action finishes */
		set_bit(STRIPE_HANDLE, &sh->state);
		return;
	}

	if (sh->raid_conf->level == 6)
		handle_stripe6(sh);
	else
		handle_stripe5(sh);
	clear_bit(STRIPE_ACTIVE, &sh->state);
}

static void raid5_activate_delayed(raid5_conf_t *conf)
+16 −19
Original line number Diff line number Diff line
@@ -6,11 +6,11 @@

/*
 *
 * Each stripe contains one buffer per disc.  Each buffer can be in
 * Each stripe contains one buffer per device.  Each buffer can be in
 * one of a number of states stored in "flags".  Changes between
 * these states happen *almost* exclusively under a per-stripe
 * spinlock.  Some very specific changes can happen in bi_end_io, and
 * these are not protected by the spin lock.
 * these states happen *almost* exclusively under the protection of the
 * STRIPE_ACTIVE flag.  Some very specific changes can happen in bi_end_io, and
 * these are not protected by STRIPE_ACTIVE.
 *
 * The flag bits that are used to represent these states are:
 *   R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
 * block and the cached buffer are successfully written, any buffer on
 * a written list can be returned with b_end_io.
 *
 * The write list and read list both act as fifos.  The read list is
 * protected by the device_lock.  The write and written lists are
 * protected by the stripe lock.  The device_lock, which can be
 * claimed while the stipe lock is held, is only for list
 * manipulations and will only be held for a very short time.  It can
 * be claimed from interrupts.
 * The write list and read list both act as fifos.  The read list,
 * write list and written list are protected by the device_lock.
 * The device_lock is only for list manipulations and will only be
 * held for a very short time.  It can be claimed from interrupts.
 *
 *
 * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
 *
 * The inactive_list, handle_list and hash bucket lists are all protected by the
 * device_lock.
 *  - stripes on the inactive_list never have their stripe_lock held.
 *  - stripes have a reference counter. If count==0, they are on a list.
 *  - If a stripe might need handling, STRIPE_HANDLE is set.
 *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
 *  attach a request to an active stripe (add_stripe_bh())
 *     lockdev attach-buffer unlockdev
 *  handle a stripe (handle_stripe())
 *     lockstripe clrSTRIPE_HANDLE ...
 *     setSTRIPE_ACTIVE,  clrSTRIPE_HANDLE ...
 *		(lockdev check-buffers unlockdev) ..
 *		change-state ..
 *		record io/ops needed unlockstripe schedule io/ops
 *		record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
 *  release an active stripe (release_stripe())
 *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
 *
@@ -128,8 +125,7 @@
 * on a cached buffer, and plus one if the stripe is undergoing stripe
 * operations.
 *
 * Stripe operations are performed outside the stripe lock,
 * the stripe operations are:
 * The stripe operations are:
 * -copying data between the stripe cache and user application buffers
 * -computing blocks to save a disk access, or to recover a missing block
 * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
 */

/*
 * Operations state - intermediate states that are visible outside of sh->lock
 * Operations state - intermediate states that are visible outside of 
 *   STRIPE_ACTIVE.
 * In general _idle indicates nothing is running, _run indicates a data
 * processing operation is active, and _result means the data processing result
 * is stable and can be acted upon.  For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
	short			ddf_layout;/* use DDF ordering to calculate Q */
	unsigned long		state;		/* state flags */
	atomic_t		count;	      /* nr of active thread/requests */
	spinlock_t		lock;
	int			bm_seq;	/* sequence number for bitmap flushes */
	int			disks;		/* disks in stripe */
	enum check_states	check_state;
@@ -240,7 +236,7 @@ struct stripe_head {
};

/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
 *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
 *     for handle_stripe.
 */
struct stripe_head_state {
	int syncing, expanding, expanded;
@@ -290,6 +286,7 @@ struct r6_state {
 * Stripe state
 */
enum {
	STRIPE_ACTIVE,
	STRIPE_HANDLE,
	STRIPE_SYNC_REQUESTED,
	STRIPE_SYNCING,
@@ -339,7 +336,7 @@ enum {
 * PREREAD_ACTIVE.
 * In stripe_handle, if we find pre-reading is necessary, we do it if
 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
 * HANDLE gets cleared if stripe_handle leave nothing locked.
 * HANDLE gets cleared if stripe_handle leaves nothing locked.
 */