Commit f2c46896 authored by Lance Roy, committed by Paul E. McKenney

srcu: Implement more-efficient reader counts



SRCU uses two per-CPU counters: a nesting counter to count the number of
active critical sections, and a sequence counter that lets
srcu_readers_active_idx_check() detect whether the nesting counters
changed while it was summing them.

This patch instead uses per-CPU lock and unlock counters. Because both
counters only increase, and because srcu_readers_active_idx_check() reads
the unlock counter before the lock counter, this achieves the same end
without having to increment two different counters in srcu_read_lock().
It also saves an smp_mb() in srcu_readers_active_idx_check().

Possible bug: There is no guarantee that the lock counter won't overflow
during srcu_readers_active_idx_check(), as there are no memory barriers
around srcu_flip() (see comment in srcu_readers_active_idx_check() for
details). However, this problem was already present before this patch.
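
To make the scheme concrete, here is a minimal user-space model of the
new check (a sketch only: the toy_* names are hypothetical, and C11
atomics stand in for the kernel's per-CPU counters and smp_mb()):

/* Toy model of the lock/unlock counter scheme -- not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_ulong lock_count[2];	/* readers increment on entry */
static atomic_ulong unlock_count[2];	/* readers increment on exit */

static int toy_read_lock(int idx)
{
	/* One increment on entry, standing in for __srcu_read_lock(). */
	atomic_fetch_add(&lock_count[idx], 1);		/* fence B implied */
	return idx;
}

static void toy_read_unlock(int idx)
{
	/* Increment, never decrement, standing in for __srcu_read_unlock(). */
	atomic_fetch_add(&unlock_count[idx], 1);	/* fence C implied */
}

/*
 * Grace-period side: sample the unlocks *before* the locks.  If the two
 * sums are equal, every counted lock has a matching counted unlock, so
 * there was a moment with no pre-existing reader on this index.
 */
static bool toy_readers_gone(int idx)
{
	unsigned long unlocks = atomic_load(&unlock_count[idx]);

	atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb() A */
	return atomic_load(&lock_count[idx]) == unlocks;
}

Reading the unlock sum first is what makes equality meaningful: a reader
missed by the lock sum can only have entered after the unlock sum was
taken, so it cannot be a pre-existing reader.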

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Lance Roy <ldr709@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
parent 52d7e48b
include/linux/srcu.h  +5 −5
@@ -33,9 +33,9 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-struct srcu_struct_array {
-	unsigned long c[2];
-	unsigned long seq[2];
+struct srcu_array {
+	unsigned long lock_count[2];
+	unsigned long unlock_count[2];
 };
 
 struct rcu_batch {
@@ -46,7 +46,7 @@ struct rcu_batch {
 
 struct srcu_struct {
 	unsigned long completed;
-	struct srcu_struct_array __percpu *per_cpu_ref;
+	struct srcu_array __percpu *per_cpu_ref;
 	spinlock_t queue_lock; /* protect ->batch_queue, ->running */
 	bool running;
 	/* callbacks just queued */
@@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work);
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
 #define __DEFINE_SRCU(name, is_static)					\
-	static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+	static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
 	is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
kernel/rcu/rcutorture.c  +17 −2
@@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
+		unsigned long l0, l1;
+		unsigned long u0, u1;
 		long c0, c1;
+		struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
 
-		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+		u0 = counts->unlock_count[!idx];
+		u1 = counts->unlock_count[idx];
+
+		/*
+		 * Make sure that a lock is always counted if the corresponding
+		 * unlock is counted.
+		 */
+		smp_rmb();
+
+		l0 = counts->lock_count[!idx];
+		l1 = counts->lock_count[idx];
+
+		c0 = l0 - u0;
+		c1 = l1 - u1;
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
kernel/rcu/srcu.c  +44 −78
@@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
 	rcu_batch_init(&sp->batch_check1);
 	rcu_batch_init(&sp->batch_done);
 	INIT_DELAYED_WORK(&sp->work, process_srcu);
-	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+	sp->per_cpu_ref = alloc_percpu(struct srcu_array);
 	return sp->per_cpu_ref ? 0 : -ENOMEM;
 }

@@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
  * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
-	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
-		sum += t;
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->lock_count[idx]);
 	}
 	return sum;
 }
 
 /*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
-	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
-		sum += t;
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->unlock_count[idx]);
 	}
 	return sum;
 }
 
 /*
  * Return true if the number of pre-existing readers is determined to
- * be stably zero.  An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement.  This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	unsigned long seq;
+	unsigned long unlocks;
 
-	seq = srcu_readers_seq_idx(sp, idx);
+	unlocks = srcu_readers_unlock_idx(sp, idx);
 
 	/*
-	 * The following smp_mb() A pairs with the smp_mb() B located in
-	 * __srcu_read_lock().  This pairing ensures that if an
-	 * __srcu_read_lock() increments its counter after the summation
-	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
-	 * critical section will see any changes made prior to the start
-	 * of the current SRCU grace period.
+	 * Make sure that a lock is always counted if the corresponding unlock
+	 * is counted. Needs to be a smp_mb() as the read side may contain a
+	 * read from a variable that is written to before the synchronize_srcu()
+	 * in the write side. In this case smp_mb()s A and B act like the store
+	 * buffering pattern.
 	 *
-	 * Also, if the above call to srcu_readers_seq_idx() saw the
-	 * increment of ->seq[], then the call to srcu_readers_active_idx()
-	 * must see the increment of ->c[].
+	 * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
+	 * synchronize_srcu() from being executed before the grace period ends.
 	 */
 	smp_mb(); /* A */
 
 	/*
-	 * Note that srcu_readers_active_idx() can incorrectly return
-	 * zero even though there is a pre-existing reader throughout.
-	 * To see this, suppose that task A is in a very long SRCU
-	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the sum of the counters
-	 * is equal to one.  Then suppose that task B starts executing
-	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
-	 * task C starts reading on CPU 0, so that its increment is not
-	 * summed, but finishes reading on CPU 2, so that its decrement
-	 * -is- summed.  Then when task B completes its sum, it will
-	 * incorrectly get zero, despite the fact that task A has been
-	 * in its SRCU read-side critical section the whole time.
+	 * If the locks are the same as the unlocks, then there must have
+	 * been no readers on this index at some time in between. This does not
+	 * mean that there are no more readers, as one could have read the
+	 * current index but not have incremented the lock counter yet.
 	 *
-	 * We therefore do a validation step should srcu_readers_active_idx()
-	 * return zero.
+	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+	 * increments of ->lock_count[] since the unlocks were counted, meaning
+	 * that this could return true even if there are still active readers.
+	 * Since there are no memory barriers around srcu_flip(), the CPU is not
+	 * required to increment ->completed before running
+	 * srcu_readers_unlock_idx(), which means that there could be an
+	 * arbitrarily large number of critical sections that execute after
+	 * srcu_readers_unlock_idx() but use the old value of ->completed.
 	 */
-	if (srcu_readers_active_idx(sp, idx) != 0)
-		return false;
-
-	/*
-	 * The remainder of this function is the validation step.
-	 * The following smp_mb() D pairs with the smp_mb() C in
-	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
-	 * by srcu_readers_active_idx() above, then any destructive
-	 * operation performed after the grace period will happen after
-	 * the corresponding SRCU read-side critical section.
-	 *
-	 * Note that there can be at most NR_CPUS worth of readers using
-	 * the old index, which is not enough to overflow even a 32-bit
-	 * integer.  (Yes, this does mean that systems having more than
-	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
-	 * the sum of the ->seq[] counters cannot possibly overflow.
-	 * Therefore, the only way that the return values of the two
-	 * calls to srcu_readers_seq_idx() can be equal is if there were
-	 * no increments of the corresponding rank of ->seq[] counts
-	 * in the interim.  But the missed-increment scenario laid out
-	 * above includes an increment of the ->seq[] counter by
-	 * the corresponding __srcu_read_lock().  Therefore, if this
-	 * scenario occurs, the return values from the two calls to
-	 * srcu_readers_seq_idx() will differ, and thus the validation
-	 * step below suffices.
-	 */
-	smp_mb(); /* D */
-
-	return srcu_readers_seq_idx(sp, idx) == seq;
+	return srcu_readers_lock_idx(sp, idx) == unlocks;
 }

/**
@@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->lock_count[0]);
+		sum += READ_ONCE(cpuc->lock_count[1]);
+		sum -= READ_ONCE(cpuc->unlock_count[0]);
+		sum -= READ_ONCE(cpuc->unlock_count[1]);
 	}
 	return sum;
 }
@@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	int idx;
 
 	idx = READ_ONCE(sp->completed) & 0x1;
-	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
+	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
 	return idx;
 }
 EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_dec(sp->per_cpu_ref->c[idx]);
+	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);

@@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
 
 /*
  * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays.  This allows
+ * use the other rank of the ->(un)lock_count[] arrays.  This allows
  * us to wait for pre-existing readers in a starvation-free manner.
  */
 static void srcu_flip(struct srcu_struct *sp)
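
For context, below is a sketch of the usual SRCU reader/updater pairing
that drives these counters. It is not part of this patch: struct foo,
shared_foo, and the reader()/updater() names are illustrative
placeholders; only the srcu_* calls are the real API.

/* Sketch: how the new counters get exercised (hypothetical example). */
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);		/* per-CPU struct srcu_array storage */

struct foo { int val; };		/* placeholder payload */
static struct foo __rcu *shared_foo;

static void reader(void)
{
	struct foo *p;
	int idx;

	idx = srcu_read_lock(&my_srcu);		/* ->lock_count[idx]++ */
	p = srcu_dereference(shared_foo, &my_srcu);
	if (p)
		pr_info("val=%d\n", p->val);
	srcu_read_unlock(&my_srcu, idx);	/* ->unlock_count[idx]++ */
}

static void updater(struct foo *newp)
{
	struct foo *old;

	old = rcu_dereference_protected(shared_foo, 1);
	rcu_assign_pointer(shared_foo, newp);
	/*
	 * Flips ->completed, then waits until the old index satisfies
	 * lock_count[idx] == unlock_count[idx].
	 */
	synchronize_srcu(&my_srcu);
	kfree(old);
}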