Commit 461daba0 authored by Suren Baghdasaryan, committed by Peter Zijlstra

psi: eliminate kthread_worker from psi trigger scheduling mechanism



Each psi group requires a dedicated kthread_delayed_work and
kthread_worker. Since no other work is ever queued on a psi_group's
kthread_worker, the same result can be obtained with a task_struct and
a timer directly. This makes psi trigger scheduling simpler by removing
the lists and locks involved in kthread_worker usage, and eliminates
the poll_scheduled atomic from the hot path.
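
For orientation, a minimal sketch of the resulting mechanism, condensed
from the diff below. The names poll_group, poll_worker, poll_schedule and
the field names are illustrative stand-ins for the patch's psi_group
members; RCU lifetime handling and the trigger bookkeeping are omitted:

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/wait.h>

struct poll_group {			/* stands in for struct psi_group */
	struct task_struct *task;	/* dedicated "psimon" kthread */
	struct timer_list timer;	/* replaces kthread_delayed_work */
	wait_queue_head_t wait;
	atomic_t wakeup;		/* one-shot token: timer -> task */
};

/* Timer callback: softirq context, so it only flags and wakes. */
static void poll_timer_fn(struct timer_list *t)
{
	struct poll_group *group = from_timer(group, t, timer);

	atomic_set(&group->wakeup, 1);
	wake_up_interruptible(&group->wait);
}

/* Worker thread: consume the token, do one polling pass, sleep again. */
static int poll_worker(void *data)
{
	struct poll_group *group = data;

	while (true) {
		wait_event_interruptible(group->wait,
				atomic_cmpxchg(&group->wakeup, 1, 0) ||
				kthread_should_stop());
		if (kthread_should_stop())
			break;
		/* ... polling pass (psi_poll_work in the patch) ... */
	}
	return 0;
}

/* Hot path: timer_pending() replaces the poll_scheduled atomic. */
static void poll_schedule(struct poll_group *group, unsigned long delay)
{
	if (timer_pending(&group->timer))
		return;			/* already scheduled */
	mod_timer(&group->timer, jiffies + delay);
}

Setup (kthread_create + wake_up_process + timer_setup) and teardown
appear in the psi_trigger_create and psi_trigger_destroy hunks below.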

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200528195442.190116-1-surenb@google.com
parent f4291df1
include/linux/psi_types.h  +4 −3
@@ -153,9 +153,10 @@ struct psi_group {
 	unsigned long avg[NR_PSI_STATES - 1][3];
 
 	/* Monitor work control */
-	atomic_t poll_scheduled;
-	struct kthread_worker __rcu *poll_kworker;
-	struct kthread_delayed_work poll_work;
+	struct task_struct __rcu *poll_task;
+	struct timer_list poll_timer;
+	wait_queue_head_t poll_wait;
+	atomic_t poll_wakeup;
 
 	/* Protects data used by the monitor */
 	struct mutex trigger_lock;
kernel/sched/psi.c  +64 −49
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
 	memset(group->polling_total, 0, sizeof(group->polling_total));
 	group->polling_next_update = ULLONG_MAX;
 	group->polling_until = 0;
-	rcu_assign_pointer(group->poll_kworker, NULL);
+	rcu_assign_pointer(group->poll_task, NULL);
 }
 
 void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
-	struct kthread_worker *kworker;
+	struct task_struct *task;
 
-	/* Do not reschedule if already scheduled */
-	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+	/*
+	 * Do not reschedule if already scheduled.
+	 * Possible race with a timer scheduled after this check but before
+	 * mod_timer below can be tolerated because group->polling_next_update
+	 * will keep updates on schedule.
+	 */
+	if (timer_pending(&group->poll_timer))
 		return;
 
 	rcu_read_lock();
 
-	kworker = rcu_dereference(group->poll_kworker);
+	task = rcu_dereference(group->poll_task);
 	/*
 	 * kworker might be NULL in case psi_trigger_destroy races with
 	 * psi_task_change (hotpath) which can't use locks
 	 */
-	if (likely(kworker))
-		kthread_queue_delayed_work(kworker, &group->poll_work, delay);
-	else
-		atomic_set(&group->poll_scheduled, 0);
+	if (likely(task))
+		mod_timer(&group->poll_timer, jiffies + delay);
 
 	rcu_read_unlock();
 }
 
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
 {
-	struct kthread_delayed_work *dwork;
-	struct psi_group *group;
 	u32 changed_states;
 	u64 now;
 
-	dwork = container_of(work, struct kthread_delayed_work, work);
-	group = container_of(dwork, struct psi_group, poll_work);
-
-	atomic_set(&group->poll_scheduled, 0);
-
 	mutex_lock(&group->trigger_lock);
 
 	now = sched_clock();
@@ -623,6 +613,35 @@ out:
 	mutex_unlock(&group->trigger_lock);
 }
 
+static int psi_poll_worker(void *data)
+{
+	struct psi_group *group = (struct psi_group *)data;
+	struct sched_param param = {
+		.sched_priority = 1,
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+
+	while (true) {
+		wait_event_interruptible(group->poll_wait,
+				atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+				kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		psi_poll_work(group);
+	}
+	return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+	struct psi_group *group = from_timer(group, t, poll_timer);
+
+	atomic_set(&group->poll_wakeup, 1);
+	wake_up_interruptible(&group->poll_wait);
+}
+
 static void record_times(struct psi_group_cpu *groupc, int cpu,
 			 bool memstall_tick)
 {
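
Why the indirection in the hunk above: poll_timer_fn() runs in timer
(softirq) context, where sleeping is forbidden, while psi_poll_work()
takes trigger_lock, a mutex. The callback therefore only posts a token
and wakes the kthread; the atomic_cmpxchg in the wait condition consumes
that token, so a timer firing while a polling pass is already running
causes at most one extra pass rather than a lost wakeup. For contrast, a
hypothetical direct call (not in the patch) would be a bug:

/* NOT viable: timer callbacks run in atomic (softirq) context,
 * and psi_poll_work() takes a mutex, which may sleep. */
static void poll_timer_fn_direct(struct timer_list *t)
{
	struct psi_group *group = from_timer(group, t, poll_timer);

	psi_poll_work(group);	/* mutex_lock() in atomic context: bug */
}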
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 
 	mutex_lock(&group->trigger_lock);
 
-	if (!rcu_access_pointer(group->poll_kworker)) {
-		struct sched_param param = {
-			.sched_priority = 1,
-		};
-		struct kthread_worker *kworker;
+	if (!rcu_access_pointer(group->poll_task)) {
+		struct task_struct *task;
 
-		kworker = kthread_create_worker(0, "psimon");
-		if (IS_ERR(kworker)) {
+		task = kthread_create(psi_poll_worker, group, "psimon");
+		if (IS_ERR(task)) {
 			kfree(t);
 			mutex_unlock(&group->trigger_lock);
-			return ERR_CAST(kworker);
+			return ERR_CAST(task);
 		}
-		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
-		kthread_init_delayed_work(&group->poll_work,
-				psi_poll_work);
-		rcu_assign_pointer(group->poll_kworker, kworker);
+		atomic_set(&group->poll_wakeup, 0);
+		init_waitqueue_head(&group->poll_wait);
+		wake_up_process(task);
+		timer_setup(&group->poll_timer, poll_timer_fn, 0);
+		rcu_assign_pointer(group->poll_task, task);
 	}
 
 	list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
 {
 	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
 	struct psi_group *group = t->group;
-	struct kthread_worker *kworker_to_destroy = NULL;
+	struct task_struct *task_to_destroy = NULL;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref)
 			period = min(period, div_u64(tmp->win.size,
 					UPDATES_PER_WINDOW));
 		group->poll_min_period = period;
-		/* Destroy poll_kworker when the last trigger is destroyed */
+		/* Destroy poll_task when the last trigger is destroyed */
 		if (group->poll_states == 0) {
 			group->polling_until = 0;
-			kworker_to_destroy = rcu_dereference_protected(
-					group->poll_kworker,
+			task_to_destroy = rcu_dereference_protected(
+					group->poll_task,
 					lockdep_is_held(&group->trigger_lock));
-			rcu_assign_pointer(group->poll_kworker, NULL);
+			rcu_assign_pointer(group->poll_task, NULL);
 		}
 	}
 
@@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref)
 
 	/*
 	 * Wait for both *trigger_ptr from psi_trigger_replace and
-	 * poll_kworker RCUs to complete their read-side critical sections
-	 * before destroying the trigger and optionally the poll_kworker
+	 * poll_task RCUs to complete their read-side critical sections
+	 * before destroying the trigger and optionally the poll_task
 	 */
 	synchronize_rcu();
 	/*
 	 * Destroy the kworker after releasing trigger_lock to prevent a
 	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
 	 */
-	if (kworker_to_destroy) {
+	if (task_to_destroy) {
 		/*
 		 * After the RCU grace period has expired, the worker
-		 * can no longer be found through group->poll_kworker.
+		 * can no longer be found through group->poll_task.
 		 * But it might have been already scheduled before
 		 * that - deschedule it cleanly before destroying it.
 		 */
-		kthread_cancel_delayed_work_sync(&group->poll_work);
-		atomic_set(&group->poll_scheduled, 0);
-
-		kthread_destroy_worker(kworker_to_destroy);
+		del_timer_sync(&group->poll_timer);
+		kthread_stop(task_to_destroy);
 	}
 	kfree(t);
 }
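
The teardown in the hunk above only works in this exact order. An
annotated condensation (names from the patch; the trigger bookkeeping
and the unlock site are restated from the surrounding function as the
hunk's own comments describe them):

	/* 1. Unpublish under trigger_lock so no new lookups find the task. */
	task_to_destroy = rcu_dereference_protected(group->poll_task,
			lockdep_is_held(&group->trigger_lock));
	rcu_assign_pointer(group->poll_task, NULL);
	mutex_unlock(&group->trigger_lock);

	/* 2. Wait for psi_schedule_poll_work() readers still inside their
	 *    rcu_read_lock() sections to finish. */
	synchronize_rcu();

	/* 3. One of those readers may have armed the timer: disarm it and
	 *    wait for a concurrently running poll_timer_fn() to complete. */
	del_timer_sync(&group->poll_timer);

	/* 4. Nothing can wake the worker via the timer anymore;
	 *    kthread_stop() itself wakes the thread, so the
	 *    wait_event_interruptible() in psi_poll_worker() exits via
	 *    kthread_should_stop(). */
	kthread_stop(task_to_destroy);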