Commit 6499b1b2 authored by Vincent Guittot, committed by Ingo Molnar

sched/numa: Replace runnable_load_avg by load_avg

Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg with load_avg in NUMA load balancing and track
other statistics, such as the utilization and the number of running
tasks, to get a better view of the current state of a node.
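
As an illustration (not part of the commit itself), here is a minimal
userspace sketch of the node classification introduced below; the struct
fields mirror the new numa_stats members, while the plain C types, the
main() driver and the sample values are made-up stand-ins for the
kernel's per-CPU statistics:

#include <stdio.h>

enum numa_type { node_has_spare = 0, node_fully_busy, node_overloaded };

struct numa_stats {
	unsigned long util;		/* summed cpu_util() over the node */
	unsigned long compute_capacity;	/* summed capacity_of() over the node */
	unsigned int nr_running;	/* summed rq->cfs.h_nr_running */
	unsigned int weight;		/* number of CPUs in the node */
};

/* Same decision logic as the numa_classify() added by this patch. */
static enum numa_type numa_classify(unsigned int imbalance_pct,
				    const struct numa_stats *ns)
{
	/* More tasks than CPUs and utilization above scaled capacity: overloaded. */
	if ((ns->nr_running > ns->weight) &&
	    ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
		return node_overloaded;

	/* Fewer tasks than CPUs or utilization below scaled capacity: spare. */
	if ((ns->nr_running < ns->weight) ||
	    ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
		return node_has_spare;

	return node_fully_busy;
}

int main(void)
{
	/* Hypothetical 4-CPU node running 6 tasks at full utilization. */
	struct numa_stats ns = {
		.util = 4096,
		.compute_capacity = 4096,
		.nr_running = 6,
		.weight = 4,
	};

	/* Prints 2 (node_overloaded). */
	printf("node_type = %d\n", numa_classify(125, &ns));
	return 0;
}

With an imbalance_pct of 125 (just an example value here), a node is
classified as overloaded once it has more runnable tasks than CPUs and
its summed utilization exceeds roughly 80% of its compute capacity.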

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/20200224095223.13361-6-mgorman@techsingularity.net
parent 6d4d2246
+70 −32
@@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);

static unsigned long cpu_runnable_load(struct rq *rq)
{
	return cfs_rq_runnable_load_avg(&rq->cfs);
}
/*
 * 'numa_type' describes the node at the moment of load balancing.
 */
enum numa_type {
	/* The node has spare capacity that can be used to run more tasks.  */
	node_has_spare = 0,
	/*
	 * The node is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	node_fully_busy,
	/*
	 * The node is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	node_overloaded
};

/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;

	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	unsigned int nr_running;
	unsigned int weight;
	enum numa_type node_type;
};

/*
 * XXX borrowed from update_sg_lb_stats
 */
static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int cpu;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->load += cpu_runnable_load(rq);
		ns->compute_capacity += capacity_of(cpu);
	}

}

struct task_numa_env {
	struct task_struct *p;

@@ -1521,6 +1518,47 @@ struct task_numa_env {
	int best_cpu;
};

static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_util(int cpu);

static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
			 struct numa_stats *ns)
{
	if ((ns->nr_running > ns->weight) &&
	    ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
		return node_overloaded;

	if ((ns->nr_running < ns->weight) ||
	    ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
		return node_has_spare;

	return node_fully_busy;
}

/*
 * XXX borrowed from update_sg_lb_stats
 */
static void update_numa_stats(struct task_numa_env *env,
			      struct numa_stats *ns, int nid)
{
	int cpu;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->load += cpu_load(rq);
		ns->util += cpu_util(cpu);
		ns->nr_running += rq->cfs.h_nr_running;
		ns->compute_capacity += capacity_of(cpu);
	}

	ns->weight = cpumask_weight(cpumask_of_node(nid));

	ns->node_type = numa_classify(env->imbalance_pct, ns);
}

static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
{
@@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
	long orig_src_load, orig_dst_load;
	long src_capacity, dst_capacity;


	/* If dst node has spare capacity, there is no real load imbalance */
	if (env->dst_stats.node_type == node_has_spare)
		return false;

	/*
	 * The load is corrected for the CPU capacity available on each node.
	 *
@@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
	taskweight = task_weight(p, env.src_nid, dist);
	groupweight = group_weight(p, env.src_nid, dist);
	update_numa_stats(&env.src_stats, env.src_nid);
	update_numa_stats(&env, &env.src_stats, env.src_nid);
	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
	update_numa_stats(&env.dst_stats, env.dst_nid);
	update_numa_stats(&env, &env.dst_stats, env.dst_nid);

	/* Try to find a spot on the preferred nid. */
	task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)

			env.dist = dist;
			env.dst_nid = nid;
			update_numa_stats(&env.dst_stats, env.dst_nid);
			update_numa_stats(&env, &env.dst_stats, env.dst_nid);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}
@@ -3686,11 +3729,6 @@ static void remove_entity_load_avg(struct sched_entity *se)
	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.runnable_load_avg;
}

static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.load_avg;