Commit 7bd95320 authored by Rik van Riel, committed by Ingo Molnar

sched/numa: Prepare for complex topology placement

Preparatory patch for adding NUMA placement on systems with
complex NUMA topology: pass the node distance between source and
destination through task_weight() and group_weight(), and record
it in task_numa_env, without using it in the weight calculations
yet.

Also fix a potential divide by zero in group_weight(): the old
code read p->numa_group->total_faults twice, once for the zero
check and once for the division, so a concurrent update could
zero the divisor in between. Read the value once into a local
variable instead.
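
To illustrate the single-read pattern, here is a minimal
userspace sketch of the fixed group_weight() logic; the struct
and function names below are simplified stand-ins for
illustration, not the kernel's own:

#include <stdio.h>

/* Simplified stand-in for struct numa_group; only the field
 * relevant to the fix is kept. */
struct numa_group_sketch {
	unsigned long total_faults;	/* updated concurrently in the kernel */
};

static unsigned long group_weight_sketch(struct numa_group_sketch *ng,
					 unsigned long faults)
{
	unsigned long total_faults;

	if (!ng)
		return 0;

	/*
	 * Read the shared counter once. Testing ng->total_faults and
	 * then dividing by a second read of it would let a concurrent
	 * update zero the divisor between the check and the division.
	 */
	total_faults = ng->total_faults;

	if (!total_faults)
		return 0;

	return 1000 * faults / total_faults;
}

int main(void)
{
	struct numa_group_sketch ng = { .total_faults = 200 };

	/* 50 of 200 faults on one node -> weight 250 out of 1000 */
	printf("%lu\n", group_weight_sketch(&ng, 50));
	return 0;
}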

Signed-off-by: Rik van Riel <riel@redhat.com>
Tested-by: Chegu Vinod <chegu_vinod@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: mgorman@suse.de
Cc: chegu_vinod@hp.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1413530994-9732-4-git-send-email-riel@redhat.com

Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent e3fe70b1
kernel/sched/fair.c (+37 −20)
@@ -931,9 +931,10 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
  * larger multiplier, in order to group tasks together that are almost
  * evenly spread out between numa nodes.
  */
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+					int dist)
 {
-	unsigned long total_faults;
+	unsigned long faults, total_faults;
 
 	if (!p->numa_faults_memory)
 		return 0;
@@ -943,15 +944,25 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
 	if (!total_faults)
 		return 0;
 
-	return 1000 * task_faults(p, nid) / total_faults;
+	faults = task_faults(p, nid);
+	return 1000 * faults / total_faults;
 }
 
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+					 int dist)
 {
-	if (!p->numa_group || !p->numa_group->total_faults)
+	unsigned long faults, total_faults;
+
+	if (!p->numa_group)
+		return 0;
+
+	total_faults = p->numa_group->total_faults;
+
+	if (!total_faults)
 		return 0;
 
-	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+	faults = group_faults(p, nid);
+	return 1000 * faults / total_faults;
 }
 
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1084,6 +1095,7 @@ struct task_numa_env {
 	struct numa_stats src_stats, dst_stats;
 
 	int imbalance_pct;
+	int dist;
 
 	struct task_struct *best_task;
 	long best_imp;
@@ -1163,6 +1175,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	long load;
 	long imp = env->p->numa_group ? groupimp : taskimp;
 	long moveimp = imp;
+	int dist = env->dist;
 
 	rcu_read_lock();
 
@@ -1196,8 +1209,8 @@ static void task_numa_compare(struct task_numa_env *env,
 		 * in any group then look only at task weights.
 		 */
 		if (cur->numa_group == env->p->numa_group) {
-			imp = taskimp + task_weight(cur, env->src_nid) -
-			      task_weight(cur, env->dst_nid);
+			imp = taskimp + task_weight(cur, env->src_nid, dist) -
+			      task_weight(cur, env->dst_nid, dist);
 			/*
 			 * Add some hysteresis to prevent swapping the
 			 * tasks within a group over tiny differences.
@@ -1211,11 +1224,11 @@ static void task_numa_compare(struct task_numa_env *env,
 			 * instead.
 			 */
 			if (cur->numa_group)
-				imp += group_weight(cur, env->src_nid) -
-				       group_weight(cur, env->dst_nid);
+				imp += group_weight(cur, env->src_nid, dist) -
+				       group_weight(cur, env->dst_nid, dist);
 			else
-				imp += task_weight(cur, env->src_nid) -
-				       task_weight(cur, env->dst_nid);
+				imp += task_weight(cur, env->src_nid, dist) -
+				       task_weight(cur, env->dst_nid, dist);
 		}
 	}

@@ -1314,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
-	int nid, ret;
+	int nid, ret, dist;
 	long taskimp, groupimp;
 
 	/*
@@ -1342,12 +1355,13 @@ static int task_numa_migrate(struct task_struct *p)
 		return -EINVAL;
 	}
 
-	taskweight = task_weight(p, env.src_nid);
-	groupweight = group_weight(p, env.src_nid);
-	update_numa_stats(&env.src_stats, env.src_nid);
 	env.dst_nid = p->numa_preferred_nid;
-	taskimp = task_weight(p, env.dst_nid) - taskweight;
-	groupimp = group_weight(p, env.dst_nid) - groupweight;
+	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+	taskweight = task_weight(p, env.src_nid, dist);
+	groupweight = group_weight(p, env.src_nid, dist);
+	update_numa_stats(&env.src_stats, env.src_nid);
+	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* Try to find a spot on the preferred nid. */
@@ -1359,12 +1373,15 @@ static int task_numa_migrate(struct task_struct *p)
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
 
+			dist = node_distance(env.src_nid, env.dst_nid);
+
 			/* Only consider nodes where both task and groups benefit */
-			taskimp = task_weight(p, nid) - taskweight;
-			groupimp = group_weight(p, nid) - groupweight;
+			taskimp = task_weight(p, nid, dist) - taskweight;
+			groupimp = group_weight(p, nid, dist) - groupweight;
 			if (taskimp < 0 && groupimp < 0)
 				continue;
 
+			env.dist = dist;
 			env.dst_nid = nid;
 			update_numa_stats(&env.dst_stats, env.dst_nid);
 			task_numa_find_cpu(&env, taskimp, groupimp);