sched/fair: Rework find_idlest_group() (57abff06) · Commits · 戴 / test

kernel/sched/fair.c

+256 −128

Original line number	Diff line number	Diff line
		@@ -5531,127 +5531,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
		return target;
		}

		static unsigned long cpu_util_without(int cpu, struct task_struct *p);

		/*
		 * Return capacity_of(cpu) minus cpu_util_without(cpu, p), clamped so that
		 * the result is never negative.
		 */
		static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
		{
			/* Evaluate the difference as a signed value so a deficit is visible. */
			long spare = capacity_of(cpu) - cpu_util_without(cpu, p);

			if (spare < 0)
				return 0;

			return spare;
		}

		/*
		* find_idlest_group finds and returns the least busy CPU group within the
		* domain.
		*
		* Assumes p is allowed on at least one CPU in sd.
		*/
		static struct sched_group *
		find_idlest_group(struct sched_domain sd, struct task_struct p,
		int this_cpu, int sd_flag)
		{
		struct sched_group idlest = NULL, group = sd->groups;
		struct sched_group *most_spare_sg = NULL;
		unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
		unsigned long most_spare = 0, this_spare = 0;
		int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
		unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
		(sd->imbalance_pct-100) / 100;

		do {
		unsigned long load;
		unsigned long spare_cap, max_spare_cap;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
		p->cpus_ptr))
		continue;

		local_group = cpumask_test_cpu(this_cpu,
		sched_group_span(group));

		/*
		* Tally up the load of all CPUs in the group and find
		* the group containing the CPU with most spare capacity.
		*/
		load = 0;
		max_spare_cap = 0;

		for_each_cpu(i, sched_group_span(group)) {
		load += cpu_load(cpu_rq(i));

		spare_cap = capacity_spare_without(i, p);

		if (spare_cap > max_spare_cap)
		max_spare_cap = spare_cap;
		}

		/* Adjust by relative CPU capacity of the group */
		load = (load * SCHED_CAPACITY_SCALE) /
		group->sgc->capacity;

		if (local_group) {
		this_load = load;
		this_spare = max_spare_cap;
		} else {
		if (load < min_load) {
		min_load = load;
		idlest = group;
		}

		if (most_spare < max_spare_cap) {
		most_spare = max_spare_cap;
		most_spare_sg = group;
		}
		}
		} while (group = group->next, group != sd->groups);

		/*
		* The cross-over point between using spare capacity or least load
		* is too conservative for high utilization tasks on partially
		* utilized systems if we require spare_capacity > task_util(p),
		* so we allow for some task stuffing by using
		* spare_capacity > task_util(p)/2.
		*
		* Spare capacity can't be used for fork because the utilization has
		* not been set yet, we must first select a rq to compute the initial
		* utilization.
		*/
		if (sd_flag & SD_BALANCE_FORK)
		goto skip_spare;

		if (this_spare > task_util(p) / 2 &&
		imbalance_scalethis_spare > 100most_spare)
		return NULL;

		if (most_spare > task_util(p) / 2)
		return most_spare_sg;

		skip_spare:
		if (!idlest)
		return NULL;

		/*
		* When comparing groups across NUMA domains, it's possible for the
		* local domain to be very lightly loaded relative to the remote
		* domains but "imbalance" skews the comparison making remote CPUs
		* look much more favourable. When considering cross-domain, add
		* imbalance to the load on the remote node and consider staying
		* local.
		*/
		if ((sd->flags & SD_NUMA) &&
		min_load + imbalance >= this_load)
		return NULL;

		if (min_load >= this_load + imbalance)
		return NULL;

		if ((this_load < (min_load + imbalance)) &&
		(100this_load < imbalance_scalemin_load))
		return NULL;

		return idlest;
		}
		int this_cpu, int sd_flag);

		/*
		* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
		@@ -5724,7 +5606,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
		return prev_cpu;

		/*
		* We need task's util for capacity_spare_without, sync it up to
		* We need task's util for cpu_util_without, sync it up to
		* prev_cpu's last_update_time.
		*/
		if (!(sd_flag & SD_BALANCE_FORK))
		@@ -7905,13 +7787,13 @@ static inline int sg_imbalanced(struct sched_group *group)
		* any benefit for the load balance.
		*/
		static inline bool
		group_has_capacity(struct lb_env env, struct sg_lb_stats sgs)
		group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
		{
		if (sgs->sum_nr_running < sgs->group_weight)
		return true;

		if ((sgs->group_capacity * 100) >
		(sgs->group_util * env->sd->imbalance_pct))
		(sgs->group_util * imbalance_pct))
		return true;

		return false;
		@@ -7926,13 +7808,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
		* false.
		*/
		static inline bool
		group_is_overloaded(struct lb_env env, struct sg_lb_stats sgs)
		group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
		{
		if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

		if ((sgs->group_capacity * 100) <
		(sgs->group_util * env->sd->imbalance_pct))
		(sgs->group_util * imbalance_pct))
		return true;

		return false;
		@@ -7959,11 +7841,11 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
		}

		static inline enum
		group_type group_classify(struct lb_env *env,
		group_type group_classify(unsigned int imbalance_pct,
		struct sched_group *group,
		struct sg_lb_stats *sgs)
		{
		if (group_is_overloaded(env, sgs))
		if (group_is_overloaded(imbalance_pct, sgs))
		return group_overloaded;

		if (sg_imbalanced(group))
		@@ -7975,7 +7857,7 @@ group_type group_classify(struct lb_env *env,
		if (sgs->group_misfit_task_load)
		return group_misfit_task;

		if (!group_has_capacity(env, sgs))
		if (!group_has_capacity(imbalance_pct, sgs))
		return group_fully_busy;

		return group_has_spare;
		@@ -8076,7 +7958,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,

		sgs->group_weight = group->group_weight;

		sgs->group_type = group_classify(env, group, sgs);
		sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);

		/* Computing avg_load makes sense only when group is overloaded */
		if (sgs->group_type == group_overloaded)
		@@ -8231,6 +8113,252 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
		}
		#endif /* CONFIG_NUMA_BALANCING */


		struct sg_lb_stats;

		/*
		* update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
		* @denv: The ched_domain level to look for idlest group.
		* @group: sched_group whose statistics are to be updated.
		* @sgs: variable to hold the statistics for this group.
		*/
		static inline void update_sg_wakeup_stats(struct sched_domain *sd,
		struct sched_group *group,
		struct sg_lb_stats *sgs,
		struct task_struct *p)
		{
		int i, nr_running;

		memset(sgs, 0, sizeof(*sgs));

		for_each_cpu(i, sched_group_span(group)) {
		struct rq *rq = cpu_rq(i);

		sgs->group_load += cpu_load(rq);
		sgs->group_util += cpu_util_without(i, p);
		sgs->sum_h_nr_running += rq->cfs.h_nr_running;

		nr_running = rq->nr_running;
		sgs->sum_nr_running += nr_running;

		/*
		* No need to call idle_cpu() if nr_running is not 0
		*/
		if (!nr_running && idle_cpu(i))
		sgs->idle_cpus++;


		}

		/* Check if task fits in the group */
		if (sd->flags & SD_ASYM_CPUCAPACITY &&
		!task_fits_capacity(p, group->sgc->max_capacity)) {
		sgs->group_misfit_task_load = 1;
		}

		sgs->group_capacity = group->sgc->capacity;

		sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);

		/*
		* Computing avg_load makes sense only when group is fully busy or
		* overloaded
		*/
		if (sgs->group_type < group_fully_busy)
		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
		sgs->group_capacity;
		}

		/*
		 * update_pick_idlest - decide whether @group should replace @idlest.
		 * @idlest: currently selected idlest group.
		 * @idlest_sgs: statistics recorded for @idlest.
		 * @group: candidate group.
		 * @sgs: statistics of the candidate group.
		 *
		 * Return: true if the candidate must become the new idlest group.
		 */
		static bool update_pick_idlest(struct sched_group *idlest,
					       struct sg_lb_stats *idlest_sgs,
					       struct sched_group *group,
					       struct sg_lb_stats *sgs)
		{
			/* Different types: the lower group_type wins outright. */
			if (sgs->group_type != idlest_sgs->group_type)
				return sgs->group_type < idlest_sgs->group_type;

			/*
			 * Both groups share the same type, so the tie is broken with the
			 * metric that is relevant for that type.
			 */
			switch (sgs->group_type) {
			case group_overloaded:
			case group_fully_busy:
				/* The candidate wins only with a strictly lower avg_load. */
				return sgs->avg_load < idlest_sgs->avg_load;

			case group_imbalanced:
			case group_asym_packing:
				/* Those types are not used in the slow wakeup path */
				return false;

			case group_misfit_task:
				/* The candidate wins only with a strictly higher max capacity. */
				return group->sgc->max_capacity > idlest->sgc->max_capacity;

			case group_has_spare:
				/* The candidate wins only with strictly more idle CPUs. */
				return sgs->idle_cpus > idlest_sgs->idle_cpus;
			}

			return true;
		}

		/*
		* find_idlest_group() finds and returns the least busy CPU group within the
		* domain.
		*
		* Assumes p is allowed on at least one CPU in sd.
		*/
		static struct sched_group *
		find_idlest_group(struct sched_domain sd, struct task_struct p,
		int this_cpu, int sd_flag)
		{
		struct sched_group idlest = NULL, local = NULL, *group = sd->groups;
		struct sg_lb_stats local_sgs, tmp_sgs;
		struct sg_lb_stats *sgs;
		unsigned long imbalance;
		struct sg_lb_stats idlest_sgs = {
		.avg_load = UINT_MAX,
		.group_type = group_overloaded,
		};

		imbalance = scale_load_down(NICE_0_LOAD) *
		(sd->imbalance_pct-100) / 100;

		do {
		int local_group;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
		p->cpus_ptr))
		continue;

		local_group = cpumask_test_cpu(this_cpu,
		sched_group_span(group));

		if (local_group) {
		sgs = &local_sgs;
		local = group;
		} else {
		sgs = &tmp_sgs;
		}

		update_sg_wakeup_stats(sd, group, sgs, p);

		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
		idlest = group;
		idlest_sgs = *sgs;
		}

		} while (group = group->next, group != sd->groups);


		/* There is no idlest group to push tasks to */
		if (!idlest)
		return NULL;

		/*
		* If the local group is idler than the selected idlest group
		* don't try and push the task.
		*/
		if (local_sgs.group_type < idlest_sgs.group_type)
		return NULL;

		/*
		* If the local group is busier than the selected idlest group
		* try and push the task.
		*/
		if (local_sgs.group_type > idlest_sgs.group_type)
		return idlest;

		switch (local_sgs.group_type) {
		case group_overloaded:
		case group_fully_busy:
		/*
		* When comparing groups across NUMA domains, it's possible for
		* the local domain to be very lightly loaded relative to the
		* remote domains but "imbalance" skews the comparison making
		* remote CPUs look much more favourable. When considering
		* cross-domain, add imbalance to the load on the remote node
		* and consider staying local.
		*/

		if ((sd->flags & SD_NUMA) &&
		((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
		return NULL;

		/*
		* If the local group is less loaded than the selected
		* idlest group don't try and push any tasks.
		*/
		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
		return NULL;

		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
		return NULL;
		break;

		case group_imbalanced:
		case group_asym_packing:
		/* Those type are not used in the slow wakeup path */
		return NULL;

		case group_misfit_task:
		/* Select group with the highest max capacity */
		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
		return NULL;
		break;

		case group_has_spare:
		if (sd->flags & SD_NUMA) {
		#ifdef CONFIG_NUMA_BALANCING
		int idlest_cpu;
		/*
		* If there is spare capacity at NUMA, try to select
		* the preferred node
		*/
		if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
		return NULL;

		idlest_cpu = cpumask_first(sched_group_span(idlest));
		if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
		return idlest;
		#endif
		/*
		* Otherwise, keep the task on this node to stay close
		* its wakeup source and improve locality. If there is
		* a real need of migration, periodic load balance will
		* take care of it.
		*/
		if (local_sgs.idle_cpus)
		return NULL;
		}

		/*
		* Select group with highest number of idle CPUs. We could also
		* compare the utilization which is more stable but it can end
		* up that the group has less spare capacity but finally more
		* idle CPUs which means more opportunity to run task.
		*/
		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
		return NULL;
		break;
		}

		return idlest;
		}

		/**
		* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
		* @env: The load balancing environment.

Admin message