Commit edaa5ddf authored by Linus Torvalds

Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.

 - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...
parents 13cb7349 feff2e65
MAINTAINERS  +1 −0
@@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
 R:	Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R:	Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R:	Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
+R:	Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
arch/arm/kernel/topology.c  +0 −26
@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
 #endif
 
-/*
- * The current assumption is that we can power gate each core independently.
- * This will be superseded by DT binding once available.
- */
-const struct cpumask *cpu_corepower_mask(int cpu)
-{
-	return &cpu_topology[cpu].thread_sibling;
-}
-
 /*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -241,20 +232,6 @@ topology_populated:
 	update_siblings_masks(cpuid);
 }
 
-static inline int cpu_corepower_flags(void)
-{
-	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
-}
-
-static struct sched_domain_topology_level arm_topology[] = {
-#ifdef CONFIG_SCHED_MC
-	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
-
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
 	smp_wmb();
 
 	parse_dt_topology();
-
-	/* Set scheduler topology descriptor */
-	set_sched_topology(arm_topology);
 }
include/linux/sched.h  +3 −2
@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
 /*
  * Per process flags
  */
 #define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
+#define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
 #define PF_MCE_PROCESS		0x00000080      /* Process policy on mce errors */
@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
-#define PF_IO_WORKER		0x20000000	/* Task is an IO worker */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
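
PF_* values are individual bits in task_struct::flags, owned and tested by the task they describe; relocating PF_IO_WORKER from 0x20000000 to the free 0x00000010 slot (the context above shows 0x08 and 0x10 unused) needs no call-site changes, since every user refers to the symbolic name. A minimal sketch of how such a flag is queried (kernel context assumed; task_is_io_worker() is a hypothetical helper, not kernel API):

static inline bool task_is_io_worker(const struct task_struct *p)
{
	/* PF_* flags are plain, non-atomic bits in p->flags. */
	return p->flags & PF_IO_WORKER;
}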

@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_cpu_capacity(struct rq *rq);
 int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
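
The new sched_trace_rq_cpu_capacity() accessor pairs with the bare sched_cpu_capacity_tp tracepoint added by "sched/debug: Add new tracepoint to track cpu_capacity": struct rq stays opaque outside the scheduler, so external probes read it back through these helpers. A minimal out-of-tree sketch, assuming the register_trace_*() functions that DECLARE_TRACE() generates for such tracepoints:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

/* Probe signature follows the tracepoint's TP_PROTO(struct rq *rq). */
static void probe_cpu_capacity(void *data, struct rq *rq)
{
	trace_printk("cpu=%d capacity=%d\n",
		     sched_trace_rq_cpu(rq),
		     sched_trace_rq_cpu_capacity(rq));
}

static int __init cap_probe_init(void)
{
	return register_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
}

static void __exit cap_probe_exit(void)
{
	unregister_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
}

module_init(cap_probe_init);
module_exit(cap_probe_exit);
MODULE_LICENSE("GPL");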
include/linux/sched/mm.h  +3 −0
@@ -348,10 +348,13 @@ enum {
 	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY	= (1U << 4),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE		= (1U << 5),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY		= (1U << 6),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ			= (1U << 7),
 };
 
 enum {
 	MEMBARRIER_FLAG_SYNC_CORE	= (1U << 0),
+	MEMBARRIER_FLAG_RSEQ		= (1U << 1),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
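
The two new MEMBARRIER_STATE_* bits track that a process has registered for, and may issue, the new MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ command, which fences rseq critical sections on the CPUs running the process. The user-space flow mirrors the added selftest: register once, then issue the barrier, either process-wide or for a single CPU via MEMBARRIER_CMD_FLAG_CPU. A minimal sketch, assuming v5.10-era UAPI headers:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Thin wrapper; glibc provides no membarrier() stub. */
static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
		return 1;
	/* Fence rseq critical sections on every CPU running this process... */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0))
		return 1;
	/* ...or restrict the IPI to a single CPU. */
	return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
			      MEMBARRIER_CMD_FLAG_CPU, 0) ? 1 : 0;
}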
include/linux/sched/sd_flags.h (new file)  +156 −0
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sched-domains (multiprocessor balancing) flag declarations.
 */

#ifndef SD_FLAG
# error "Incorrect import of SD flags definitions"
#endif

/*
 * Hierarchical metaflags
 *
 * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
 * If a domain has this flag set, all of its children should have it set. This
 * is usually because the flag describes some shared resource (all CPUs in that
 * domain share the same resource), or because they are tied to a scheduling
 * behaviour that we want to disable at some point in the hierarchy for
 * scalability reasons.
 *
 * In those cases it doesn't make sense to have the flag set for a domain but
 * not have it in (some of) its children: sched domains ALWAYS span their child
 * domains, so operations done with parent domains will cover CPUs in the lower
 * child domains.
 *
 *
 * SHARED_PARENT: These flags are meant to be set from the highest domain
 * downwards. If a domain has this flag set, all of its parents should have it
 * set. This is usually for topology properties that start to appear above a
 * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
 * socket).
 */
#define SDF_SHARED_CHILD       0x1
#define SDF_SHARED_PARENT      0x2

/*
 * Behavioural metaflags
 *
 * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
 * more than one group. This is usually for balancing flags (load balancing
 * involves equalizing a metric between groups), or for flags describing some
 * shared resource (which would be shared between groups).
 */
#define SDF_NEEDS_GROUPS       0x4
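
This header is built to be consumed as an X-macro table: an includer defines SD_FLAG(name, metaflags) to whatever expansion it needs and then pulls the file in, so the flag values, the masks of all SDF_SHARED_* flags, and the debug names are generated from this one list and cannot drift apart. A sketch close to what include/linux/sched/topology.h does with it in this series:

/* First pass: generate a zero-based index per declared flag. */
#define SD_FLAG(name, mflags) __##name,
enum {
#include <linux/sched/sd_flags.h>
	__SD_FLAG_CNT,
};
#undef SD_FLAG

/* Second pass: turn each index into the actual SD_* bit mask. */
#define SD_FLAG(name, mflags) name = 1 << __##name,
enum {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG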

/*
 * Balance when about to become idle
 *
 * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on exec
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on fork, clone
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on wakeup
 *
 * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Consider waking task on waking CPU.
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 */
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)

/*
 * Domain members have different CPU capacities
 *
 * SHARED_PARENT: Set from the topmost domain down to the first domain where
 *                asymmetry is detected.
 * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
 */
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Domain members share CPU capacity (i.e. SMT)
 *
 * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
 *               CPU capacity.
 * NEEDS_GROUPS: Capacity is shared between groups.
 */
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Domain members share CPU package resources (i.e. caches)
 *
 * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
 *               the same cache(s).
 * NEEDS_GROUPS: Caches are shared between groups.
 */
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Only a single load balancing instance
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
 *                different level upwards, but it doesn't change that if a
 *                domain has this flag set, then all of its parents need to have
 *                it too (otherwise the serialization doesn't make sense).
 * NEEDS_GROUPS: No point in preserving domain if it has a single group.
 */
SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Place busy tasks earlier in the domain
 *
 * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
 *               up, but currently assumed to be set from the base domain
 *               upwards (see update_top_cache_domain()).
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Prefer to place tasks in a sibling domain
 *
 * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
 * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
 *
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)

/*
 * sched_groups of this level overlap
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE.
 * NEEDS_GROUPS: Overlaps can only exist with more than one group.
 */
SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Cross-node balancing
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE.
 * NEEDS_GROUPS: No point in preserving domain if it has a single group.
 */
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
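
The metaflags are what make the new sanity checks possible: generic code can walk every flag together with its metadata and verify, for example, that a flag carrying SDF_SHARED_CHILD really is set on all children of any domain that has it. A sketch close to the sd_flag_debug table this series generates (see "sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL" in the commit list above):

/* Third pass over the same list: per-flag metadata and printable names,
 * indexed by the __SD_* values generated earlier. */
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
static const struct {
	unsigned int meta_flags;
	char *name;
} sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG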