Commit 0011572c authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull cgroup fixes from Tejun Heo:
 "This has an unusually high density of tricky fixes:

   - task_get_css() could deadlock when it races against a dying cgroup.

   - cgroup.procs didn't list thread group leaders with live threads.

     This could mislead readers to think that a cgroup is empty when
     it's not. Fixed by making PROCS iterator include dead tasks. I made
     a couple mistakes making this change and this pull request contains
     a couple follow-up patches.

   - When cpusets run out of online cpus, it updates cpusmasks of member
     tasks in bizarre ways. Joel improved the behavior significantly"

* 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: restore sanity to cpuset_cpus_allowed_fallback()
  cgroup: Fix css_task_iter_advance_css_set() cset skip condition
  cgroup: css_task_iter_skip()'d iterators must be advanced before accessed
  cgroup: Include dying leaders with live threads in PROCS iterations
  cgroup: Implement css_task_iter_skip()
  cgroup: Call cgroup_release() before __exit_signal()
  docs cgroups: add another example size for hugetlb
  cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()
parents 6aa7a22b d477f8c2
Loading
Loading
Loading
Loading
+13 −9
Original line number Diff line number Diff line
@@ -32,14 +32,18 @@ Brief summary of control files
 hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
 hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit

For a system supporting two hugepage size (16M and 16G) the control
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include:

hugetlb.16GB.limit_in_bytes
hugetlb.16GB.max_usage_in_bytes
hugetlb.16GB.usage_in_bytes
hugetlb.16GB.failcnt
hugetlb.16MB.limit_in_bytes
hugetlb.16MB.max_usage_in_bytes
hugetlb.16MB.usage_in_bytes
hugetlb.16MB.failcnt
hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes
hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt
hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes
hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt
hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt
+1 −0
Original line number Diff line number Diff line
@@ -221,6 +221,7 @@ struct css_set {
	 */
	struct list_head tasks;
	struct list_head mg_tasks;
	struct list_head dying_tasks;

	/* all css_task_iters currently walking this cset */
	struct list_head task_iters;
+12 −2
Original line number Diff line number Diff line
@@ -43,6 +43,9 @@
/* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED		(1U << 1)

/* internal flags */
#define CSS_TASK_ITER_SKIPPED		(1U << 16)

/* a css_task_iter should be treated as an opaque object */
struct css_task_iter {
	struct cgroup_subsys		*ss;
@@ -57,6 +60,7 @@ struct css_task_iter {
	struct list_head		*task_pos;
	struct list_head		*tasks_head;
	struct list_head		*mg_tasks_head;
	struct list_head		*dying_tasks_head;

	struct css_set			*cur_cset;
	struct css_set			*cur_dcset;
@@ -487,7 +491,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 *
 * Find the css for the (@task, @subsys_id) combination, increment a
 * reference on and return it.  This function is guaranteed to return a
 * valid css.
 * valid css.  The returned css may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
@@ -497,7 +501,13 @@ task_get_css(struct task_struct *task, int subsys_id)
	rcu_read_lock();
	while (true) {
		css = task_css(task, subsys_id);
		if (likely(css_tryget_online(css)))
		/*
		 * Can't use css_tryget_online() here.  A task which has
		 * PF_EXITING set may stay associated with an offline css.
		 * If such task calls this function, css_tryget_online()
		 * will keep failing.
		 */
		if (likely(css_tryget(css)))
			break;
		cpu_relax();
	}
+76 −30
Original line number Diff line number Diff line
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
		cgroup_update_populated(link->cgrp, populated);
}

/*
 * @task is leaving, advance task iterators which are pointing to it so
 * that they can resume at the next position.  Advancing an iterator might
 * remove it from the list, use safe walk.  See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, use safe walk.  See css_task_iter_advance*()
		 * for details.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
@@ -4408,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
			it->task_pos = NULL;
			return;
		}
	} while (!css_set_populated(cset));
	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));

	if (!list_empty(&cset->tasks))
		it->task_pos = cset->tasks.next;
	else
	else if (!list_empty(&cset->mg_tasks))
		it->task_pos = cset->mg_tasks.next;
	else
		it->task_pos = cset->dying_tasks.next;

	it->tasks_head = &cset->tasks;
	it->mg_tasks_head = &cset->mg_tasks;
	it->dying_tasks_head = &cset->dying_tasks;

	/*
	 * We don't keep css_sets locked across iteration steps and thus
@@ -4442,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
	list_add(&it->iters_node, &cset->task_iters);
}

static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task)
{
	lockdep_assert_held(&css_set_lock);

	if (it->task_pos == &task->cg_list) {
		it->task_pos = it->task_pos->next;
		it->flags |= CSS_TASK_ITER_SKIPPED;
	}
}

static void css_task_iter_advance(struct css_task_iter *it)
{
	struct list_head *next;
	struct task_struct *task;

	lockdep_assert_held(&css_set_lock);
repeat:
@@ -4454,26 +4473,41 @@ repeat:
		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
		 * we move onto the next cset.
		 */
		next = it->task_pos->next;

		if (next == it->tasks_head)
			next = it->mg_tasks_head->next;
		if (it->flags & CSS_TASK_ITER_SKIPPED)
			it->flags &= ~CSS_TASK_ITER_SKIPPED;
		else
			it->task_pos = it->task_pos->next;

		if (next == it->mg_tasks_head)
		if (it->task_pos == it->tasks_head)
			it->task_pos = it->mg_tasks_head->next;
		if (it->task_pos == it->mg_tasks_head)
			it->task_pos = it->dying_tasks_head->next;
		if (it->task_pos == it->dying_tasks_head)
			css_task_iter_advance_css_set(it);
		else
			it->task_pos = next;
	} else {
		/* called from start, proceed to the first cset */
		css_task_iter_advance_css_set(it);
	}

	if (!it->task_pos)
		return;

	task = list_entry(it->task_pos, struct task_struct, cg_list);

	if (it->flags & CSS_TASK_ITER_PROCS) {
		/* if PROCS, skip over tasks which aren't group leaders */
	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
	    !thread_group_leader(list_entry(it->task_pos, struct task_struct,
					    cg_list)))
		if (!thread_group_leader(task))
			goto repeat;

		/* and dying leaders w/o live member threads */
		if (!atomic_read(&task->signal->live))
			goto repeat;
	} else {
		/* skip all dying ones */
		if (task->flags & PF_EXITING)
			goto repeat;
	}
}

/**
 * css_task_iter_start - initiate task iteration
@@ -4528,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)

	spin_lock_irq(&css_set_lock);

	/* @it may be half-advanced by skips, finish advancing */
	if (it->flags & CSS_TASK_ITER_SKIPPED)
		css_task_iter_advance(it);

	if (it->task_pos) {
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
@@ -6009,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
	if (!list_empty(&tsk->cg_list)) {
		spin_lock_irq(&css_set_lock);
		css_set_move_task(tsk, cset, NULL, false);
		list_add_tail(&tsk->cg_list, &cset->dying_tasks);
		cset->nr_tasks--;

		WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -6034,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();

	if (use_task_css_set_links) {
		spin_lock_irq(&css_set_lock);
		css_set_skip_task_iters(task_css_set(task), task);
		list_del_init(&task->cg_list);
		spin_unlock_irq(&css_set_lock);
	}
}

void cgroup_free(struct task_struct *task)
+14 −1
Original line number Diff line number Diff line
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
	spin_unlock_irqrestore(&callback_lock, flags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 **/

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	rcu_read_lock();
	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
	rcu_read_unlock();

	/*
Loading