Commit 502b24c2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull cgroup changes from Tejun Heo:
 "Nothing too drastic.

   - Removal of synchronize_rcu() from userland visible paths.

   - Various fixes and cleanups from Li.

   - cgroup_rightmost_descendant() added which will be used by cpuset
     changes (it will be a separate pull request)."

* 'for-3.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: fail if monitored file and event_control are in different cgroup
  cgroup: fix cgroup_rmdir() vs close(eventfd) race
  cpuset: fix cpuset_print_task_mems_allowed() vs rename() race
  cgroup: fix exit() vs rmdir() race
  cgroup: remove bogus comments in cgroup_diput()
  cgroup: remove synchronize_rcu() from cgroup_diput()
  cgroup: remove duplicate RCU free on struct cgroup
  sched: remove redundant NULL cgroup check in task_group_path()
  sched: split out css_online/css_offline from tg creation/destruction
  cgroup: initialize cgrp->dentry before css_alloc()
  cgroup: remove a NULL check in cgroup_exit()
  cgroup: fix bogus kernel warnings when cgroup_create() failed
  cgroup: remove synchronize_rcu() from rebind_subsystems()
  cgroup: remove synchronize_rcu() from cgroup_attach_{task|proc}()
  cgroup: use new hashtable implementation
  cgroups: fix cgroup_event_listener error handling
  cgroups: move cgroup_event_listener.c to tools/cgroup
  cgroup: implement cgroup_rightmost_descendant()
  cgroup: remove unused dummy cgroup_fork_callbacks()
parents ece8e0b2 f169007b
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -4,8 +4,6 @@ blkio-controller.txt
	- Description for Block IO Controller, implementation and usage details.
cgroups.txt
	- Control Groups definition, implementation details, examples and API.
cgroup_event_listener.c
	- A user program for cgroup listener.
cpuacct.txt
	- CPU Accounting Controller; account CPU usage for groups of tasks.
cpusets.txt
+1 −2
Original line number Diff line number Diff line
@@ -399,8 +399,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.

 9.10 Memory thresholds
	Memory controller implements memory thresholds using cgroups notification
	API. You can use Documentation/cgroups/cgroup_event_listener.c to test
	it.
	API. You can use tools/cgroup/cgroup_event_listener.c to test it.

	(Shell-A) Create cgroup and run event listener
	# mkdir /cgroup/A
+2 −1
Original line number Diff line number Diff line
@@ -203,6 +203,7 @@ struct cgroup {

	/* For RCU-protected deletion */
	struct rcu_head rcu_head;
	struct work_struct free_work;

	/* List of events which userspace want to receive */
	struct list_head event_list;
@@ -558,6 +559,7 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,

struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
					  struct cgroup *cgroup);
struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);

/**
 * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
@@ -706,7 +708,6 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}

+3 −0
Original line number Diff line number Diff line
@@ -2659,7 +2659,10 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
extern struct task_group root_task_group;

extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
			       struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+168 −120
Original line number Diff line number Diff line
@@ -52,7 +52,7 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	int i;
	int index;
	unsigned long tmp = 0UL;
	unsigned long key = 0UL;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		tmp += (unsigned long)css[i];
	tmp = (tmp >> 16) ^ tmp;
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	index = hash_long(tmp, CSS_SET_HASH_BITS);

	return &css_set_table[index];
	return key;
}

/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
	}

	/* This css_set is dead. unlink it and release cgroup refcounts */
	hlist_del(&cg->hlist);
	hash_del(&cg->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
		struct cgroup *cgrp = link->cgrp;
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);

		/*
		 * We may not be holding cgroup_mutex, and if cgrp->count is
		 * dropped to 0 the cgroup can be destroyed at any time, hence
		 * rcu_read_lock is used to keep it alive.
		 */
		rcu_read_lock();
		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		rcu_read_unlock();

		kfree(link);
	}
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct hlist_head *hhead;
	struct hlist_node *node;
	struct css_set *cg;
	unsigned long key;

	/*
	 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
		}
	}

	hhead = css_set_hash(template);
	hlist_for_each_entry(cg, node, hhead, hlist) {
	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cg, node, hlist, key) {
		if (!compare_css_sets(cg, oldcg, cgrp, template))
			continue;

@@ -657,8 +661,8 @@ static struct css_set *find_css_set(

	struct list_head tmp_cg_links;

	struct hlist_head *hhead;
	struct cg_cgroup_link *link;
	unsigned long key;

	/* First see if we already have a cgroup group that matches
	 * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
	css_set_count++;

	/* Add this cgroup group to the hash table */
	hhead = css_set_hash(res->subsys);
	hlist_add_head(&res->hlist, hhead);
	key = css_set_hash(res->subsys);
	hash_add(css_set_table, &res->hlist, key);

	write_unlock(&css_set_lock);

@@ -856,20 +860,10 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
	return inode;
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
static void cgroup_free_fn(struct work_struct *work)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;
	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
	struct cgroup_subsys *ss;
		BUG_ON(!(cgroup_is_removed(cgrp)));
		/* It's possible for external users to be holding css
		 * reference counts on a cgroup; css_put() needs to
		 * be able to access the cgroup after decrementing
		 * the reference count in order to know if it needs to
		 * queue the cgroup to be handled by the release
		 * agent */
		synchronize_rcu();

	mutex_lock(&cgroup_mutex);
	/*
@@ -896,7 +890,24 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
	simple_xattrs_free(&cgrp->xattrs);

	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
		kfree_rcu(cgrp, rcu_head);
	kfree(cgrp);
}

static void cgroup_free_rcu(struct rcu_head *head)
{
	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

	schedule_work(&cgrp->free_work);
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;

		BUG_ON(!(cgroup_is_removed(cgrp)));
		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
	} else {
		struct cfent *cfe = __d_cfe(dentry);
		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
	dput(parent);
}

static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	struct cfent *cfe;

	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * If we're doing cleanup due to failure of cgroup_create(),
	 * the corresponding @cfe may not exist.
	 */
	list_for_each_entry(cfe, &cgrp->files, node) {
		struct dentry *d = cfe->dentry;

@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
		list_del_init(&cfe->node);
		dput(d);

		return 0;
		break;
	}
	return -ENOENT;
}

/**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
		}
	}
	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
	synchronize_rcu();

	return 0;
}
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
	INIT_LIST_HEAD(&cgrp->allcg_node);
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->pidlists);
	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
	mutex_init(&cgrp->pidlist_mutex);
	INIT_LIST_HEAD(&cgrp->event_list);
	spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
		struct cgroupfs_root *existing_root;
		const struct cred *cred;
		int i;
		struct hlist_node *node;
		struct css_set *cg;

		BUG_ON(sb->s_root != NULL);

@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
			struct hlist_head *hhead = &css_set_table[i];
			struct hlist_node *node;
			struct css_set *cg;

			hlist_for_each_entry(cg, node, hhead, hlist)
		hash_for_each(css_set_table, i, node, cg, hlist)
			link_css_set(&tmp_cg_links, cg, root_cgrp);
		}
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
			   "cgroup_path() called without proper locking");

	if (!dentry || cgrp == dummytop) {
	if (cgrp == dummytop) {
		/*
		 * Inactive subsystems have no dentry for their root
		 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
			ss->attach(cgrp, &tset);
	}

	synchronize_rcu();
out:
	if (retval) {
		for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
	/*
	 * step 5: success! and cleanup
	 */
	synchronize_rcu();
	retval = 0;
out_put_css_set_refs:
	if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
			continue;

		if (is_add)
		if (is_add) {
			err = cgroup_add_file(cgrp, subsys, cft);
		else
			err = cgroup_rm_file(cgrp, cft);
		if (err) {
			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
				   is_add ? "add" : "remove", cft->name, err);
			if (err)
				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
					cft->name, err);
			ret = err;
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);

/**
 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
 * @pos: cgroup of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant,
 * @pos is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 */
struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
{
	struct cgroup *last, *tmp;

	WARN_ON_ONCE(!rcu_read_lock_held());

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		list_for_each_entry_rcu(tmp, &last->children, sibling)
			pos = tmp;
	} while (pos);

	return last;
}
EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);

static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
	struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
			remove);
	struct cgroup *cgrp = event->cgrp;

	remove_wait_queue(event->wqh, &event->wait);

	event->cft->unregister_event(cgrp, event->cft, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd, 1);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	dput(cgrp->dentry);
@@ -3773,16 +3812,26 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
	unsigned long flags = (unsigned long)key;

	if (flags & POLLHUP) {
		__remove_wait_queue(event->wqh, &event->wait);
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&cgrp->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
		spin_unlock(&cgrp->event_list_lock);
			/*
		 * We are in atomic context, but cgroup_event_remove() may
		 * sleep, so we have to call it in workqueue.
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&cgrp->event_list_lock);
	}

	return 0;
}
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	struct cgroup_event *event = NULL;
	struct cgroup *cgrp_cfile;
	unsigned int efd, cfd;
	struct file *efile = NULL;
	struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
		goto fail;
	}

	/*
	 * The file to be monitored must be in the same cgroup as
	 * cgroup.event_control is.
	 */
	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
	if (cgrp_cfile != cgrp) {
		ret = -EINVAL;
		goto fail;
	}

	if (!event->cft->register_event || !event->cft->unregister_event) {
		ret = -EINVAL;
		goto fail;
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,

	init_cgroup_housekeeping(cgrp);

	dentry->d_fsdata = cgrp;
	cgrp->dentry = dentry;

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
	lockdep_assert_held(&dentry->d_inode->i_mutex);

	/* allocation complete, commit to creation */
	dentry->d_fsdata = cgrp;
	cgrp->dentry = dentry;
	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
	root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace. Use
	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
	 * cgroup_event_wake() is called with the wait queue head locked,
	 * remove_wait_queue() cannot be called while holding event_list_lock.
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_splice_init(&cgrp->event_list, &tmp_list);
	spin_unlock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del_init(&event->list);
		remove_wait_queue(event->wqh, &event->wait);
		eventfd_signal(event->eventfd, 1);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);

	return 0;
}
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;
	int i, ret;
	struct hlist_node *node, *tmp;
	struct css_set *cg;
	unsigned long key;

	/* check name and function validity */
	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
	 * this is all done under the css_set_lock.
	 */
	write_lock(&css_set_lock);
	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
		struct css_set *cg;
		struct hlist_node *node, *tmp;
		struct hlist_head *bucket = &css_set_table[i], *new_bucket;

		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
	hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
		/* skip entries that we already rehashed */
		if (cg->subsys[ss->subsys_id])
			continue;
		/* remove existing entry */
			hlist_del(&cg->hlist);
		hash_del(&cg->hlist);
		/* set new value */
		cg->subsys[ss->subsys_id] = css;
		/* recompute hash and restore entry */
			new_bucket = css_set_hash(cg->subsys);
			hlist_add_head(&cg->hlist, new_bucket);
		}
		key = css_set_hash(cg->subsys);
		hash_add(css_set_table, node, key);
	}
	write_unlock(&css_set_lock);

@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
	struct cg_cgroup_link *link;
	struct hlist_head *hhead;

	BUG_ON(ss->module == NULL);

@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
	write_lock(&css_set_lock);
	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;
		unsigned long key;

		hlist_del(&cg->hlist);
		hash_del(&cg->hlist);
		cg->subsys[ss->subsys_id] = NULL;
		hhead = css_set_hash(cg->subsys);
		hlist_add_head(&cg->hlist, hhead);
		key = css_set_hash(cg->subsys);
		hash_add(css_set_table, &cg->hlist, key);
	}
	write_unlock(&css_set_lock);

@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&css_set_table[i]);

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
{
	int err;
	int i;
	struct hlist_head *hhead;
	unsigned long key;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
	}

	/* Add init_css_set to the hash table */
	hhead = css_set_hash(init_css_set.subsys);
	hlist_add_head(&init_css_set.hlist, hhead);
	key = css_set_hash(init_css_set.subsys);
	hash_add(css_set_table, &init_css_set.hlist, key);
	BUG_ON(!init_root_id(&rootnode));

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,7 +5031,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
	}
	task_unlock(tsk);

	if (cg)
	put_css_set_taskexit(cg);
}

Loading