Commit ef2c41cf authored by Christian Brauner, committed by Tejun Heo
Browse files

clone3: allow spawning processes into cgroups



This adds support for creating a process in a different cgroup than its
parent. Callers can limit and account processes and threads right from
the moment they are spawned:
- A service manager can directly spawn new services into dedicated
  cgroups.
- A process can be directly created in a frozen cgroup and will be
  frozen as well.
- The initial accounting jitter experienced by process supervisors and
  daemons is eliminated with this.
- Threaded applications or even thread implementations can choose to
  create a specific cgroup layout where each thread is spawned
  directly into a dedicated cgroup.

This feature is limited to the unified hierarchy. Callers need to pass
a directory file descriptor for the target cgroup. The caller can
choose to pass an O_PATH file descriptor. All usual migration
restrictions apply, i.e. there can be no processes in inner nodes. In
general, creating a process directly in a target cgroup adheres to all
migration restrictions.

One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does
not need to grab the write side of the cgroup_threadgroup_rwsem.
This global lock makes moving tasks/threads around super expensive. With
clone3() this lock is avoided.

Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent f3553220
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -628,8 +628,9 @@ struct cgroup_subsys {
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_taskset *tset);
	void (*post_attach)(void);
	int (*can_fork)(struct task_struct *task);
	void (*cancel_fork)(struct task_struct *task);
	int (*can_fork)(struct task_struct *task,
			struct css_set *cset);
	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
	void (*fork)(struct task_struct *task);
	void (*exit)(struct task_struct *task);
	void (*release)(struct task_struct *task);
+14 −6
Original line number Diff line number Diff line
@@ -27,6 +27,8 @@

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
@@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p);
extern void cgroup_cancel_fork(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
			   struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
			       struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
			     struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
@@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
				    struct dentry *dentry) { return -EINVAL; }

static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
				  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
				      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
				    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
+4 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
	pid_t *set_tid;
	/* Number of elements in *set_tid */
	size_t set_tid_size;
	int cgroup;
	struct cgroup *cgrp;
	struct css_set *cset;
};

/*
+5 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@

/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */

/*
 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
 * @set_tid_size: This defines the size of the array referenced
 *                in @set_tid. This cannot be larger than the
 *                kernel's limit of nested PID namespaces.
 * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
 *                a file descriptor for the cgroup.
 *
 * The structure is versioned by size and thus extensible.
 * New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
	__aligned_u64 tls;
	__aligned_u64 set_tid;
	__aligned_u64 set_tid_size;
	__aligned_u64 cgroup;
};
#endif

#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */

/*
 * Scheduling policies
+168 −23
Original line number Diff line number Diff line
@@ -5881,8 +5881,7 @@ out:
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the parent's css_set.  Empty cg_list indicates that
 * @child isn't holding reference to its css_set.
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
	return cgrp;
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * On success @kargs->cset holds a referenced css_set and, for
 * CLONE_INTO_CGROUP, @kargs->cgrp holds the referenced target cgroup.
 * On failure every lock and reference taken here is released again.
 *
 * Return: 0 on success, a negative errno on failure (-EBADF if
 * @kargs->cgroup is not a valid file descriptor, -ENODEV if the target
 * cgroup is dead, -ENOMEM if no css_set could be found or created, or
 * the error returned by the cgroup lookup or the permission checks).
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	/* cgroup_mutex is only needed when a target cgroup was requested. */
	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	/* Pin the parent's css_set; css_set_lock protects the lookup. */
	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	/* Default case: the child simply inherits the parent's css_set. */
	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	/* fget_raw() is used so that O_PATH file descriptors are accepted. */
	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		/* Reset so that the error path does not put an ERR_PTR. */
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us. This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "manually".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	/*
	 * The last argument is true when a new process (rather than a
	 * thread) is created; presumably it selects threadgroup-wide
	 * migration rules -- confirm against cgroup_attach_permissions().
	 */
	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD));
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	/*
	 * Success: the parent's css_set and the file are no longer needed;
	 * the references on kargs->cset and dst_cgrp are handed to @kargs.
	 */
	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	/*
	 * Only reached after the !CLONE_INTO_CGROUP early return above, so
	 * cgroup_mutex is always held when we get here.
	 */
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	/*
	 * NOTE(review): no path in this function jumps here after storing a
	 * non-NULL kargs->cset, so this looks like it only triggers if the
	 * caller passed in a non-NULL cset -- confirm.
	 */
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Undo cgroup_css_set_fork(): release cgroup_threadgroup_rwsem, drop the
 * reference to the prepared css_set if @kargs still owns one and, if
 * CLONE_INTO_CGROUP was requested, release cgroup_mutex and drop the
 * reference to the target cgroup.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	struct cgroup *cgrp = kargs->cgrp;
	struct css_set *cset = kargs->cset;

	cgroup_threadgroup_change_end(current);

	/*
	 * cgroup_css_set_fork() stores a referenced css_set in kargs->cset
	 * whether or not CLONE_INTO_CGROUP was requested, so the reference
	 * must be dropped unconditionally. Otherwise it leaks whenever an
	 * ordinary fork fails after cgroup_css_set_fork() succeeded.
	 * cgroup_post_fork() clears kargs->cset once it has taken over the
	 * reference, so nothing is put twice on the success path.
	 */
	if (cset) {
		put_css_set(cset);
		kargs->cset = NULL;
	}

	/* cgroup_mutex and the cgroup reference exist only for this flag. */
	if (kargs->flags & CLONE_INTO_CGROUP) {
		mutex_unlock(&cgroup_mutex);

		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 *
 * This prepares a new css_set for the child process which the child will
 * be attached to in cgroup_post_fork().
 * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
 * callback returns an error, the fork aborts with that error code. This
 * allows for a cgroup subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child)
	__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	cgroup_threadgroup_change_begin(current);
	ret = cgroup_css_set_fork(kargs);
	if (ret)
		return ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child);
		ret = ss->can_fork(child, kargs->cset);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();
@@ -5937,10 +6066,10 @@ out_revert:
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child);
			ss->cancel_fork(child, kargs->cset);
	}

	cgroup_threadgroup_change_end(current);
	cgroup_css_set_put_fork(kargs);

	return ret;
}
@@ -5948,21 +6077,23 @@ out_revert:
/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
  * cgroup_can_fork() succeeded.
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child)
	__releases(&cgroup_threadgroup_rwsem)
void cgroup_cancel_fork(struct task_struct *child,
			struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child);
			ss->cancel_fork(child, kargs->cset);

	cgroup_threadgroup_change_end(current);
	cgroup_css_set_put_fork(kargs);
}

/**
@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 */
void cgroup_post_fork(struct task_struct *child)
	__releases(&cgroup_threadgroup_rwsem)
void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	cset = kargs->cset;
	kargs->cset = NULL;

	spin_lock_irq(&css_set_lock);

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset = task_css_set(current); /* current is @child's parent */
		get_css_set(cset);
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
	} else {
		put_css_set(cset);
		cset = NULL;
	}

	/*
@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
		ss->fork(child);
	} while_each_subsys_mask();

	cgroup_threadgroup_change_end(current);
	/* Make the new cset the root_cset of the new cgroup namespace. */
	if (kargs->flags & CLONE_NEWCGROUP) {
		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

		get_css_set(cset);
		child->nsproxy->cgroup_ns->root_cset = cset;
		put_css_set(rcset);
	}

	cgroup_css_set_put_fork(kargs);
}

/**
Loading