Commit 49cb2fc4 authored by Adrian Reber, committed by Christian Brauner
Browse files

fork: extend clone3() to support setting a PID



The main motivation to add set_tid to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support *set_tid makes it possible to restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
race-free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with *set_tid as are currently in place for ns_last_pid.

The original version of this change was using a single value for
set_tid. At the 2019 LPC, after presenting set_tid, it was, however,
decided to change set_tid to an array to enable setting the PID of a
process in multiple PID namespaces at the same time. If a process is
created in a PID namespace it is possible to influence the PID inside
and outside of the PID namespace. Details also in the corresponding
selftest.

To create a process with the following PIDs:

      PID NS level         Requested PID
        0 (host)              31496
        1                        42
        2                         1

For that example the two newly introduced parameters to struct
clone_args (set_tid and set_tid_size) would need to be:

  set_tid[0] = 1;
  set_tid[1] = 42;
  set_tid[2] = 31496;
  set_tid_size = 3;

If only the PIDs of the two innermost nested PID namespaces should be
defined it would look like this:

  set_tid[0] = 1;
  set_tid[1] = 42;
  set_tid_size = 2;

The PID of the newly created process would then be the next available
free PID in the PID namespace level 0 (host) and 42 in the PID namespace
at level 1 and the PID of the process in the innermost PID namespace
would be 1.

The set_tid array is used to specify the PID of a process starting
from the innermost nested PID namespaces up to set_tid_size PID namespaces.

set_tid_size cannot be larger than the current PID namespace level.

Signed-off-by: Adrian Reber <areber@redhat.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Acked-by: Andrei Vagin <avagin@gmail.com>
Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com


Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
parent 17a81069
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
			     size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

+2 −0
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

+3 −0
Original line number Diff line number Diff line
@@ -26,6 +26,9 @@ struct kernel_clone_args {
	unsigned long stack;
	unsigned long stack_size;
	unsigned long tls;
	pid_t *set_tid;
	/* Number of elements in *set_tid */
	size_t set_tid_size;
};

/*
+35 −18
Original line number Diff line number Diff line
@@ -57,6 +57,20 @@
 * @stack_size:   The size of the stack for the child process.
 * @tls:          If CLONE_SETTLS is set, the tls descriptor
 *                is set to tls.
 * @set_tid:      Pointer to an array of type *pid_t. The size
 *                of the array is defined using @set_tid_size.
 *                This array is used to select PIDs/TIDs for
 *                newly created processes. The first element in
 *                this defines the PID in the most nested PID
 *                namespace. Each additional element in the array
 *                defines the PID in the parent PID namespace of
 *                the original PID namespace. If the array has
 *                fewer entries than the number of currently
 *                nested PID namespaces only the PIDs in the
 *                corresponding namespaces are set.
 * @set_tid_size: This defines the size of the array referenced
 *                in @set_tid. This cannot be larger than the
 *                kernel's limit of nested PID namespaces.
 *
 * The structure is versioned by size and thus extensible.
 * New struct members must go at the end of the struct and
@@ -71,10 +85,13 @@ struct clone_args {
	__aligned_u64 stack;
	__aligned_u64 stack_size;
	__aligned_u64 tls;
	__aligned_u64 set_tid;
	__aligned_u64 set_tid_size;
};
#endif

#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */

/*
 * Scheduling policies
+23 −1
Original line number Diff line number Diff line
@@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
	stackleak_task_init(p);

	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
@@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
{
	int err;
	struct clone_args args;
	pid_t *kset_tid = kargs->set_tid;

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
@@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
	if (err)
		return err;

	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
		return -EINVAL;

	if (unlikely(!args.set_tid && args.set_tid_size > 0))
		return -EINVAL;

	if (unlikely(args.set_tid && args.set_tid_size == 0))
		return -EINVAL;

	/*
	 * Verify that higher 32bits of exit_signal are unset and that
	 * it is a valid signal
@@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
		.stack		= args.stack,
		.stack_size	= args.stack_size,
		.tls		= args.tls,
		.set_tid_size	= args.set_tid_size,
	};

	if (args.set_tid &&
		copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
			(kargs->set_tid_size * sizeof(pid_t))))
		return -EFAULT;

	kargs->set_tid = kset_tid;

	return 0;
}

@@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
	int err;

	struct kernel_clone_args kargs;
	pid_t set_tid[MAX_PID_NS_LEVEL];

	kargs.set_tid = set_tid;

	err = copy_clone_args_from_user(&kargs, uargs, size);
	if (err)
Loading