Commit d987ca1c authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull exec/proc updates from Eric Biederman:
 "This contains two significant pieces of work: the work to sort out
  proc_flush_task, and the work to solve a deadlock between strace and
  exec.

  Fixing proc_flush_task so that it no longer requires a persistent
  mount makes improvements to proc possible. The removal of the
  persistent mount solves an old regression that caused the hidepid
  mount option to only work on remount not on mount. The regression was
  found and reported by the Android folks. This further allows Alexey
  Gladkov's work making proc mount options specific to an individual
  mount of proc to move forward.

  The work on exec starts solving a long standing issue with exec that
  it takes mutexes of blocking userspace applications, which makes exec
  extremely deadlock prone. For the moment this adds a second mutex with
  a narrower scope that handles all of the easy cases. Which makes the
  tricky cases easy to spot. With a little luck the code to solve those
  deadlocks will be ready by next merge window"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (25 commits)
  signal: Extend exec_id to 64bits
  pidfd: Use new infrastructure to fix deadlocks in execve
  perf: Use new infrastructure to fix deadlocks in execve
  proc: io_accounting: Use new infrastructure to fix deadlocks in execve
  proc: Use new infrastructure to fix deadlocks in execve
  kernel/kcmp.c: Use new infrastructure to fix deadlocks in execve
  kernel: doc: remove outdated comment cred.c
  mm: docs: Fix a comment in process_vm_rw_core
  selftests/ptrace: add test cases for dead-locks
  exec: Fix a deadlock in strace
  exec: Add exec_update_mutex to replace cred_guard_mutex
  exec: Move exec_mmap right after de_thread in flush_old_exec
  exec: Move cleanup of posix timers on exec out of de_thread
  exec: Factor unshare_sighand out of de_thread and call it separately
  exec: Only compute current once in flush_old_exec
  pid: Improve the comment about waiting in zap_pid_ns_processes
  proc: Remove the now unnecessary internal mount of proc
  uml: Create a private mount of proc for mconsole
  uml: Don't consult current to find the proc_mnt in mconsole_proc
  proc: Use a list of inodes to flush from proc
  ...
parents 919dce24 d1e7fd64
Loading
Loading
Loading
Loading
+27 −1
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@
#include "mconsole_kern.h"
#include <os.h>

static struct vfsmount *proc_mnt = NULL;

static int do_unlink_socket(struct notifier_block *notifier,
			    unsigned long what, void *data)
{
@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)

void mconsole_proc(struct mc_request *req)
{
	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
	struct vfsmount *mnt = proc_mnt;
	char *buf;
	int len;
	struct file *file;
@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
	ptr += strlen("proc");
	ptr = skip_spaces(ptr);

	if (!mnt) {
		mconsole_reply(req, "Proc not available", 1, 0);
		goto out;
	}
	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
	if (IS_ERR(file)) {
		mconsole_reply(req, "Failed to open file", 1, 0);
@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
	with_console(req, stack_proc, to);
}

/*
 * Create a kernel-internal mount of procfs for mconsole's private use.
 * Stores the result in the file-local proc_mnt; returns 0 on success,
 * -ENODEV if procfs is unavailable, or the kern_mount() error.
 */
static int __init mount_proc(void)
{
	struct file_system_type *fstype;
	struct vfsmount *new_mnt;

	fstype = get_fs_type("proc");
	if (!fstype)
		return -ENODEV;

	/* kern_mount() takes its own reference; drop ours either way. */
	new_mnt = kern_mount(fstype);
	put_filesystem(fstype);
	if (IS_ERR(new_mnt))
		return PTR_ERR(new_mnt);

	proc_mnt = new_mnt;
	return 0;
}

/*
 * Changed by mconsole_setup, which is __setup, and called before SMP is
 * active.
@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
	int err;
	char file[UNIX_PATH_MAX];

	mount_proc();

	if (umid_file_name("mconsole", file, sizeof(file)))
		return -1;
	snprintf(mconsole_socket_name, sizeof(file), "%s", file);
+55 −25
Original line number Diff line number Diff line
@@ -1036,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
}
EXPORT_SYMBOL(read_code);

/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with the mutex
 * exec_update_mutex locked.
 */
static int exec_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk;
	struct mm_struct *old_mm, *active_mm;
	int ret;

	/* Notify parent that we're no longer interested in the old VM */
	tsk = current;
	old_mm = current->mm;
	exec_mm_release(tsk, old_mm);

	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
	if (ret)
		return ret;

	if (old_mm) {
		sync_mm_rss(old_mm);
		/*
@@ -1057,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm)
		down_read(&old_mm->mmap_sem);
		if (unlikely(old_mm->core_state)) {
			up_read(&old_mm->mmap_sem);
			mutex_unlock(&tsk->signal->exec_update_mutex);
			return -EINTR;
		}
	}

	task_lock(tsk);
	active_mm = tsk->active_mm;
	membarrier_exec_mmap(mm);
@@ -1215,10 +1227,22 @@ no_thread_group:
	/* we have changed execution domain */
	tsk->exit_signal = SIGCHLD;

#ifdef CONFIG_POSIX_TIMERS
	exit_itimers(sig);
	flush_itimer_signals();
#endif
	BUG_ON(!thread_group_leader(tsk));
	return 0;

killed:
	/* protects against exit_notify() and __exit_signal() */
	read_lock(&tasklist_lock);
	sig->group_exit_task = NULL;
	sig->notify_count = 0;
	read_unlock(&tasklist_lock);
	return -EAGAIN;
}


static int unshare_sighand(struct task_struct *me)
{
	struct sighand_struct *oldsighand = me->sighand;

	if (refcount_read(&oldsighand->count) != 1) {
		struct sighand_struct *newsighand;
@@ -1236,23 +1260,13 @@ no_thread_group:

		write_lock_irq(&tasklist_lock);
		spin_lock(&oldsighand->siglock);
		rcu_assign_pointer(tsk->sighand, newsighand);
		rcu_assign_pointer(me->sighand, newsighand);
		spin_unlock(&oldsighand->siglock);
		write_unlock_irq(&tasklist_lock);

		__cleanup_sighand(oldsighand);
	}

	BUG_ON(!thread_group_leader(tsk));
	return 0;

killed:
	/* protects against exit_notify() and __exit_signal() */
	read_lock(&tasklist_lock);
	sig->group_exit_task = NULL;
	sig->notify_count = 0;
	read_unlock(&tasklist_lock);
	return -EAGAIN;
}

char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
@@ -1286,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
 */
int flush_old_exec(struct linux_binprm * bprm)
{
	struct task_struct *me = current;
	int retval;

	/*
	 * Make sure we have a private signal table and that
	 * we are unassociated from the previous thread group.
	 * Make this the only thread in the thread group.
	 */
	retval = de_thread(current);
	retval = de_thread(me);
	if (retval)
		goto out;

@@ -1312,18 +1326,31 @@ int flush_old_exec(struct linux_binprm * bprm)
		goto out;

	/*
	 * After clearing bprm->mm (to mark that current is using the
	 * prepared mm now), we have nothing left of the original
	 * After setting bprm->called_exec_mmap (to mark that current is
	 * using the prepared mm now), we have nothing left of the original
	 * process. If anything from here on returns an error, the check
	 * in search_binary_handler() will SEGV current.
	 */
	bprm->called_exec_mmap = 1;
	bprm->mm = NULL;

#ifdef CONFIG_POSIX_TIMERS
	exit_itimers(me->signal);
	flush_itimer_signals();
#endif

	/*
	 * Make the signal table private.
	 */
	retval = unshare_sighand(me);
	if (retval)
		goto out;

	set_fs(USER_DS);
	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
					PF_NOFREEZE | PF_NO_SETAFFINITY);
	flush_thread();
	current->personality &= ~bprm->per_clear;
	me->personality &= ~bprm->per_clear;

	/*
	 * We have to apply CLOEXEC before we change whether the process is
@@ -1331,7 +1358,7 @@ int flush_old_exec(struct linux_binprm * bprm)
	 * trying to access the should-be-closed file descriptors of a process
	 * undergoing exec(2).
	 */
	do_close_on_exec(current->files);
	do_close_on_exec(me->files);
	return 0;

out:
@@ -1412,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm)

	/* An exec changes our domain. We are no longer part of the thread
	   group */
	current->self_exec_id++;
	WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
	flush_signal_handlers(current, 0);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1450,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm)
{
	free_arg_pages(bprm);
	if (bprm->cred) {
		if (bprm->called_exec_mmap)
			mutex_unlock(&current->signal->exec_update_mutex);
		mutex_unlock(&current->signal->cred_guard_mutex);
		abort_creds(bprm->cred);
	}
@@ -1499,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm)
	 * credentials; any time after this it may be unlocked.
	 */
	security_bprm_committed_creds(bprm);
	mutex_unlock(&current->signal->exec_update_mutex);
	mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -1690,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm)

		read_lock(&binfmt_lock);
		put_binfmt(fmt);
		if (retval < 0 && !bprm->mm) {
		if (retval < 0 && bprm->called_exec_mmap) {
			/* we got to flush_old_exec() and failed after it */
			read_unlock(&binfmt_lock);
			force_sigsegv(SIGSEGV);
+41 −80
Original line number Diff line number Diff line
@@ -405,11 +405,11 @@ print0:

static int lock_trace(struct task_struct *task)
{
	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
	int err = mutex_lock_killable(&task->signal->exec_update_mutex);
	if (err)
		return err;
	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
		mutex_unlock(&task->signal->cred_guard_mutex);
		mutex_unlock(&task->signal->exec_update_mutex);
		return -EPERM;
	}
	return 0;
@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)

static void unlock_trace(struct task_struct *task)
{
	mutex_unlock(&task->signal->cred_guard_mutex);
	mutex_unlock(&task->signal->exec_update_mutex);
}

#ifdef CONFIG_STACKTRACE
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
	*rgid = gid;
}

/*
 * proc_pid_evict_inode - detach a proc inode from its struct pid.
 * @ei: the proc inode being evicted
 *
 * Directory inodes were linked onto the pid's inode list (under
 * wait_pidfd.lock) by proc_pid_make_inode(); unhook them here.
 * Finally drop the pid reference the inode held.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
	if (S_ISDIR(ei->vfs_inode.i_mode)) {
		spin_lock(&ei->pid->wait_pidfd.lock);
		hlist_del_init_rcu(&ei->sibling_inodes);
		spin_unlock(&ei->pid->wait_pidfd.lock);
	}

	put_pid(ei->pid);
}

struct inode *proc_pid_make_inode(struct super_block * sb,
				  struct task_struct *task, umode_t mode)
{
	struct inode * inode;
	struct proc_inode *ei;
	struct pid *pid;

	/* We need a new inode */

@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
	/*
	 * grab the reference to task.
	 */
	ei->pid = get_task_pid(task, PIDTYPE_PID);
	if (!ei->pid)
	pid = get_task_pid(task, PIDTYPE_PID);
	if (!pid)
		goto out_unlock;

	/* Let the pid remember us for quick removal */
	ei->pid = pid;
	if (S_ISDIR(mode)) {
		spin_lock(&pid->wait_pidfd.lock);
		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
		spin_unlock(&pid->wait_pidfd.lock);
	}

	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
	security_task_to_inode(task, inode);

@@ -2861,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
	unsigned long flags;
	int result;

	result = mutex_lock_killable(&task->signal->cred_guard_mutex);
	result = mutex_lock_killable(&task->signal->exec_update_mutex);
	if (result)
		return result;

@@ -2897,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
	result = 0;

out_unlock:
	mutex_unlock(&task->signal->cred_guard_mutex);
	mutex_unlock(&task->signal->exec_update_mutex);
	return result;
}

@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
	.permission	= proc_pid_permission,
};

/*
 * proc_flush_task_mnt - drop cached /proc dentries for one pid on one mount.
 * @mnt:  the proc mount to scrub
 * @pid:  pid number (in @mnt's namespace) whose entries should go
 * @tgid: pid number of the thread-group leader in the same namespace
 *
 * Invalidates /proc/@pid and, when @pid is not the group leader,
 * /proc/@tgid/task/@pid.  Best-effort: a lookup miss simply means
 * there is nothing cached to flush.
 */
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
	struct dentry *dentry, *leader, *dir;
	char buf[10 + 1];	/* 10 digits covers any 32-bit pid, plus NUL */
	struct qstr name;

	/* Invalidate /proc/<pid> */
	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%u", pid);
	/* no ->d_hash() rejects on procfs */
	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	/* A group leader has no /proc/<tgid>/task/<pid> alias of its own. */
	if (pid == tgid)
		return;

	/* Walk /proc/<tgid>/task/<pid>, bailing out at the first miss. */
	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
	leader = d_hash_and_lookup(mnt->mnt_root, &name);
	if (!leader)
		goto out;

	name.name = "task";
	name.len = strlen(name.name);
	dir = d_hash_and_lookup(leader, &name);
	if (!dir)
		goto out_put_leader;

	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%u", pid);
	dentry = d_hash_and_lookup(dir, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	/* Release the references taken by the successful lookups above. */
	dput(dir);
out_put_leader:
	dput(leader);
out:
	return;
}

/**
 * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
 * @task: task that should be flushed.
 * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
 * @pid: pid that should be flushed.
 *
 * When flushing dentries from proc, one needs to flush them from global
 * proc (proc_mnt) and from all the namespaces' procs this task was seen
 * in. This call is supposed to do all of this job.
 *
 * Looks in the dcache for
 * /proc/@pid
 * /proc/@tgid/task/@pid
 * if either directory is present, flushes it and all of its children
 * from the dcache.
 * This function walks a list of inodes (that belong to any proc
 * filesystem) that are attached to the pid and flushes them from
 * the dentry cache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries at process exit time.
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries when a process is reaped.
 *
 * NOTE: This routine is just an optimization so it does not guarantee
 *       that no dcache entries will exist at process exit time it
 *       just makes it very unlikely that any will persist.
 *       that no dcache entries will exist after a process is reaped
 *       it just makes it very unlikely that any will persist.
 */

void proc_flush_task(struct task_struct *task)
void proc_flush_pid(struct pid *pid)
{
	int i;
	struct pid *pid, *tgid;
	struct upid *upid;

	pid = task_pid(task);
	tgid = task_tgid(task);

	for (i = 0; i <= pid->level; i++) {
		upid = &pid->numbers[i];
		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
					tgid->numbers[i].nr);
	}
	proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
	put_pid(pid);
}

static struct dentry *proc_pid_instantiate(struct dentry * dentry,
+68 −5
Original line number Diff line number Diff line
@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
{
	struct proc_dir_entry *de;
	struct ctl_table_header *head;
	struct proc_inode *ei = PROC_I(inode);

	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);

	/* Stop tracking associated processes */
	put_pid(PROC_I(inode)->pid);
	if (ei->pid) {
		proc_pid_evict_inode(ei);
		ei->pid = NULL;
	}

	/* Let go of any associated proc directory entry */
	de = PDE(inode);
	if (de)
	de = ei->pde;
	if (de) {
		pde_put(de);
		ei->pde = NULL;
	}

	head = PROC_I(inode)->sysctl;
	head = ei->sysctl;
	if (head) {
		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
		RCU_INIT_POINTER(ei->sysctl, NULL);
		proc_sys_evict_inode(inode, head);
	}
}
@@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
	ei->pde = NULL;
	ei->sysctl = NULL;
	ei->sysctl_entry = NULL;
	INIT_HLIST_NODE(&ei->sibling_inodes);
	ei->ns_ops = NULL;
	return &ei->vfs_inode;
}
@@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
	BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}

/*
 * proc_invalidate_siblings_dcache - flush dentries of every inode on a list.
 * @inodes: hlist of proc inodes linked through proc_inode.sibling_inodes
 * @lock:   spinlock protecting modifications to @inodes
 *
 * Repeatedly takes the first inode off the list (under @lock) and
 * invalidates its dentries.  The list is traversed under RCU; before
 * dropping the RCU read lock for the dcache work, both the inode's
 * superblock (atomic_inc_not_zero on s_active) and the inode itself
 * (igrab) are pinned.  The superblock reference is carried across
 * iterations so consecutive inodes on the same sb avoid a redundant
 * pin/deactivate cycle.
 */
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
	struct inode *inode;
	struct proc_inode *ei;
	struct hlist_node *node;
	struct super_block *old_sb = NULL;

	rcu_read_lock();
	for (;;) {
		struct super_block *sb;
		node = hlist_first_rcu(inodes);
		if (!node)
			break;
		ei = hlist_entry(node, struct proc_inode, sibling_inodes);
		/* Unhash under @lock; re-initialized so eviction sees it gone. */
		spin_lock(lock);
		hlist_del_init_rcu(&ei->sibling_inodes);
		spin_unlock(lock);

		inode = &ei->vfs_inode;
		sb = inode->i_sb;
		/* Skip inodes whose superblock is already being torn down. */
		if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
			continue;
		inode = igrab(inode);
		rcu_read_unlock();
		/* Swap the cached sb reference only after leaving RCU. */
		if (sb != old_sb) {
			if (old_sb)
				deactivate_super(old_sb);
			old_sb = sb;
		}
		/* igrab() fails if the inode is already being freed. */
		if (unlikely(!inode)) {
			rcu_read_lock();
			continue;
		}

		if (S_ISDIR(inode->i_mode)) {
			/* A directory has at most one alias. */
			struct dentry *dir = d_find_any_alias(inode);
			if (dir) {
				d_invalidate(dir);
				dput(dir);
			}
		} else {
			/* Non-directories may have several aliases; drop each. */
			struct dentry *dentry;
			while ((dentry = d_find_alias(inode))) {
				d_invalidate(dentry);
				dput(dentry);
			}
		}
		iput(inode);

		rcu_read_lock();
	}
	rcu_read_unlock();
	if (old_sb)
		deactivate_super(old_sb);
}

static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
	struct super_block *sb = root->d_sb;
+3 −1
Original line number Diff line number Diff line
@@ -91,7 +91,7 @@ struct proc_inode {
	struct proc_dir_entry *pde;
	struct ctl_table_header *sysctl;
	struct ctl_table *sysctl_entry;
	struct hlist_node sysctl_inodes;
	struct hlist_node sibling_inodes;
	const struct proc_ns_operations *ns_ops;
	struct inode vfs_inode;
} __randomize_layout;
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
@@ -210,6 +211,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
Loading