Commit 0aecba61 authored by Linus Torvalds
Browse files
Pull vfs d_inode/d_flags memory ordering fixes from Al Viro:
 "Fallout from tree-wide audit for ->d_inode/->d_flags barriers use.
  Basically, the problem is that negative pinned dentries require
  careful treatment - unless ->d_lock is locked or parent is held at
  least shared, another thread can make them positive right under us.

  Most of the uses turned out to be safe - the main surprises as far as
  filesystems are concerned were

   - race in dget_parent() fastpath, that might end up with the caller
     observing the returned dentry _negative_, due to insufficient
     barriers. It is positive in memory, but we could end up seeing the
     wrong value of ->d_inode in CPU cache. Fixed.

   - manual checks that result of lookup_one_len_unlocked() is positive
     (and rejection of negatives). Again, insufficient barriers (we
     might end up with inconsistent observed values of ->d_inode and
     ->d_flags). Fixed by switching to a new primitive that does the
     checks itself and returns ERR_PTR(-ENOENT) instead of a negative
     dentry. That way we get rid of boilerplate converting negatives
     into ERR_PTR(-ENOENT) in the callers and have a single place to
     deal with the barrier-related mess - inside fs/namei.c rather than
     in every caller out there.

  The guts of pathname resolution *do* need to be careful - the race
  found by Ritesh is real, as well as several similar races.
  Fortunately, it turns out that we can take care of that with fairly
  local changes in there.

  The tree-wide audit had not been fun, and I hate the idea of repeating
  it. I think the right approach would be to annotate the places where
  we are _not_ guaranteed ->d_inode/->d_flags stability and have sparse
  catch regressions. But I'm still not sure what would be the least
  invasive way of doing that and it's clearly the next cycle fodder"

* 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  fs/namei.c: fix missing barriers when checking positivity
  fix dget_parent() fastpath race
  new helper: lookup_positive_unlocked()
  fs/namei.c: pull positivity check into follow_managed()
parents b0d4beaa 2fa6b1e0
Loading
Loading
Loading
Loading
+1 −6
Original line number Original line Diff line number Diff line
@@ -730,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
		struct inode *dir = d_inode(dentry);
		struct inode *dir = d_inode(dentry);
		struct dentry *child;
		struct dentry *child;


		if (!dir) {
			dput(dentry);
			dentry = ERR_PTR(-ENOENT);
			break;
		}
		if (!S_ISDIR(dir->i_mode)) {
		if (!S_ISDIR(dir->i_mode)) {
			dput(dentry);
			dput(dentry);
			dentry = ERR_PTR(-ENOTDIR);
			dentry = ERR_PTR(-ENOTDIR);
@@ -751,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
		while (*s && *s != sep)
		while (*s && *s != sep)
			s++;
			s++;


		child = lookup_one_len_unlocked(p, dentry, s - p);
		child = lookup_positive_unlocked(p, dentry, s - p);
		dput(dentry);
		dput(dentry);
		dentry = child;
		dentry = child;
	} while (!IS_ERR(dentry));
	} while (!IS_ERR(dentry));
+4 −2
Original line number Original line Diff line number Diff line
@@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
	flags = READ_ONCE(dentry->d_flags);
	flags = READ_ONCE(dentry->d_flags);
	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
	flags |= type_flags;
	flags |= type_flags;
	WRITE_ONCE(dentry->d_flags, flags);
	smp_store_release(&dentry->d_flags, flags);
}
}


static inline void __d_clear_type_and_inode(struct dentry *dentry)
static inline void __d_clear_type_and_inode(struct dentry *dentry)
@@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry)
{
{
	int gotref;
	int gotref;
	struct dentry *ret;
	struct dentry *ret;
	unsigned seq;


	/*
	/*
	 * Do optimistic parent lookup without any
	 * Do optimistic parent lookup without any
	 * locking.
	 * locking.
	 */
	 */
	rcu_read_lock();
	rcu_read_lock();
	seq = raw_seqcount_begin(&dentry->d_seq);
	ret = READ_ONCE(dentry->d_parent);
	ret = READ_ONCE(dentry->d_parent);
	gotref = lockref_get_not_zero(&ret->d_lockref);
	gotref = lockref_get_not_zero(&ret->d_lockref);
	rcu_read_unlock();
	rcu_read_unlock();
	if (likely(gotref)) {
	if (likely(gotref)) {
		if (likely(ret == READ_ONCE(dentry->d_parent)))
		if (!read_seqcount_retry(&dentry->d_seq, seq))
			return ret;
			return ret;
		dput(ret);
		dput(ret);
	}
	}
+1 −5
Original line number Original line Diff line number Diff line
@@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
	if (!parent)
	if (!parent)
		parent = debugfs_mount->mnt_root;
		parent = debugfs_mount->mnt_root;


	dentry = lookup_one_len_unlocked(name, parent, strlen(name));
	dentry = lookup_positive_unlocked(name, parent, strlen(name));
	if (IS_ERR(dentry))
	if (IS_ERR(dentry))
		return NULL;
		return NULL;
	if (!d_really_is_positive(dentry)) {
		dput(dentry);
		return NULL;
	}
	return dentry;
	return dentry;
}
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
EXPORT_SYMBOL_GPL(debugfs_lookup);
+1 −1
Original line number Original line Diff line number Diff line
@@ -223,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
			dput(dentry);
			dput(dentry);
			return ERR_PTR(-EINVAL);
			return ERR_PTR(-EINVAL);
		}
		}
		dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
		dtmp = lookup_positive_unlocked(kntmp->name, dentry,
					       strlen(kntmp->name));
					       strlen(kntmp->name));
		dput(dentry);
		dput(dentry);
		if (IS_ERR(dtmp))
		if (IS_ERR(dtmp))
+32 −24
Original line number Original line Diff line number Diff line
@@ -1210,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 * - Flagged as automount point
 * - Flagged as automount point
 *
 *
 * This may only be called in refwalk mode.
 * This may only be called in refwalk mode.
 * On success path->dentry is known positive.
 *
 *
 * Serialization is taken care of in namespace.c
 * Serialization is taken care of in namespace.c
 */
 */
static int follow_managed(struct path *path, struct nameidata *nd)
static int follow_managed(struct path *path, struct nameidata *nd)
{
{
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
	unsigned managed;
	unsigned flags;
	bool need_mntput = false;
	bool need_mntput = false;
	int ret = 0;
	int ret = 0;


	/* Given that we're not holding a lock here, we retain the value in a
	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	 * the components of that value change under us */
	while (managed = READ_ONCE(path->dentry->d_flags),
	while (flags = smp_load_acquire(&path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(flags & DCACHE_MANAGED_DENTRY)) {
	       unlikely(managed != 0)) {
		/* Allow the filesystem to manage the transit without i_mutex
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
		if (flags & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(path, false);
			ret = path->dentry->d_op->d_manage(path, false);
@@ -1237,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
		}
		}


		/* Transit to a mounted filesystem. */
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
		if (flags & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
			if (mounted) {
				dput(path->dentry);
				dput(path->dentry);
@@ -1256,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
		}
		}


		/* Handle an automount point */
		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
		if (flags & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, nd, &need_mntput);
			ret = follow_automount(path, nd, &need_mntput);
			if (ret < 0)
			if (ret < 0)
				break;
				break;
@@ -1269,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd)


	if (need_mntput && path->mnt == mnt)
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
		mntput(path->mnt);
	if (ret == -EISDIR || !ret)
		ret = 1;
	if (need_mntput)
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
		nd->flags |= LOOKUP_JUMPED;
	if (ret == -EISDIR || !ret)
		ret = 1;
	if (ret > 0 && unlikely(d_flags_negative(flags)))
		ret = -ENOENT;
	if (unlikely(ret < 0))
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
		path_put_conditional(path, nd);
	return ret;
	return ret;
@@ -1621,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd,
		dput(dentry);
		dput(dentry);
		return status;
		return status;
	}
	}
	if (unlikely(d_is_negative(dentry))) {
		dput(dentry);
		return -ENOENT;
	}


	path->mnt = mnt;
	path->mnt = mnt;
	path->dentry = dentry;
	path->dentry = dentry;
@@ -1811,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags)
		if (unlikely(err < 0))
		if (unlikely(err < 0))
			return err;
			return err;


		if (unlikely(d_is_negative(path.dentry))) {
			path_to_nameidata(&path, nd);
			return -ENOENT;
		}

		seq = 0;	/* we are already out of RCU mode */
		seq = 0;	/* we are already out of RCU mode */
		inode = d_backing_inode(path.dentry);
		inode = d_backing_inode(path.dentry);
	}
	}
@@ -2568,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
EXPORT_SYMBOL(lookup_one_len_unlocked);


/*
 * lookup_positive_unlocked - look up one name, rejecting negative dentries
 * @name: name to look up; forwarded unchanged to lookup_one_len_unlocked()
 * @base: dentry to look up in; forwarded unchanged
 * @len:  length of @name; forwarded unchanged
 *
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 *
 * When the result is negative, the reference obtained from
 * lookup_one_len_unlocked() is dropped with dput() before the error is
 * returned.  Errors from lookup_one_len_unlocked() are passed through as is.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
	/*
	 * Decide positivity from a single acquire load of ->d_flags rather
	 * than from ->d_inode.
	 * NOTE(review): presumably this pairs with the smp_store_release()
	 * of ->d_flags in __d_set_inode_and_type(), so that a dentry seen as
	 * positive here is also seen with its ->d_inode set - confirm.
	 */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
int path_pts(struct path *path)
{
{
@@ -2662,7 +2675,7 @@ mountpoint_last(struct nameidata *nd)
				return PTR_ERR(path.dentry);
				return PTR_ERR(path.dentry);
		}
		}
	}
	}
	if (d_is_negative(path.dentry)) {
	if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) {
		dput(path.dentry);
		dput(path.dentry);
		return -ENOENT;
		return -ENOENT;
	}
	}
@@ -3356,11 +3369,6 @@ static int do_last(struct nameidata *nd,
	if (unlikely(error < 0))
	if (unlikely(error < 0))
		return error;
		return error;


	if (unlikely(d_is_negative(path.dentry))) {
		path_to_nameidata(&path, nd);
		return -ENOENT;
	}

	/*
	/*
	 * create/update audit record if it already exists.
	 * create/update audit record if it already exists.
	 */
	 */
Loading