Commit 0ffaa9c8 authored by Henri Doreau, committed by Greg Kroah-Hartman
Browse files

staging: lustre: hsm: Use file lease to implement migration



Implement non-blocking migration based on exclusive open instead of
group lock. Implement an exclusive close operation that atomically puts
a lease, swaps two layouts and closes a file. This allows race-free
migrations.

Make the caller responsible for retrying on failure (EBUSY, EAGAIN)
in non-blocking mode.

In blocking mode, allow applications to trigger layout swaps using a
grouplock they already own, to prevent race conditions between the
actual data copy and the layout swap. Updated lfs accordingly. File
leases are also taken in blocking mode, so that lfs migrate can issue
a warning if an application attempts to open a file that is being
migrated and gets blocked.

Timestamps (atime/mtime) are set from userland, after the layout swap
is performed, to prevent conflicts with the grouplock.

lli_trunc_sem is taken/released in the vvp_io layer, under the DLM
lock. This re-ordering fixes the original issue between truncate and
migrate.

Signed-off-by: Henri Doreau <henri.doreau@cea.fr>
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4840
Reviewed-on: http://review.whamcloud.com/10013


Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: frank zago <fzago@cray.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent b73d803b
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -1604,7 +1604,9 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic)
/*	OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) lfs rgetfacl, obsolete */

#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
#define OBD_MD_FLRELEASED    (0x0020000000000000ULL) /* file released */
#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent
							    * executed
							    */

#define OBD_MD_DEFAULT_MEA   (0x0040000000000000ULL) /* default MEA */

@@ -2139,6 +2141,7 @@ enum mds_op_bias {
	MDS_OWNEROVERRIDE	= 1 << 11,
	MDS_HSM_RELEASE		= 1 << 12,
	MDS_RENAME_MIGRATE	= BIT(13),
	MDS_CLOSE_LAYOUT_SWAP   = BIT(14),
};

/* instance of mdt_reint_rec */
+1 −0
Original line number Diff line number Diff line
@@ -645,6 +645,7 @@ struct if_quotactl {
#define SWAP_LAYOUTS_CHECK_DV2		(1 << 1)
#define SWAP_LAYOUTS_KEEP_MTIME		(1 << 2)
#define SWAP_LAYOUTS_KEEP_ATIME		(1 << 3)
#define SWAP_LAYOUTS_CLOSE		BIT(4)

/* Swap XATTR_NAME_HSM as well, only on the MDT so far */
#define SWAP_LAYOUTS_MDS_HSM		(1 << 31)
+1 −1
Original line number Diff line number Diff line
@@ -148,7 +148,7 @@ extern struct req_format RQF_MDS_GETATTR;
 */
extern struct req_format RQF_MDS_GETATTR_NAME;
extern struct req_format RQF_MDS_CLOSE;
extern struct req_format RQF_MDS_RELEASE_CLOSE;
extern struct req_format RQF_MDS_INTENT_CLOSE;
extern struct req_format RQF_MDS_CONNECT;
extern struct req_format RQF_MDS_DISCONNECT;
extern struct req_format RQF_MDS_GET_INFO;
+144 −86
Original line number Diff line number Diff line
@@ -113,10 +113,19 @@ out:
			   0, 0, LUSTRE_OPC_ANY, NULL);
}

/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
				     struct inode *inode,
				     enum mds_op_bias bias,
				     void *data)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
@@ -143,12 +152,26 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
	}

	ll_prepare_close(inode, op_data, och);
	if (data_version) {
		/* Pass in data_version implies release. */
	switch (bias) {
	case MDS_CLOSE_LAYOUT_SWAP:
		LASSERT(data);
		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_fid2 = *ll_inode2fid(data);
		break;

	case MDS_HSM_RELEASE:
		LASSERT(data);
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
		break;

	default:
		LASSERT(!data);
		break;
	}

	rc = md_close(md_exp, op_data, och->och_mod, &req);
@@ -169,11 +192,12 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
	if (op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP) &&
	    !rc) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_FLRELEASED))
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
			rc = -EBUSY;
	}

@@ -227,7 +251,7 @@ int ll_md_real_close(struct inode *inode, fmode_t fmode)
		 * be closed.
		 */
		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
					       inode, och, NULL);
					       och, inode, 0, NULL);
	}

	return rc;
@@ -263,7 +287,8 @@ static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
	}

	if (fd->fd_och) {
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
					       NULL);
		fd->fd_och = NULL;
		goto out;
	}
@@ -816,7 +841,7 @@ out_close:
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
@@ -829,6 +854,69 @@ out:
	return ERR_PTR(rc);
}

/**
 * Verify that two inodes are eligible for a layout swap.
 *
 * Both inodes must be regular files, the caller must pass the
 * inode_permission() check for MAY_WRITE on each of them, and both must
 * belong to the same superblock. The checks run in that order and the
 * first one that fails decides the returned error.
 *
 * \param[in] inode1  First inode of the pair
 * \param[in] inode2  Second inode of the pair
 *
 * \retval 0        both inodes satisfy every requirement
 * \retval -EINVAL  at least one inode is not a regular file
 * \retval -EPERM   the write permission check failed on one of the inodes
 * \retval -EXDEV   the inodes do not share the same superblock
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
{
	int rc = 0;

	if (!(S_ISREG(inode1->i_mode) && S_ISREG(inode2->i_mode)))
		rc = -EINVAL;
	else if (inode_permission(inode1, MAY_WRITE))
		rc = -EPERM;
	else if (inode_permission(inode2, MAY_WRITE))
		/* only reached when the check on inode1 succeeded */
		rc = -EPERM;
	else if (inode1->i_sb != inode2->i_sb)
		rc = -EXDEV;

	return rc;
}

/**
 * Close \a och with the MDS_CLOSE_LAYOUT_SWAP bias so that the layouts of
 * \a inode and \a inode2 are swapped as part of the close.
 *
 * The open handle is consumed on every path: it is freed here when the
 * preliminary checks fail, and otherwise handed over to
 * ll_close_inode_openhandle().
 *
 * \param[in] och     open handle to close; never valid after this call
 * \param[in] inode   inode the handle belongs to
 * \param[in] inode2  inode to swap layouts with
 *
 * \retval -EINVAL  both inodes have the same FID (or are not regular files)
 * \retval negative error code from ll_check_swap_layouts_validity()
 * \retval otherwise, the result of ll_close_inode_openhandle()
 */
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
{
	const struct lu_fid *src_fid = ll_inode2fid(inode);
	const struct lu_fid *dst_fid;
	int rc;

	CDEBUG(D_INODE, "%s: biased close of file " DFID "\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(src_fid));

	rc = ll_check_swap_layouts_validity(inode, inode2);
	if (rc < 0) {
		kfree(och);
		return rc;
	}

	/* The validity check passed, so inode2 is a lustre inode too */
	dst_fid = ll_inode2fid(inode2);

	/* Swapping a file's layout with itself is rejected */
	if (lu_fid_cmp(src_fid, dst_fid) == 0) {
		kfree(och);
		return -EINVAL;
	}

	/*
	 * Close the file and swap layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT.
	 *
	 * och is freed in ll_close_inode_openhandle(), so it must not be
	 * touched or freed again once that call has been made.
	 */
	return ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och,
					 inode, MDS_CLOSE_LAYOUT_SWAP,
					 inode2);
}

/**
 * Release lease and close the file.
 * It will check if the lease has ever broken.
@@ -856,7 +944,7 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
		*lease_broken = cancelled;

	return ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
					 inode, och, NULL);
					 och, inode, 0, NULL);
}

int ll_merge_attr(const struct lu_env *env, struct inode *inode)
@@ -1014,11 +1102,9 @@ restart:

			range_locked = true;
		}
		down_read(&lli->lli_trunc_sem);
		ll_cl_add(file, env, io);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);
		up_read(&lli->lli_trunc_sem);
		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock [%llu, %llu]\n",
			       range.rl_node.in_extent.start,
@@ -1415,7 +1501,7 @@ int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       inode, och, NULL);
				       och, inode, 0, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
@@ -1618,8 +1704,8 @@ int ll_hsm_release(struct inode *inode)
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT.
	 */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
				       MDS_HSM_RELEASE, &data_version);
	och = NULL;

out:
@@ -1630,10 +1716,12 @@ out:
}

struct ll_swap_stack {
	struct iattr		 ia1, ia2;
	__u64			 dv1, dv2;
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2;
	u64		dv1;
	u64		dv2;
	struct inode   *inode1;
	struct inode   *inode2;
	bool		check_dv1;
	bool		check_dv2;
};

static int ll_swap_layouts(struct file *file1, struct file *file2,
@@ -1653,21 +1741,9 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
	llss->inode1 = file_inode(file1);
	llss->inode2 = file_inode(file2);

	if (!S_ISREG(llss->inode2->i_mode)) {
		rc = -EINVAL;
		goto free;
	}

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE)) {
		rc = -EPERM;
		goto free;
	}

	if (llss->inode2->i_sb != llss->inode1->i_sb) {
		rc = -EXDEV;
	rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
	if (rc < 0)
		goto free;
	}

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
@@ -1681,10 +1757,8 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */ {
		rc = 0;
	if (!rc) /* same file, done! */
		goto free;
	}

	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
@@ -1706,19 +1780,6 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them
	 */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swapping the layouts we check if
	 * dataversion has changed (if requested)
	 */
@@ -1768,39 +1829,6 @@ putgl:
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		goto free;

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		inode_lock(llss->inode1);
		rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
		inode_unlock(llss->inode1);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		inode_lock(llss->inode2);
		rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
		inode_unlock(llss->inode2);
		if (rc == 0)
			rc = rc1;
	}

free:
	kfree(llss);

@@ -1957,16 +1985,46 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
				   sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
		if ((file->f_flags & O_ACCMODE) == O_RDONLY)
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (!file2)
			return -EBADF;

		/* O_WRONLY or O_RDWR */
		if ((file2->f_flags & O_ACCMODE) == O_RDONLY) {
			rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			goto out;
		}

		if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
			struct obd_client_handle *och = NULL;
			struct ll_inode_info *lli;
			struct inode *inode2;

			if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) {
				rc = -EINVAL;
				goto out;
			}

			lli = ll_i2info(inode);
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);
			if (!och) {
				rc = -ENOLCK;
				goto out;
			}
			inode2 = file_inode(file2);
			rc = ll_swap_layouts_close(och, inode, inode2);
		} else {
			rc = ll_swap_layouts(file, file2, &lsl);
		}
out:
		fput(file2);
		return rc;
	}
+0 −4
Original line number Diff line number Diff line
@@ -1567,11 +1567,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
		 * setting times to past, but it is necessary due to possible
		 * time de-synchronization between MDT inode and OST objects
		 */
		if (attr->ia_valid & ATTR_SIZE)
			down_write(&lli->lli_trunc_sem);
		rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, attr, 0);
		if (attr->ia_valid & ATTR_SIZE)
			up_write(&lli->lli_trunc_sem);
	}
out:
	if (op_data)
Loading