Commit 9fc3c01a, authored by Dexuan Cui, committed by Sasha Levin
Browse files

Tools: hv: Reopen the devices if read() or write() returns errors



The state machine in the hv_utils driver can run out of order in some
corner cases. For example, if the kvp daemon doesn't call write() fast
enough for some reason, kvp_timeout_func() can run first and move the
state to HVUTIL_READY; next, when kvp_on_msg() is called, it returns
-EINVAL since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ;
later, the daemon's write() gets the error -EINVAL, and the daemon exit()s.

We can reproduce the issue by sending a SIGSTOP signal to the daemon,
waiting for 1 minute, and then sending a SIGCONT signal to the daemon: the
daemon will exit() quickly.

We can fix the issue by forcing a reset of the device (which means the
daemon can close() and open() the device again) and doing extra necessary
clean-up.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
parent 3a6fb6c4
Loading
Loading
Loading
Loading
+32 −5
Original line number Diff line number Diff line
@@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)

	error = 0;
done:
	if (error)
		target_fname[0] = '\0';
	return error;
}

@@ -108,15 +110,29 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg)
	return ret;
}

/*
 * Reset target_fname to "" in the two functions below to support hibernation:
 * if the fcopy operation is aborted by hibernation, the daemon should remove
 * the partially-copied file. To achieve this, the hv_utils driver always fakes
 * a CANCEL_FCOPY message upon suspend, and later, when the VM resumes, the
 * daemon calls hv_copy_cancel() to remove the file. If a file was copied
 * successfully before suspend, hv_copy_finished() must reset target_fname so
 * that the file is not incorrectly removed upon resume, since the faked
 * CANCEL_FCOPY message is spurious in this case.
 */
/*
 * Complete a successful copy: close the target file and forget its name.
 *
 * Clearing target_fname keeps a later hv_copy_cancel() from unlinking the
 * file that was just copied. target_fd is invalidated right after close() so
 * that a subsequent hv_copy_finished()/hv_copy_cancel() cannot close the same
 * descriptor number a second time; by then that number may have been handed
 * out again to an unrelated open file.
 *
 * Always returns 0.
 */
static int hv_copy_finished(void)
{
	close(target_fd);
	target_fd = -1;	/* never close a stale descriptor number twice */
	target_fname[0] = '\0';
	return 0;
}
/*
 * Abort the current copy: close the target file and, if a target file name is
 * recorded, remove that (partially-copied) file and forget the name.
 *
 * An empty target_fname means there is nothing to remove, so unlink() is
 * skipped. target_fd is invalidated right after close() so that a repeated
 * cancel (or a cancel following a finished copy) cannot close the same
 * descriptor number again; by then that number may belong to an unrelated
 * open file.
 *
 * Always returns 0.
 */
static int hv_copy_cancel(void)
{
	close(target_fd);
	target_fd = -1;	/* never close a stale descriptor number twice */

	if (target_fname[0] != '\0') {
		unlink(target_fname);
		target_fname[0] = '\0';
	}

	return 0;
}
@@ -131,7 +147,7 @@ void print_usage(char *argv[])

int main(int argc, char *argv[])
{
	int fcopy_fd;
	int fcopy_fd = -1;
	int error;
	int daemonize = 1, long_index = 0, opt;
	int version = FCOPY_CURRENT_VERSION;
@@ -141,7 +157,7 @@ int main(int argc, char *argv[])
		struct hv_do_fcopy copy;
		__u32 kernel_modver;
	} buffer = { };
	int in_handshake = 1;
	int in_handshake;

	static struct option long_options[] = {
		{"help",	no_argument,	   0,  'h' },
@@ -170,6 +186,12 @@ int main(int argc, char *argv[])
	openlog("HV_FCOPY", 0, LOG_USER);
	syslog(LOG_INFO, "starting; pid is:%d", getpid());

reopen_fcopy_fd:
	if (fcopy_fd != -1)
		close(fcopy_fd);
	/* Remove any possible partially-copied file on error */
	hv_copy_cancel();
	in_handshake = 1;
	fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);

	if (fcopy_fd < 0) {
@@ -196,7 +218,7 @@ int main(int argc, char *argv[])
		len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
		if (len < 0) {
			syslog(LOG_ERR, "pread failed: %s", strerror(errno));
			exit(EXIT_FAILURE);
			goto reopen_fcopy_fd;
		}

		if (in_handshake) {
@@ -231,9 +253,14 @@ int main(int argc, char *argv[])

		}

		/*
		 * pwrite() may return an error due to the faked CANCEL_FCOPY
		 * message upon hibernation. Ignore the error by resetting the
		 * dev file, i.e. closing and re-opening it.
		 */
		if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
			syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
			exit(EXIT_FAILURE);
			goto reopen_fcopy_fd;
		}
	}
}
+21 −15
Original line number Diff line number Diff line
@@ -76,7 +76,7 @@ enum {
	DNS
};

static int in_hand_shake = 1;
static int in_hand_shake;

static char *os_name = "";
static char *os_major = "";
@@ -1360,7 +1360,7 @@ void print_usage(char *argv[])

int main(int argc, char *argv[])
{
	int kvp_fd, len;
	int kvp_fd = -1, len;
	int error;
	struct pollfd pfd;
	char    *p;
@@ -1400,14 +1400,6 @@ int main(int argc, char *argv[])
	openlog("KVP", 0, LOG_USER);
	syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());

	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);

	if (kvp_fd < 0) {
		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
			errno, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/*
	 * Retrieve OS release information.
	 */
@@ -1423,6 +1415,18 @@ int main(int argc, char *argv[])
		exit(EXIT_FAILURE);
	}

reopen_kvp_fd:
	if (kvp_fd != -1)
		close(kvp_fd);
	in_hand_shake = 1;
	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);

	if (kvp_fd < 0) {
		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
		       errno, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/*
	 * Register ourselves with the kernel.
	 */
@@ -1456,9 +1460,7 @@ int main(int argc, char *argv[])
		if (len != sizeof(struct hv_kvp_msg)) {
			syslog(LOG_ERR, "read failed; error:%d %s",
			       errno, strerror(errno));

			close(kvp_fd);
			return EXIT_FAILURE;
			goto reopen_kvp_fd;
		}

		/*
@@ -1617,13 +1619,17 @@ int main(int argc, char *argv[])
			break;
		}

		/* Send the value back to the kernel. */
		/*
		 * Send the value back to the kernel. Note: the write() may
		 * return an error due to hibernation; we can ignore the error
		 * by resetting the dev file, i.e. closing and re-opening it.
		 */
kvp_done:
		len = write(kvp_fd, hv_msg, sizeof(struct hv_kvp_msg));
		if (len != sizeof(struct hv_kvp_msg)) {
			syslog(LOG_ERR, "write failed; error: %d %s", errno,
			       strerror(errno));
			exit(EXIT_FAILURE);
			goto reopen_kvp_fd;
		}
	}

+38 −11
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@
#include <stdbool.h>
#include <dirent.h>

static bool fs_frozen;

/* Don't use syslog() in the function since that can cause write to disk */
static int vss_do_freeze(char *dir, unsigned int cmd)
{
@@ -155,17 +157,26 @@ static int vss_operate(int operation)
			continue;
		}
		error |= vss_do_freeze(ent->mnt_dir, cmd);
		if (error && operation == VSS_OP_FREEZE)
		if (operation == VSS_OP_FREEZE) {
			if (error)
				goto err;
			fs_frozen = true;
		}
	}

	endmntent(mounts);

	if (root_seen) {
		error |= vss_do_freeze("/", cmd);
		if (error && operation == VSS_OP_FREEZE)
		if (operation == VSS_OP_FREEZE) {
			if (error)
				goto err;
			fs_frozen = true;
		}
	}

	if (operation == VSS_OP_THAW && !error)
		fs_frozen = false;

	goto out;
err:
@@ -175,6 +186,7 @@ err:
		endmntent(mounts);
	}
	vss_operate(VSS_OP_THAW);
	fs_frozen = false;
	/* Call syslog after we thaw all filesystems */
	if (ent)
		syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
@@ -196,13 +208,13 @@ void print_usage(char *argv[])

int main(int argc, char *argv[])
{
	int vss_fd, len;
	int vss_fd = -1, len;
	int error;
	struct pollfd pfd;
	int	op;
	struct hv_vss_msg vss_msg[1];
	int daemonize = 1, long_index = 0, opt;
	int in_handshake = 1;
	int in_handshake;
	__u32 kernel_modver;

	static struct option long_options[] = {
@@ -232,6 +244,18 @@ int main(int argc, char *argv[])
	openlog("Hyper-V VSS", 0, LOG_USER);
	syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());

reopen_vss_fd:
	if (vss_fd != -1)
		close(vss_fd);
	if (fs_frozen) {
		if (vss_operate(VSS_OP_THAW) || fs_frozen) {
			syslog(LOG_ERR, "failed to thaw file system: err=%d",
			       errno);
			exit(EXIT_FAILURE);
		}
	}

	in_handshake = 1;
	vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);
	if (vss_fd < 0) {
		syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
@@ -284,8 +308,7 @@ int main(int argc, char *argv[])
		if (len != sizeof(struct hv_vss_msg)) {
			syslog(LOG_ERR, "read failed; error:%d %s",
			       errno, strerror(errno));
			close(vss_fd);
			return EXIT_FAILURE;
			goto reopen_vss_fd;
		}

		op = vss_msg->vss_hdr.operation;
@@ -312,14 +335,18 @@ int main(int argc, char *argv[])
		default:
			syslog(LOG_ERR, "Illegal op:%d\n", op);
		}

		/*
		 * The write() may return an error due to the faked VSS_OP_THAW
		 * message upon hibernation. Ignore the error by resetting the
		 * dev file, i.e. closing and re-opening it.
		 */
		vss_msg->error = error;
		len = write(vss_fd, vss_msg, sizeof(struct hv_vss_msg));
		if (len != sizeof(struct hv_vss_msg)) {
			syslog(LOG_ERR, "write failed; error: %d %s", errno,
			       strerror(errno));

			if (op == VSS_OP_FREEZE)
				vss_operate(VSS_OP_THAW);
			goto reopen_vss_fd;
		}
	}