Commit d9b8f0aa authored by tpetaja1's avatar tpetaja1
Browse files

updated readme.md

- changed execution of solvers into the specific file
parent 8f0d2b75
Loading
Loading
Loading
Loading
+29 −26
Original line number Diff line number Diff line
@@ -4,10 +4,15 @@ import time
from DataHandler import DataHandler


class TVGL(object):
class BaseGraphicalLasso(object):

    # The parent class for Graphical Lasso
    # problems. Most of the methods and
    # attributes are defined and initialized here.

    np.set_printoptions(precision=3)

    """ Initialize attributes, read data """
    def __init__(self, filename, blocks, lambd, beta,
                 processes, penalty_function="group_lasso",
                 datecolumn=True):
@@ -32,10 +37,15 @@ class TVGL(object):
        self.u0s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.u1s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.u2s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.nju = float(self.obs)/float(3*self.rho)
        self.eta = float(self.obs)/float(3*self.rho)
        self.e = 1e-5
        self.roundup = 1

    """ Read data from the given file. Get parameters of data
        (number of data samples, observations in a block).
        Compute empirical covariance matrices.
        Compute real inverse covariance matrices,
        if provided in the second line of the data file. """
    def read_data(self, filename, comment="#", splitter=","):
        with open(filename, "r") as f:
            comment_count = 0
@@ -82,6 +92,8 @@ class TVGL(object):
                    count = 0
                    block += 1

    """ Computes real inverse covariance matrices with DataHandler,
        if provided in the second line of the data file """
    def generate_real_thetas(self, line, splitter):
        dh = DataHandler()
        infos = line.split(splitter)
@@ -94,29 +106,13 @@ class TVGL(object):
        self.real_thetas = dh.inverse_sigmas
        dh = None

    """ Assigns rho based on number of observations in a block """
    def get_rho(self):
        # Returns the ADMM penalty parameter rho, derived from the number
        # of observations per block (self.obs).  The +0.1 keeps 3*rho
        # strictly greater than obs, so nju = obs/(3*rho) stays below 1
        # -- NOTE(review): presumably intentional; confirm against the
        # step-size formula step = 1/(2*nju) used elsewhere.
        # Earlier alternatives kept commented out for reference:
        #if self.obs % 3 == 0:
        #    return self.obs / 3 + 1
        #else:
        #    return np.ceil(float(self.obs) / float(3))
        return float(self.obs + 0.1) / float(3)
        #return 2*self.obs

    def adjust_rho(self):
        # Re-tunes rho between ADMM iterations: shrinks the current step
        # size 1/(2*nju) by a factor of 1.1 per call, but never below
        # self.max_step, then recomputes rho and nju from the new step.
        # NOTE(review): self.max_step is not initialized anywhere in this
        # view -- presumably set by a subclass or caller; confirm.
        step = 1/float(self.nju*2)
        step = max(self.max_step, step/1.1)
        self.rho = float(2*self.obs*step)/float(3)
        # Earlier adjustment strategies kept commented out for reference:
        #self.rho = max(float(self.obs + 0.1) / float(3), self.rho/1.1)
        #if 3*self.rho - self.obs <= 0.01:
        #    return
        #self.rho = (np.exp(float(-self.iteration)/100) + self.obs)/3
        #if self.rho == self.max_rho:
        #    return
        #self.rho = min(self.rho * 1.1, self.max_rho)
        # nju is kept consistent with the new rho (same formula as __init__).
        self.nju = float(self.obs)/float(3*self.rho)
        print "Rho: %s" % self.rho
        print "Nju: %s" % self.nju

    """ The core of the ADMM algorithm. To be called separately.
        Contains calls to the three update methods, which are to be
        defined in the child classes. """
    def run_algorithm(self, max_iter=10000):
        self.init_algorithm()
        self.iteration = 0
@@ -128,8 +124,8 @@ class TVGL(object):
                print "\n*** Iteration %s ***" % self.iteration
                print "Time passed: {0:.3g}s".format(time.time() - start_time)
                print "Rho: %s" % self.rho
                print "Nju: %s" % self.nju
                print "Step: {0:.3f}".format(1/(2*self.nju))
                print "Eta: %s" % self.eta
                print "Step: {0:.3f}".format(1/(2*self.eta))
            if self.iteration % 500 == 0 or self.iteration == 1:
                s_time = time.time()
            self.theta_update()
@@ -157,7 +153,6 @@ class TVGL(object):
                    stopping_criteria = True
            thetas_pre = list(self.thetas)
            self.iteration += 1
            #self.adjust_rho()
        self.run_time = "{0:.3g}".format(time.time() - start_time)
        self.final_tuning(stopping_criteria, max_iter)

@@ -176,15 +171,19 @@ class TVGL(object):
    def init_algorithm(self):
        pass

    """ Performs final tuning for the converged thetas,
        closes possible multiprocesses. """
    def final_tuning(self, stopping_criteria, max_iter):
        # Post-processes the converged thetas: rounds each matrix to
        # self.roundup decimals, converts entries to edge/no-edge
        # booleans, and shuts down any worker processes.
        # stopping_criteria -- True if the ADMM loop converged before
        #                      hitting max_iter; only affects the report.
        # max_iter          -- iteration limit, echoed in the message.
        self.thetas = [np.round(theta, self.roundup) for theta in self.thetas]
        #self.only_true_false_edges()
        self.only_true_false_edges()
        # terminate_processes() is a no-op placeholder unless a subclass
        # (e.g. a multiprocessing solver) overrides it.
        self.terminate_processes()
        if stopping_criteria:
            print "\nIterations to complete: %s" % self.iteration
        else:
            print "\nMax iterations (%s) reached" % max_iter

    """ Converts values in the thetas into boolean values,
        informing only the existence of an edge without weight. """
    def only_true_false_edges(self):
        for k in range(self.blocks):
            for i in range(self.dimension - 1):
@@ -196,6 +195,8 @@ class TVGL(object):
                        self.thetas[k][i, j] = 0
                        self.thetas[k][j, i] = 0

    """ Computes the Temporal Deviations between neighboring
        thetas, both absolute and normalized values. """
    def temporal_deviations(self):
        self.deviations = np.zeros(self.blocks - 1)
        for i in range(0, self.blocks - 1):
@@ -210,6 +211,8 @@ class TVGL(object):
            self.norm_deviations = self.deviations
            self.dev_ratio = 0

    """ Computes the measures of correct edges in thetas,
        if true inverse covariance matrices are provided. """
    def correct_edges(self):
        self.real_edges = 0
        self.real_edgeless = 0
+70 −74
Original line number Diff line number Diff line

import numpy as np
import datetime
import sys


class DataHandler(object):
@@ -10,6 +11,10 @@ class DataHandler(object):
        self.sigmas = []
        self.network_files = []

    """ Reads a network in given file and generates
        inverse covariance matrices. Expected format for
        networks in given files is:
        [start node index],[end node index],[edge weight]  """
    def read_network(self, filename, comment="#", splitter=",",
                     inversion=True):
        nodes = []
@@ -41,25 +46,8 @@ class DataHandler(object):
            print np.shape(sigma)
            print network

    def init_true_inverse_covariance_matrices(self):
        inverse_sigma1 = np.array([[1.00, 0.50, 0.00, 0.00, 0.00, 0.00],
                                   [0.50, 1.00, 0.50, 0.25, 0.00, 0.00],
                                   [0.00, 0.50, 1.00, 0.00, 0.25, 0.00],
                                   [0.00, 0.25, 0.00, 1.00, 0.50, 0.00],
                                   [0.00, 0.00, 0.25, 0.50, 1.00, 0.25],
                                   [0.00, 0.00, 0.00, 0.00, 0.25, 1.00]])

        inverse_sigma2 = np.array([[1.00, 0.00, 0.00, 0.50, 0.00, 0.00],
                                   [0.00, 1.00, 0.00, 0.00, 0.50, 0.00],
                                   [0.00, 0.00, 1.00, 0.50, 0.25, 0.50],
                                   [0.50, 0.00, 0.50, 1.00, 0.00, 0.00],
                                   [0.00, 0.50, 0.25, 0.00, 1.00, 0.00],
                                   [0.00, 0.00, 0.50, 0.00, 0.00, 1.00]])
        self.inverse_sigmas.append(inverse_sigma1)
        self.inverse_sigmas.append(inverse_sigma2)
        self.sigmas.append(np.linalg.inv(inverse_sigma1))
        self.sigmas.append(np.linalg.inv(inverse_sigma2))

    """ Generates a data file (.csv) from networks previously defined in
        self.sigmas (covariance matrix) """
    def generate_real_data(self, counts=[100, 100]):
        if len(counts) is not len(self.sigmas):
            raise Exception(
@@ -91,6 +79,7 @@ class DataHandler(object):
                line = line[1:]
                new_file.write("%s\n" % line)

    """ Converts a network into matrix form """
    def write_network_to_matrix_form(self, datafile):
        new_filename = datafile.split(".")[0] + "_matrix.csv"
        self.read_network(datafile, inversion=False)
@@ -102,11 +91,13 @@ class DataHandler(object):
                    f.write("\n")
                f.write("\n\n")

    def write_network_results(self, datafile, alg_type, alg, splitter=","):
    """ Creates a file containing results, with network details,
        from converged algorithm instance """
    def write_network_results(self, datafile, solver, splitter=","):
        run_time = datetime.datetime.now()
        results_name = "network_results/%s_di%sbl%sob%sla%sbe%s_%s.csv" % (
            alg_type, alg.dimension, alg.blocks, alg.obs, int(alg.lambd),
            int(alg.beta), run_time.strftime("%Y%m%d%H%M%S"))
        results_name = "network_results/%s_la%sbe%s_%s.csv" % (
            solver.__class__.__name__, int(solver.lambd),
            int(solver.beta), run_time.strftime("%Y%m%d%H%M%S"))
        """ Read features """
        with open(datafile, "r") as f:
            for i, line in enumerate(f):
@@ -122,102 +113,104 @@ class DataHandler(object):
            f.write("Run datetime, %s\n" %
                    run_time.strftime("%Y-%m-%d %H:%M:%S"))
            f.write("Data file, %s\n" % datafile)
            f.write("Algorithm type, %s\n" % alg.__class__.__name__)
            f.write("Penalty function, %s\n" % alg.penalty_function)
            f.write("Data dimension, %s\n" % alg.dimension)
            f.write("Blocks, %s\n" % alg.blocks)
            f.write("Observations in a block, %s\n" % alg.obs)
            f.write("Rho, %s\n" % alg.rho)
            f.write("Beta, %s\n" % alg.beta)
            f.write("Lambda, %s\n" % alg.lambd)
            f.write("Processes used, %s\n" % alg.processes)
            f.write("Solver type, %s\n" % solver.__class__.__name__)
            f.write("Penalty function, %s\n" % solver.penalty_function)
            f.write("Data dimension, %s\n" % solver.dimension)
            f.write("Blocks, %s\n" % solver.blocks)
            f.write("Observations in a block, %s\n" % solver.obs)
            f.write("Rho, %s\n" % solver.rho)
            f.write("Beta, %s\n" % solver.beta)
            f.write("Lambda, %s\n" % solver.lambd)
            f.write("Processes used, %s\n" % solver.processes)
            f.write("\n")
            f.write("# Results\n")
            f.write("Algorithm run time, %s seconds\n" % alg.run_time)
            f.write("Iterations to complete, %s\n\n" % alg.iteration)
            f.write("Algorithm run time, %s seconds\n" % solver.run_time)
            f.write("Iterations to complete, %s\n\n" % solver.iteration)
            try:
                f.write("Temporal deviations ratio (max/mean), {0:.3f}\n"
                        .format(alg.dev_ratio))
                        .format(solver.dev_ratio))
            except ValueError:
                f.write("Temporal deviations ratio (max/mean), %s\n"
                        % alg.dev_ratio)
                        % solver.dev_ratio)
            f.write("Temporal deviations ")
            for dev in alg.deviations:
            for dev in solver.deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            f.write("\nNormalized Temporal deviations ")
            for dev in alg.norm_deviations:
            for dev in solver.norm_deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            """ Write networks """
            f.write("\n\n#Networks:\n\n")
            for k in range(alg.blocks):
            for k in range(solver.blocks):
                f.write("Block %s," % k)
                f.write(alg.blockdates[k] + "\n")
                f.write(solver.blockdates[k] + "\n")
                if k > 0:
                    f.write("Dev to prev,")
                    f.write("{0:.3f},".format(alg.deviations[k-1]))
                if k < alg.blocks - 1:
                    f.write("{0:.3f},".format(solver.deviations[k-1]))
                if k < solver.blocks - 1:
                    f.write("Dev to next,")
                    f.write("{0:.3f}".format(alg.deviations[k]))
                    f.write("{0:.3f}".format(solver.deviations[k]))
                f.write("\n")
                for feat in feats:
                    f.write("," + feat)
                f.write("\n")
                for i in range(alg.dimension):
                for i in range(solver.dimension):
                    f.write(features[i])
                    for j in range(alg.dimension):
                        f.write("," + str(alg.thetas[k][i, j]))
                    for j in range(solver.dimension):
                        f.write("," + str(solver.thetas[k][i, j]))
                    f.write("\n")
                f.write("\n\n")

    def write_results(self, datafile, alg_type, alg, splitter=','):
    """ Creates a file containing results, without network details,
        from converged solver instance """
    def write_results(self, datafile, solver, splitter=','):
        run_time = datetime.datetime.now()
        results_name = "results/%s_di%sbl%sob%sla%sbe%s_%s.csv" % (
            alg_type, alg.dimension, alg.blocks, alg.obs, int(alg.lambd),
            int(alg.beta), run_time.strftime("%Y%m%d%H%M%S"))
        results_name = "results/%s_la%sbe%s_%s.csv" % (
            solver.__class__.__name__, int(solver.lambd),
            int(solver.beta), run_time.strftime("%Y%m%d%H%M%S"))
        with open(results_name, "w") as f:
            f.write("# Information\n")
            f.write("Run datetime, %s\n" %
                    run_time.strftime("%Y-%m-%d %H:%M:%S"))
            f.write("Data file, %s\n" % datafile)
            f.write("Algorithm type, %s\n" % alg.__class__.__name__)
            f.write("Penalty function, %s\n" % alg.penalty_function)
            f.write("Data dimension, %s\n" % alg.dimension)
            f.write("Blocks, %s\n" % alg.blocks)
            f.write("Observations in a block, %s\n" % alg.obs)
            f.write("Rho, %s\n" % alg.rho)
            f.write("Beta, %s\n" % alg.beta)
            f.write("Lambda, %s\n" % alg.lambd)
            f.write("Processes used, %s\n" % alg.processes)
            f.write("Total edges, %s\n" % alg.real_edges)
            f.write("Total edgeless, %s\n" % alg.real_edgeless)
            f.write("Solver type, %s\n" % solver.__class__.__name__)
            f.write("Penalty function, %s\n" % solver.penalty_function)
            f.write("Data dimension, %s\n" % solver.dimension)
            f.write("Blocks, %s\n" % solver.blocks)
            f.write("Observations in a block, %s\n" % solver.obs)
            f.write("Rho, %s\n" % solver.rho)
            f.write("Beta, %s\n" % solver.beta)
            f.write("Lambda, %s\n" % solver.lambd)
            f.write("Processes used, %s\n" % solver.processes)
            f.write("Total edges, %s\n" % solver.real_edges)
            f.write("Total edgeless, %s\n" % solver.real_edgeless)
            f.write("\n")
            f.write("# Results\n")
            f.write("Algorithm run time, %s seconds\n" % alg.run_time)
            f.write("Iterations to complete, %s\n\n" % alg.iteration)
            f.write("Correct positive edges, %s\n" % alg.correct_positives)
            f.write("All positives, %s\n" % alg.all_positives)
            f.write("Algorithm run time, %s seconds\n" % solver.run_time)
            f.write("Iterations to complete, %s\n\n" % solver.iteration)
            f.write("Correct positive edges, %s\n" % solver.correct_positives)
            f.write("All positives, %s\n" % solver.all_positives)
            f.write("F1 Score, {0:.3f}\n"
                    .format(alg.f1score))
                    .format(solver.f1score))
            try:
                f.write("Temporal deviations ratio (max/mean), {0:.3f}\n"
                        .format(alg.dev_ratio))
                        .format(solver.dev_ratio))
            except ValueError:
                f.write("Temporal deviations ratio (max/mean), %s\n"
                        % alg.dev_ratio)
                        % solver.dev_ratio)
            f.write("Temporal deviations ")
            for dev in alg.deviations:
            for dev in solver.deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            f.write("\nNormalized Temporal deviations ")
            for dev in alg.norm_deviations:
            for dev in solver.norm_deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
@@ -225,8 +218,11 @@ class DataHandler(object):
            f.write("\n")


if __name__ == "__main__":
if __name__ == "__main__" and len(sys.argv) % 2 == 1:
    dh = DataHandler()
    dh.read_network("networks/network1.csv")
    dh.read_network("networks/network3.csv")
    dh.generate_real_data([500, 500])
    data_counts = []
    for i in range(1, len(sys.argv), 2):
        dh.read_network(sys.argv[i])
        data_counts.append(int(sys.argv[i+1]))
    if len(data_counts) > 0:
        dh.generate_real_data(data_counts)

DynamicGL.py

deleted100644 → 0
+0 −67
Original line number Diff line number Diff line

from TVGL import TVGL
import penalty_functions as pf
import numpy as np
import multiprocessing
import time
import traceback


def mp_dynamic_gl((theta, z0, u0, emp_cov_mat, rho,
                  lambd, nju, dimension, max_iter)):
    # Worker function for multiprocessing.Pool.map: solves one
    # independent Graphical Lasso block with ADMM and returns the
    # final theta estimate.  Uses Python 2 tuple-parameter syntax
    # because pool.map passes a single tuple per block.
    #
    # theta        -- initial precision-matrix estimate for this block
    # z0, u0       -- ADMM consensus and scaled dual variables
    # emp_cov_mat  -- empirical covariance matrix of the block
    # rho, lambd   -- ADMM penalty and lasso regularization parameters
    # nju          -- step-size parameter used in the theta update
    # dimension    -- matrix side length
    # max_iter     -- iteration cap for the ADMM loop
    #
    # Exceptions in the worker are printed (tracebacks are otherwise
    # swallowed by multiprocessing) and re-raised.
    try:
        iteration = 0
        stopping_criteria = False
        theta_pre = []
        while iteration < max_iter and stopping_criteria is False:
            """ Theta update """
            a = z0 - u0
            at = a.transpose()
            m = nju*(a + at)/2 - emp_cov_mat
            d, q = np.linalg.eig(m)
            qt = q.transpose()
            sqrt_matrix = np.sqrt(d**2 + 4/nju*np.ones(dimension))
            diagonal = np.diag(d) + np.diag(sqrt_matrix)
            theta = np.real(
                nju/2*np.dot(np.dot(q, diagonal), qt))
            """ Z-update """
            z0 = pf.soft_threshold_odd(theta + u0, lambd, rho)
            """ U-update """
            u0 += theta - z0
            """ Check stopping criteria """
            # Converged when successive thetas differ by less than 1e-5
            # in Frobenius norm.
            if iteration > 0:
                dif = theta - theta_pre
                fro_norm = np.linalg.norm(dif)
                if fro_norm < 1e-5:
                    stopping_criteria = True
            theta_pre = list(theta)
            iteration += 1
    except Exception as e:
        traceback.print_exc()
        raise e
    return theta


class DynamicGL(TVGL):

    # Solver for T time-uncoupled Graphical Lasso problems: beta is
    # forced to 0 so there is no temporal penalty, and each block is
    # solved independently in parallel by mp_dynamic_gl.
    # NOTE(review): this file is deleted by the commit shown; the
    # parent class TVGL is renamed BaseGraphicalLasso in the same
    # commit.

    def __init__(self, *args, **kwargs):
        # beta=0 disables the temporal-consistency penalty of the
        # parent TVGL formulation.
        super(DynamicGL, self).__init__(beta=0, *args, **kwargs)
        self.nju = float(self.obs)/float(self.rho)
        # Per-block iteration counts live inside the workers, so these
        # are reported as "n/a" in the results files.
        self.iteration = "n/a"
        self.penalty_function = "n/a"

    def get_rho(self):
        # Rho is simply one more than the observations per block
        # (overrides the parent's (obs + 0.1)/3 rule).
        return self.obs + 1

    def run_algorithm(self, max_iter=10000):
        # Solves every block independently in a process pool; each
        # worker runs its own full ADMM loop (see mp_dynamic_gl), so
        # no cross-block iteration happens here.
        start_time = time.time()
        p = multiprocessing.Pool(self.processes)
        inputs = [(self.thetas[i], self.z0s[i], self.u0s[i],
                   self.emp_cov_mat[i], self.rho,
                   self.lambd, self.nju, self.dimension, max_iter)
                  for i in range(self.blocks)]
        self.thetas = p.map(mp_dynamic_gl, inputs)
        p.close()
        p.join()
        self.run_time = '{0:.3g}'.format(time.time() - start_time)
        # Final rounding, mirroring BaseGraphicalLasso.final_tuning.
        self.thetas = [np.round(theta, self.roundup) for theta in self.thetas]
+164 −72

File changed.

Preview size limit exceeded, changes collapsed.

+83 −1
Original line number Diff line number Diff line
# graphical_lasso
 No newline at end of file
Time-Varying Graphical Lasso

Contains 2 TVGL and 2 GL solvers for network inference,
and a DataHandler to create and maintain data files.

The solvers use an ADMM algorithm to solve the problem.

Implemented in Python 2.7 with NumPy and multiprocessing.

Based on the paper by Hallac et al. (2017)
  "Network inference via the Time-Varying Graphical Lasso"
  https://arxiv.org/abs/1703.01958

1. Contents
   Files:
   - BaseGraphicalLasso.py -   Parent class for all 4 solvers
   - ParallelTVGL.py       -   TVGL solver using multiprocessing
   - SerialTVGL.py         -   TVGL solver using serial processing
   - StaticGL.py           -   GL solver solving T independent GL problems
   - SingleGL.py           -   GL solver for a single GL problem
   - DataHandler.py        -   Synthetic data generator, results writer
   - penalty_functions.py  -   Penalty functions for algorithms defined
   - run.py                -   Executes solvers

   Folders:
   - omxh_data             -   OMXH25 daily returns data
   - synthetic_data        -   Synthetic data generated with DataHandler
   - networks              -   Network files to be used for generating synthetic data
   - network_results       -   Results from non-synthetic data executions
   - results               -   Results from synthetic data executions
 		      
2. Run solvers

 Run from command line with following options

 - Parallel TVGL:
    1. [data file]
    2. [penalty function index]
    3. [Number of blocks to be created]
    4. [lambda]
    5. [beta]
    6. [number of processes to be used]
       -if higher than # of blocks, # of processes is reduced to # of blocks

    Example:
       $ python ParallelTVGL.py synthetic_data/datafile1.csv 2 10 2 4 4

 - Serial TVGL:
    1. [data file]
    2. [penalty function index]
    3. [Number of blocks to be created]
    4. [lambda]
    5. [beta]

    Example:
       $ python SerialTVGL.py synthetic_data/datafile1.csv 2 10 2 4

 - Static GL:
    1. [data file]
    2. [Number of blocks to be created]
    3. [lambda]
    4. [number of processes to be used]
       -if higher than # of blocks, # of processes is reduced to # of blocks

    Example:
       $ python StaticGL.py synthetic_data/datafile1.csv 2 10 2

 - Single GL:
    1. [data file]
    2. [lambda]

    Example:
       $ python SingleGL.py synthetic_data/datafile1.csv 2

3. Generate synthetic data

 Run from the command line with at least one [network file name]-[number of data points] pair.
 Any number of such pairs can be supplied.

 $ python DataHandler.py [network file name] [number of data points from network] ...

   
Loading