Commit d9b8f0aa authored by tpetaja1's avatar tpetaja1
Browse files

updated readme.md

- changed execution of solvers into the specific file
parent 8f0d2b75
Loading
Loading
Loading
Loading
+29 −26
Original line number Diff line number Diff line
@@ -4,10 +4,15 @@ import time
from DataHandler import DataHandler


class TVGL(object):
class BaseGraphicalLasso(object):

    # The parent class for Graphical Lasso
    # problems. Most of the methods and
    # attributes are defined and initialized here.

    np.set_printoptions(precision=3)

    """ Initialize attributes, read data """
    def __init__(self, filename, blocks, lambd, beta,
                 processes, penalty_function="group_lasso",
                 datecolumn=True):
@@ -32,10 +37,15 @@ class TVGL(object):
        self.u0s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.u1s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.u2s = [np.zeros((self.dimension, self.dimension))] * self.blocks
        self.nju = float(self.obs)/float(3*self.rho)
        self.eta = float(self.obs)/float(3*self.rho)
        self.e = 1e-5
        self.roundup = 1

    """ Read data from the given file. Get parameters of data
        (number of data samples, observations in a block).
        Compute empirical covariance matrices.
        Compute real inverse covariance matrices,
        if provided in the second line of the data file. """
    def read_data(self, filename, comment="#", splitter=","):
        with open(filename, "r") as f:
            comment_count = 0
@@ -82,6 +92,8 @@ class TVGL(object):
                    count = 0
                    block += 1

    """ Computes real inverse covariance matrices with DataHandler,
        if provided in the second line of the data file """
    def generate_real_thetas(self, line, splitter):
        dh = DataHandler()
        infos = line.split(splitter)
@@ -94,29 +106,13 @@ class TVGL(object):
        self.real_thetas = dh.inverse_sigmas
        dh = None

    """ Assigns rho based on number of observations in a block """
    def get_rho(self):
        # Returns the ADMM penalty parameter rho, derived from the number
        # of observations per block (self.obs).  The +0.1 keeps 3*rho
        # strictly greater than obs, so nju = obs/(3*rho) stays below 1
        # -- NOTE(review): presumably intentional; confirm against the
        # step-size formula step = 1/(2*nju) used elsewhere.
        # Earlier alternatives kept commented out for reference:
        #if self.obs % 3 == 0:
        #    return self.obs / 3 + 1
        #else:
        #    return np.ceil(float(self.obs) / float(3))
        return float(self.obs + 0.1) / float(3)
        #return 2*self.obs

    def adjust_rho(self):
        # Re-tunes rho between ADMM iterations: shrinks the current step
        # size 1/(2*nju) by a factor of 1.1 per call, but never below
        # self.max_step, then recomputes rho and nju from the new step.
        # NOTE(review): self.max_step is not initialized anywhere in this
        # view -- presumably set by a subclass or caller; confirm.
        step = 1/float(self.nju*2)
        step = max(self.max_step, step/1.1)
        self.rho = float(2*self.obs*step)/float(3)
        # Earlier adjustment strategies kept commented out for reference:
        #self.rho = max(float(self.obs + 0.1) / float(3), self.rho/1.1)
        #if 3*self.rho - self.obs <= 0.01:
        #    return
        #self.rho = (np.exp(float(-self.iteration)/100) + self.obs)/3
        #if self.rho == self.max_rho:
        #    return
        #self.rho = min(self.rho * 1.1, self.max_rho)
        # nju is kept consistent with the new rho (same formula as __init__).
        self.nju = float(self.obs)/float(3*self.rho)
        print "Rho: %s" % self.rho
        print "Nju: %s" % self.nju

    """ The core of the ADMM algorithm. To be called separately.
        Contains calls to the three update methods, which are to be
        defined in the child classes. """
    def run_algorithm(self, max_iter=10000):
        self.init_algorithm()
        self.iteration = 0
@@ -128,8 +124,8 @@ class TVGL(object):
                print "\n*** Iteration %s ***" % self.iteration
                print "Time passed: {0:.3g}s".format(time.time() - start_time)
                print "Rho: %s" % self.rho
                print "Nju: %s" % self.nju
                print "Step: {0:.3f}".format(1/(2*self.nju))
                print "Eta: %s" % self.eta
                print "Step: {0:.3f}".format(1/(2*self.eta))
            if self.iteration % 500 == 0 or self.iteration == 1:
                s_time = time.time()
            self.theta_update()
@@ -157,7 +153,6 @@ class TVGL(object):
                    stopping_criteria = True
            thetas_pre = list(self.thetas)
            self.iteration += 1
            #self.adjust_rho()
        self.run_time = "{0:.3g}".format(time.time() - start_time)
        self.final_tuning(stopping_criteria, max_iter)

@@ -176,15 +171,19 @@ class TVGL(object):
    def init_algorithm(self):
        pass

    """ Performs final tuning for the converged thetas,
        closes possible multiprocesses. """
    def final_tuning(self, stopping_criteria, max_iter):
        # Post-processes the converged thetas: rounds each matrix to
        # self.roundup decimals, converts entries to edge/no-edge
        # booleans, and shuts down any worker processes.
        # stopping_criteria -- True if the ADMM loop converged before
        #                      hitting max_iter; only affects the report.
        # max_iter          -- iteration limit, echoed in the message.
        self.thetas = [np.round(theta, self.roundup) for theta in self.thetas]
        #self.only_true_false_edges()
        self.only_true_false_edges()
        # terminate_processes() is a no-op placeholder unless a subclass
        # (e.g. a multiprocessing solver) overrides it.
        self.terminate_processes()
        if stopping_criteria:
            print "\nIterations to complete: %s" % self.iteration
        else:
            print "\nMax iterations (%s) reached" % max_iter

    """ Converts values in the thetas into boolean values,
        informing only the existence of an edge without weight. """
    def only_true_false_edges(self):
        for k in range(self.blocks):
            for i in range(self.dimension - 1):
@@ -196,6 +195,8 @@ class TVGL(object):
                        self.thetas[k][i, j] = 0
                        self.thetas[k][j, i] = 0

    """ Computes the Temporal Deviations between neighboring
        thetas, both absolute and normalized values. """
    def temporal_deviations(self):
        self.deviations = np.zeros(self.blocks - 1)
        for i in range(0, self.blocks - 1):
@@ -210,6 +211,8 @@ class TVGL(object):
            self.norm_deviations = self.deviations
            self.dev_ratio = 0

    """ Computes the measures of correct edges in thetas,
        if true inverse covariance matrices are provided. """
    def correct_edges(self):
        self.real_edges = 0
        self.real_edgeless = 0
+70 −74
Original line number Diff line number Diff line

import numpy as np
import datetime
import sys


class DataHandler(object):
@@ -10,6 +11,10 @@ class DataHandler(object):
        self.sigmas = []
        self.network_files = []

    """ Reads a network in given file and generates
        inverse covariance matrices. Expected format for
        networks in given files is:
        [start node index],[end node index],[edge weight]  """
    def read_network(self, filename, comment="#", splitter=",",
                     inversion=True):
        nodes = []
@@ -41,25 +46,8 @@ class DataHandler(object):
            print np.shape(sigma)
            print network

    def init_true_inverse_covariance_matrices(self):
        inverse_sigma1 = np.array([[1.00, 0.50, 0.00, 0.00, 0.00, 0.00],
                                   [0.50, 1.00, 0.50, 0.25, 0.00, 0.00],
                                   [0.00, 0.50, 1.00, 0.00, 0.25, 0.00],
                                   [0.00, 0.25, 0.00, 1.00, 0.50, 0.00],
                                   [0.00, 0.00, 0.25, 0.50, 1.00, 0.25],
                                   [0.00, 0.00, 0.00, 0.00, 0.25, 1.00]])

        inverse_sigma2 = np.array([[1.00, 0.00, 0.00, 0.50, 0.00, 0.00],
                                   [0.00, 1.00, 0.00, 0.00, 0.50, 0.00],
                                   [0.00, 0.00, 1.00, 0.50, 0.25, 0.50],
                                   [0.50, 0.00, 0.50, 1.00, 0.00, 0.00],
                                   [0.00, 0.50, 0.25, 0.00, 1.00, 0.00],
                                   [0.00, 0.00, 0.50, 0.00, 0.00, 1.00]])
        self.inverse_sigmas.append(inverse_sigma1)
        self.inverse_sigmas.append(inverse_sigma2)
        self.sigmas.append(np.linalg.inv(inverse_sigma1))
        self.sigmas.append(np.linalg.inv(inverse_sigma2))

    """ Generates a data file (.csv) from networks previously defined in
        self.sigmas (covariance matrix) """
    def generate_real_data(self, counts=[100, 100]):
        if len(counts) is not len(self.sigmas):
            raise Exception(
@@ -91,6 +79,7 @@ class DataHandler(object):
                line = line[1:]
                new_file.write("%s\n" % line)

    """ Converts a network into matrix form """
    def write_network_to_matrix_form(self, datafile):
        new_filename = datafile.split(".")[0] + "_matrix.csv"
        self.read_network(datafile, inversion=False)
@@ -102,11 +91,13 @@ class DataHandler(object):
                    f.write("\n")
                f.write("\n\n")

    def write_network_results(self, datafile, alg_type, alg, splitter=","):
    """ Creates a file containing results, with network details,
        from converged algorithm instance """
    def write_network_results(self, datafile, solver, splitter=","):
        run_time = datetime.datetime.now()
        results_name = "network_results/%s_di%sbl%sob%sla%sbe%s_%s.csv" % (
            alg_type, alg.dimension, alg.blocks, alg.obs, int(alg.lambd),
            int(alg.beta), run_time.strftime("%Y%m%d%H%M%S"))
        results_name = "network_results/%s_la%sbe%s_%s.csv" % (
            solver.__class__.__name__, int(solver.lambd),
            int(solver.beta), run_time.strftime("%Y%m%d%H%M%S"))
        """ Read features """
        with open(datafile, "r") as f:
            for i, line in enumerate(f):
@@ -122,102 +113,104 @@ class DataHandler(object):
            f.write("Run datetime, %s\n" %
                    run_time.strftime("%Y-%m-%d %H:%M:%S"))
            f.write("Data file, %s\n" % datafile)
            f.write("Algorithm type, %s\n" % alg.__class__.__name__)
            f.write("Penalty function, %s\n" % alg.penalty_function)
            f.write("Data dimension, %s\n" % alg.dimension)
            f.write("Blocks, %s\n" % alg.blocks)
            f.write("Observations in a block, %s\n" % alg.obs)
            f.write("Rho, %s\n" % alg.rho)
            f.write("Beta, %s\n" % alg.beta)
            f.write("Lambda, %s\n" % alg.lambd)
            f.write("Processes used, %s\n" % alg.processes)
            f.write("Solver type, %s\n" % solver.__class__.__name__)
            f.write("Penalty function, %s\n" % solver.penalty_function)
            f.write("Data dimension, %s\n" % solver.dimension)
            f.write("Blocks, %s\n" % solver.blocks)
            f.write("Observations in a block, %s\n" % solver.obs)
            f.write("Rho, %s\n" % solver.rho)
            f.write("Beta, %s\n" % solver.beta)
            f.write("Lambda, %s\n" % solver.lambd)
            f.write("Processes used, %s\n" % solver.processes)
            f.write("\n")
            f.write("# Results\n")
            f.write("Algorithm run time, %s seconds\n" % alg.run_time)
            f.write("Iterations to complete, %s\n\n" % alg.iteration)
            f.write("Algorithm run time, %s seconds\n" % solver.run_time)
            f.write("Iterations to complete, %s\n\n" % solver.iteration)
            try:
                f.write("Temporal deviations ratio (max/mean), {0:.3f}\n"
                        .format(alg.dev_ratio))
                        .format(solver.dev_ratio))
            except ValueError:
                f.write("Temporal deviations ratio (max/mean), %s\n"
                        % alg.dev_ratio)
                        % solver.dev_ratio)
            f.write("Temporal deviations ")
            for dev in alg.deviations:
            for dev in solver.deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            f.write("\nNormalized Temporal deviations ")
            for dev in alg.norm_deviations:
            for dev in solver.norm_deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            """ Write networks """
            f.write("\n\n#Networks:\n\n")
            for k in range(alg.blocks):
            for k in range(solver.blocks):
                f.write("Block %s," % k)
                f.write(alg.blockdates[k] + "\n")
                f.write(solver.blockdates[k] + "\n")
                if k > 0:
                    f.write("Dev to prev,")
                    f.write("{0:.3f},".format(alg.deviations[k-1]))
                if k < alg.blocks - 1:
                    f.write("{0:.3f},".format(solver.deviations[k-1]))
                if k < solver.blocks - 1:
                    f.write("Dev to next,")
                    f.write("{0:.3f}".format(alg.deviations[k]))
                    f.write("{0:.3f}".format(solver.deviations[k]))
                f.write("\n")
                for feat in feats:
                    f.write("," + feat)
                f.write("\n")
                for i in range(alg.dimension):
                for i in range(solver.dimension):
                    f.write(features[i])
                    for j in range(alg.dimension):
                        f.write("," + str(alg.thetas[k][i, j]))
                    for j in range(solver.dimension):
                        f.write("," + str(solver.thetas[k][i, j]))
                    f.write("\n")
                f.write("\n\n")

    def write_results(self, datafile, alg_type, alg, splitter=','):
    """ Creates a file containing results, without network details,
        from converged solver instance """
    def write_results(self, datafile, solver, splitter=','):
        run_time = datetime.datetime.now()
        results_name = "results/%s_di%sbl%sob%sla%sbe%s_%s.csv" % (
            alg_type, alg.dimension, alg.blocks, alg.obs, int(alg.lambd),
            int(alg.beta), run_time.strftime("%Y%m%d%H%M%S"))
        results_name = "results/%s_la%sbe%s_%s.csv" % (
            solver.__class__.__name__, int(solver.lambd),
            int(solver.beta), run_time.strftime("%Y%m%d%H%M%S"))
        with open(results_name, "w") as f:
            f.write("# Information\n")
            f.write("Run datetime, %s\n" %
                    run_time.strftime("%Y-%m-%d %H:%M:%S"))
            f.write("Data file, %s\n" % datafile)
            f.write("Algorithm type, %s\n" % alg.__class__.__name__)
            f.write("Penalty function, %s\n" % alg.penalty_function)
            f.write("Data dimension, %s\n" % alg.dimension)
            f.write("Blocks, %s\n" % alg.blocks)
            f.write("Observations in a block, %s\n" % alg.obs)
            f.write("Rho, %s\n" % alg.rho)
            f.write("Beta, %s\n" % alg.beta)
            f.write("Lambda, %s\n" % alg.lambd)
            f.write("Processes used, %s\n" % alg.processes)
            f.write("Total edges, %s\n" % alg.real_edges)
            f.write("Total edgeless, %s\n" % alg.real_edgeless)
            f.write("Solver type, %s\n" % solver.__class__.__name__)
            f.write("Penalty function, %s\n" % solver.penalty_function)
            f.write("Data dimension, %s\n" % solver.dimension)
            f.write("Blocks, %s\n" % solver.blocks)
            f.write("Observations in a block, %s\n" % solver.obs)
            f.write("Rho, %s\n" % solver.rho)
            f.write("Beta, %s\n" % solver.beta)
            f.write("Lambda, %s\n" % solver.lambd)
            f.write("Processes used, %s\n" % solver.processes)
            f.write("Total edges, %s\n" % solver.real_edges)
            f.write("Total edgeless, %s\n" % solver.real_edgeless)
            f.write("\n")
            f.write("# Results\n")
            f.write("Algorithm run time, %s seconds\n" % alg.run_time)
            f.write("Iterations to complete, %s\n\n" % alg.iteration)
            f.write("Correct positive edges, %s\n" % alg.correct_positives)
            f.write("All positives, %s\n" % alg.all_positives)
            f.write("Algorithm run time, %s seconds\n" % solver.run_time)
            f.write("Iterations to complete, %s\n\n" % solver.iteration)
            f.write("Correct positive edges, %s\n" % solver.correct_positives)
            f.write("All positives, %s\n" % solver.all_positives)
            f.write("F1 Score, {0:.3f}\n"
                    .format(alg.f1score))
                    .format(solver.f1score))
            try:
                f.write("Temporal deviations ratio (max/mean), {0:.3f}\n"
                        .format(alg.dev_ratio))
                        .format(solver.dev_ratio))
            except ValueError:
                f.write("Temporal deviations ratio (max/mean), %s\n"
                        % alg.dev_ratio)
                        % solver.dev_ratio)
            f.write("Temporal deviations ")
            for dev in alg.deviations:
            for dev in solver.deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
                    f.write(",%s" % dev)
            f.write("\nNormalized Temporal deviations ")
            for dev in alg.norm_deviations:
            for dev in solver.norm_deviations:
                try:
                    f.write(",{0:.3f}".format(dev))
                except ValueError:
@@ -225,8 +218,11 @@ class DataHandler(object):
            f.write("\n")


if __name__ == "__main__":
if __name__ == "__main__" and len(sys.argv) % 2 == 1:
    dh = DataHandler()
    dh.read_network("networks/network1.csv")
    dh.read_network("networks/network3.csv")
    dh.generate_real_data([500, 500])
    data_counts = []
    for i in range(1, len(sys.argv), 2):
        dh.read_network(sys.argv[i])
        data_counts.append(int(sys.argv[i+1]))
    if len(data_counts) > 0:
        dh.generate_real_data(data_counts)

DynamicGL.py

deleted100644 → 0
+0 −67
Original line number Diff line number Diff line

from TVGL import TVGL
import penalty_functions as pf
import numpy as np
import multiprocessing
import time
import traceback


def mp_dynamic_gl((theta, z0, u0, emp_cov_mat, rho,
                  lambd, nju, dimension, max_iter)):
    # Worker function for multiprocessing.Pool.map: solves one
    # independent Graphical Lasso block with ADMM and returns the
    # final theta estimate.  Uses Python 2 tuple-parameter syntax
    # because pool.map passes a single tuple per block.
    #
    # theta        -- initial precision-matrix estimate for this block
    # z0, u0       -- ADMM consensus and scaled dual variables
    # emp_cov_mat  -- empirical covariance matrix of the block
    # rho, lambd   -- ADMM penalty and lasso regularization parameters
    # nju          -- step-size parameter used in the theta update
    # dimension    -- matrix side length
    # max_iter     -- iteration cap for the ADMM loop
    #
    # Exceptions in the worker are printed (tracebacks are otherwise
    # swallowed by multiprocessing) and re-raised.
    try:
        iteration = 0
        stopping_criteria = False
        theta_pre = []
        while iteration < max_iter and stopping_criteria is False:
            """ Theta update """
            a = z0 - u0
            at = a.transpose()
            m = nju*(a + at)/2 - emp_cov_mat
            d, q = np.linalg.eig(m)
            qt = q.transpose()
            sqrt_matrix = np.sqrt(d**2 + 4/nju*np.ones(dimension))
            diagonal = np.diag(d) + np.diag(sqrt_matrix)
            theta = np.real(
                nju/2*np.dot(np.dot(q, diagonal), qt))
            """ Z-update """
            z0 = pf.soft_threshold_odd(theta + u0, lambd, rho)
            """ U-update """
            u0 += theta - z0
            """ Check stopping criteria """
            # Converged when successive thetas differ by less than 1e-5
            # in Frobenius norm.
            if iteration > 0:
                dif = theta - theta_pre
                fro_norm = np.linalg.norm(dif)
                if fro_norm < 1e-5:
                    stopping_criteria = True
            theta_pre = list(theta)
            iteration += 1
    except Exception as e:
        traceback.print_exc()
        raise e
    return theta


class DynamicGL(TVGL):

    # Solver for T time-uncoupled Graphical Lasso problems: beta is
    # forced to 0 so there is no temporal penalty, and each block is
    # solved independently in parallel by mp_dynamic_gl.
    # NOTE(review): this file is deleted by the commit shown; the
    # parent class TVGL is renamed BaseGraphicalLasso in the same
    # commit.

    def __init__(self, *args, **kwargs):
        # beta=0 disables the temporal-consistency penalty of the
        # parent TVGL formulation.
        super(DynamicGL, self).__init__(beta=0, *args, **kwargs)
        self.nju = float(self.obs)/float(self.rho)
        # Per-block iteration counts live inside the workers, so these
        # are reported as "n/a" in the results files.
        self.iteration = "n/a"
        self.penalty_function = "n/a"

    def get_rho(self):
        # Rho is simply one more than the observations per block
        # (overrides the parent's (obs + 0.1)/3 rule).
        return self.obs + 1

    def run_algorithm(self, max_iter=10000):
        # Solves every block independently in a process pool; each
        # worker runs its own full ADMM loop (see mp_dynamic_gl), so
        # no cross-block iteration happens here.
        start_time = time.time()
        p = multiprocessing.Pool(self.processes)
        inputs = [(self.thetas[i], self.z0s[i], self.u0s[i],
                   self.emp_cov_mat[i], self.rho,
                   self.lambd, self.nju, self.dimension, max_iter)
                  for i in range(self.blocks)]
        self.thetas = p.map(mp_dynamic_gl, inputs)
        p.close()
        p.join()
        self.run_time = '{0:.3g}'.format(time.time() - start_time)
        # Final rounding, mirroring BaseGraphicalLasso.final_tuning.
        self.thetas = [np.round(theta, self.roundup) for theta in self.thetas]
+164 −72

File changed.

Preview size limit exceeded, changes collapsed.

+83 −1
Original line number Diff line number Diff line
# graphical_lasso
 No newline at end of file
Time-Varying Graphical Lasso

Contains 2 TVGL and 2 GL solvers for network inference,
and a DataHandler to create and maintain data files.

The solvers use an ADMM algorithm to solve the problem.

Implemented in Python 2.7 with NumPy and multiprocessing.

Based on the paper by Hallac et al. (2017)
  "Network inference via the Time-Varying Graphical Lasso"
  https://arxiv.org/abs/1703.01958

1. Contents
   Files:
   - BaseGraphicalLasso.py -   Parent class for all 4 solvers
   - ParallelTVGL.py       -   TVGL solver using multiprocessing
   - SerialTVGL.py         -   TVGL solver using serial processing
   - StaticGL.py           -   GL solver solving T independent GL problems
   - SingleGL.py           -   GL solver for a single GL problem
   - DataHandler.py        -   Synthetic data generator, results writer
   - penalty_functions.py  -   Penalty functions for algorithms defined
   - run.py                -   Executes solvers

   Folders:
   - omxh_data             -   OMXH25 daily returns data
   - synthetic_data        -   Synthetic data generated with DataHandler
   - networks              -   Network files to be used for generating synthetic data
   - network_results       -   Results from non-synthetic data executions
   - results               -   Results from synthetic data executions
 		      
2. Run solvers

 Run from command line with following options

 - Parallel TVGL:
    1. [data file]
    2. [penalty function index]
    3. [Number of blocks to be created]
    4. [lambda]
    5. [beta]
    6. [number of processes to be used]
       -if higher than # of blocks, # of processes is reduced to # of blocks

    Example:
       $ python ParallelTVGL.py synthetic_data/datafile1.csv 2 10 2 4 4

 - Serial TVGL:
    1. [data file]
    2. [penalty function index]
    3. [Number of blocks to be created]
    4. [lambda]
    5. [beta]

    Example:
       $ python SerialTVGL.py synthetic_data/datafile1.csv 2 10 2 4

 - Static GL:
    1. [data file]
    2. [Number of blocks to be created]
    3. [lambda]
    4. [number of processes to be used]
       -if higher than # of blocks, # of processes is reduced to # of blocks

    Example:
       $ python StaticGL.py synthetic_data/datafile1.csv 2 10 2

 - Single GL:
    1. [data file]
    2. [lambda]

    Example:
       $ python SingleGL.py synthetic_data/datafile1.csv 2

3. Generate synthetic data

 Run from the command line with at least one [network file name]-[number of data points] pair.
 Any number of such pairs can be supplied.

 $ python DataHandler.py [network file name] [number of data points from network] ...

   
Loading