Commit 375bd681, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #563 from peastman/temp

Delete temp directories
parents 68928823 0832fa80
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ import numpy as np
import pandas as pd
import joblib
import os
import shutil
import tempfile
import sklearn
from sklearn.base import BaseEstimator
@@ -44,17 +45,23 @@ class Model(BaseEstimator):
    model_dir: str
      Path to directory where model will be stored.
    """
    self.model_dir_is_temp = False
    if model_dir is not None:
      if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    else:
      model_dir = tempfile.mkdtemp()
      self.model_dir_is_temp = True
    self.model_dir = model_dir
    self.model_instance = model_instance
    self.model_class = model_instance.__class__

    self.verbose = verbose

  def __del__(self):
    if 'model_dir_is_temp' in dir(self) and self.model_dir_is_temp:
      shutil.rmtree(self.model_dir)

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
+14 −18
Original line number Diff line number Diff line
@@ -15,23 +15,19 @@ from deepchem.models import Model
from deepchem.data import DiskDataset
from deepchem.trans import undo_transforms


class SingletaskToMultitask(Model):
  """
  Convenience class to let singletask models be fit on multitask data.

  Warning: This current implementation is only functional for sklearn models. 
  """

  def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
    super().__init__(self, model_dir=model_dir, verbose=verbose)
    self.tasks = tasks
    if model_dir is not None:
      if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    else:
      model_dir = tempfile.mkdtemp()
    self.model_dir = model_dir
    self.task_model_dirs = {}
    self.model_builder = model_builder
    self.verbose = True 
    log("About to initialize singletask to multitask model", self.verbose)
    for task in self.tasks:
      task_model_dir = os.path.join(self.model_dir, str(task))
@@ -51,8 +47,8 @@ class SingletaskToMultitask(Model):
      task_data_dirs.append(task_data_dir)
    task_datasets = self._to_singletask(dataset, task_data_dirs)
    for task, task_dataset in zip(self.tasks, task_datasets):
      log("Dataset for task %s has shape %s"
          % (task, str(task_dataset.get_shape())), self.verbose)
      log("Dataset for task %s has shape %s" %
          (task, str(task_dataset.get_shape())), self.verbose)
    return task_datasets

  @staticmethod
@@ -61,8 +57,10 @@ class SingletaskToMultitask(Model):
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbose)
    task_datasets = [DiskDataset.create_dataset([], task_dirs[task_num], [task])
                    for (task_num, task) in enumerate(tasks)]
    task_datasets = [
        DiskDataset.create_dataset([], task_dirs[task_num], [task])
        for (task_num, task) in enumerate(tasks)
    ]
    #task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbose)
@@ -84,7 +82,6 @@ class SingletaskToMultitask(Model):

    return task_datasets


  def fit(self, dataset, **kwargs):
    """
    Updates all singletask models with new information.
@@ -97,8 +94,7 @@ class SingletaskToMultitask(Model):
    task_datasets = self._create_task_datasets(dataset)
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbose)
      task_model = self.model_builder(
          self.task_model_dirs[task])
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.fit(task_datasets[ind], **kwargs)
      task_model.save()

@@ -156,8 +152,8 @@ class SingletaskToMultitask(Model):
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = np.squeeze(task_model.predict_proba(
          dataset, transformers, n_classes))
      y_pred[:, ind] = np.squeeze(
          task_model.predict_proba(dataset, transformers, n_classes))
    return y_pred

  def save(self):
+2 −7
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ class Sequential(Model):
  """

  def __init__(self, name=None, logdir=None):
    super().__init__(self, model_dir=logdir)
    self.layers = []  # stack of layers
    self.outputs = None  # tensors (length 1)

@@ -64,13 +65,7 @@ class Sequential(Model):
    config = tf.ConfigProto(allow_soft_placement=True)
    self.session = tf.Session(graph=self.graph, config=config)
    # Path to save checkpoint files
    if logdir is not None:
      if not os.path.exists(logdir):
        os.makedirs(logdir)
    else:
      logdir = tempfile.mkdtemp()
    self.logdir = logdir
    self._save_path = os.path.join(self.logdir, 'model.ckpt')
    self._save_path = os.path.join(self.model_dir, 'model.ckpt')

  def add(self, layer):
    """Adds a layer instance on top of the layer stack.
+6 −12
Original line number Diff line number Diff line
@@ -190,22 +190,16 @@ class TensorflowGraphModel(Model):
    self.batch_size = batch_size
    self.n_classes = n_classes
    self.pad_batches = pad_batches
    self.verbose = verbose
    self.seed = seed

    if logdir is not None:
      if not os.path.exists(logdir):
        os.makedirs(logdir)
    else:
      logdir = tempfile.mkdtemp()
    self.logdir = logdir
    super().__init__(self, model_dir=logdir, verbose=verbose)

    # Guard variable to make sure we don't Restore() this model
    # from a disk checkpoint more than once.
    self._restored_model = False
    # Path to save checkpoint files, which matches the
    # replicated supervisor's default path.
    self._save_path = os.path.join(logdir, 'model.ckpt')
    self._save_path = os.path.join(self.model_dir, 'model.ckpt')

    self.train_graph = self.construct_graph(training=True, seed=self.seed)
    self.eval_graph = self.construct_graph(training=False, seed=self.seed)
@@ -593,9 +587,9 @@ class TensorflowGraphModel(Model):
  def _find_last_checkpoint(self):
    """Finds last saved checkpoint."""
    highest_num, last_checkpoint = -np.inf, None
    for filename in os.listdir(self.logdir):
      # checkpoints look like logdir/model.ckpt-N
      # self._save_path is "logdir/model.ckpt"
    for filename in os.listdir(self.model_dir):
      # checkpoints look like model_dir/model.ckpt-N
      # self._save_path is "model_dir/model.ckpt"
      if os.path.basename(self._save_path) in filename:
        try:
          N = int(filename.split("-")[1].split(".")[0])
@@ -604,7 +598,7 @@ class TensorflowGraphModel(Model):
            last_checkpoint = "model.ckpt-" + str(N)
        except ValueError:
          pass
    return os.path.join(self.logdir, last_checkpoint)
    return os.path.join(self.model_dir, last_checkpoint)


class TensorflowClassifier(TensorflowGraphModel):
+2 −8
Original line number Diff line number Diff line
@@ -84,17 +84,11 @@ class MultitaskGraphClassifier(Model):
               pad_batches=True,
               verbose=True):

    self.verbose = verbose
    super().__init__(self, model_dir=logdir, verbose=verbose)
    self.n_tasks = n_tasks
    self.final_loss = final_loss
    self.model = model
    self.sess = tf.Session(graph=self.model.graph)
    if logdir is not None:
      if not os.path.exists(logdir):
        os.makedirs(logdir)
    else:
      logdir = tempfile.mkdtemp()
    self.logdir = logdir

    with self.model.graph.as_default():
      # Extract model info 
@@ -129,7 +123,7 @@ class MultitaskGraphClassifier(Model):

      # Path to save checkpoint files, which matches the
      # replicated supervisor's default path.
      self._save_path = os.path.join(logdir, 'model.ckpt')
      self._save_path = os.path.join(self.model_dir, 'model.ckpt')

  def build(self):
    # Create target inputs
Loading