Commit 1009c2b2 authored by miaecle
Browse files

save and load function

parent 4d85bbf4
Loading
Loading
Loading
Loading
+20 −45
Original line number Diff line number Diff line
@@ -7,7 +7,6 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle
from deepchem.molnet.load_function.bace_features import bace_user_specified_features


@@ -15,13 +14,12 @@ def load_bace_regression(featurizer=None, split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "bace_r/" + featurizer + "/" + split)

  dataset_file = os.path.join(data_dir, "bace.csv")

@@ -32,19 +30,10 @@ def load_bace_regression(featurizer=None, split='random', reload=True):
    )

  bace_tasks = ["pIC50"]
  if save:
    save_dir = os.path.join(data_dir, "bace_r/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bace_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
@@ -78,12 +67,10 @@ def load_bace_regression(featurizer=None, split='random', reload=True):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers


@@ -91,13 +78,12 @@ def load_bace_classification(featurizer=None, split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + split)

  dataset_file = os.path.join(data_dir, "bace.csv")

@@ -108,19 +94,10 @@ def load_bace_classification(featurizer=None, split='random', reload=True):
    )

  bace_tasks = ["Class"]
  if save:
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bace_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
@@ -153,10 +130,8 @@ def load_bace_classification(featurizer=None, split='random', reload=True):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers
+10 −23
Original line number Diff line number Diff line
@@ -7,20 +7,18 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle


def load_bbbp(featurizer='ECFP', split='index', reload=True):
  """Load blood-brain barrier penetration datasets """
  # Featurize bbb dataset
  print("About to featurize bbbp dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + split)

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
@@ -31,19 +29,10 @@ def load_bbbp(featurizer='ECFP', split='index', reload=True):

  bbbp_tasks = ["p_np"]

  if save:
    save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bbbp_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
@@ -72,10 +61,8 @@ def load_bbbp(featurizer='ECFP', split='index', reload=True):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return bbbp_tasks, (train, valid, test), transformers
+10 −23
Original line number Diff line number Diff line
@@ -7,7 +7,6 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle
from deepchem.molnet.load_function.chembl_tasks import chembl_tasks


@@ -17,13 +16,12 @@ def load_chembl(shard_size=2000,
                split="random",
                reload=True):

  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "chembl/" + featurizer + "/" + split)

  dataset_path = os.path.join(data_dir, "chembl_%s.csv.gz" % set)
  if not os.path.exists(dataset_path):
@@ -61,19 +59,10 @@ def load_chembl(shard_size=2000,
    )

  print("About to load ChEMBL dataset.")
  if save:
    save_dir = os.path.join(data_dir, "chembl/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return chembl_tasks, all_dataset, transformers

  if split == "year":
@@ -134,10 +123,8 @@ def load_chembl(shard_size=2000,
    splitter = splitters[split]
    print("Performing new split.")
    train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return chembl_tasks, (train, valid, test), transformers
+10 −23
Original line number Diff line number Diff line
@@ -7,7 +7,6 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle


def load_clearance(featurizer='ECFP', split='random', reload=True):
@@ -15,13 +14,12 @@ def load_clearance(featurizer='ECFP', split='random', reload=True):
  # Featurize clearance dataset
  print("About to featurize clearance dataset.")
  print("About to load clearance dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "clearance/" + featurizer + "/" + split)

  dataset_file = os.path.join(data_dir, "clearance.csv")
  if not os.path.exists(dataset_file):
@@ -32,19 +30,10 @@ def load_clearance(featurizer='ECFP', split='random', reload=True):

  clearance_tasks = ['exp']

  if save:
    save_dir = os.path.join(data_dir, "clearance/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return clearance_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
@@ -75,10 +64,8 @@ def load_clearance(featurizer='ECFP', split='random', reload=True):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return clearance_tasks, (train, valid, test), transformers
+10 −23
Original line number Diff line number Diff line
@@ -9,19 +9,17 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle


def load_clintox(featurizer='ECFP', split='index', reload=True):
  """Load clintox datasets."""

  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "clintox/" + featurizer + "/" + split)

  dataset_file = os.path.join(data_dir, "clintox.csv.gz")
  if not os.path.exists(dataset_file):
@@ -36,19 +34,10 @@ def load_clintox(featurizer='ECFP', split='index', reload=True):
  print("Tasks in dataset: %s" % (clintox_tasks))
  print("Number of tasks in dataset: %s" % str(len(clintox_tasks)))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  if save:
    save_dir = os.path.join(data_dir, "clintox/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return clintox_tasks, all_dataset, transformers
  # Featurize clintox dataset
  print("About to featurize clintox dataset.")
@@ -80,11 +69,9 @@ def load_clintox(featurizer='ECFP', split='index', reload=True):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)

  return clintox_tasks, (train, valid, test), transformers
Loading