Commit fe358db6 authored by Bharath Ramsundar
Browse files

Fixes to get examples running again.

parent d073d041
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -221,7 +221,11 @@ class Dataset(object):
  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
    n_samples = len(X)
    # The -1 lets numpy infer the second dimension (the number of tasks) from y's size
    ######################################################### DEBUG
    if n_samples > 0:
      y = np.reshape(y, (n_samples, -1))
    ######################################################### DEBUG
    #y = np.reshape(y, (n_samples, -1))
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
@@ -234,9 +238,11 @@ class Dataset(object):

  def select(self, select_dir, indices):
    """Creates a new dataset from a selection of indices from self."""
    indices = np.array(indices)
    ################################################### DEBUG
    indices = np.array(indices).astype(int)
    X, y, w, ids = self.to_numpy()
    tasks = self.get_task_names()
    ################################################### DEBUG
    X_sel, y_sel, w_sel, ids_sel = (
        X[indices], y[indices], w[indices], ids[indices])
    return Dataset.from_numpy(select_dir, X_sel, y_sel, w_sel, ids_sel, tasks)
+29 −43
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ import deepchem
import tempfile, shutil
from deepchem.utils.save import load_from_disk
from deepchem.splits import SpecifiedSplitter
from deepchem.featurizers import UserDefinedFeaturizer 
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.datasets import Dataset
from deepchem.transformers import NormalizationTransformer
@@ -26,6 +27,7 @@ def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
@@ -53,17 +55,12 @@ def load_bace(mode="regression", transform=True, split="20-80"):

  #Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  full_dir = os.path.join(base_dir, "full_dataset")
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")
  crystal_feature_dir = os.path.join(base_dir, "crystal_feature")
  crystal_samples_dir = os.path.join(base_dir, "crystal_samples")


  if mode == "regression":
    bace_tasks = ["pIC50"]
@@ -71,42 +68,36 @@ def load_bace(mode="regression", transform=True, split="20-80"):
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizers = [UserDefinedFeaturizer(user_specified_features)]
  featurizer = DataFeaturizer(tasks=bace_tasks,
                              smiles_field="mol",
                              id_field="CID",
                              user_specified_features=user_specified_features,
                              split_field="Model")
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir, samples_dir, shard_size=2000,
      reload=reload)

  crystal_featurized_samples = featurizer.featurize(
      crystal_dataset_file, crystal_feature_dir, crystal_samples_dir,
  shard_size=2000)
                              featurizers=featurizers)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = featurizer.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)


  splitter = SpecifiedSplitter(verbosity=verbosity)
  train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
      featurized_samples, train_dir, valid_dir, test_dir,
      reload=reload)
  if (not reload or not os.path.exists(train_dir) or not os.path.exists(valid_dir)
      or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  #NOTE THE RENAMING:
  if split == "20-80":
    valid_samples, test_samples = test_samples, valid_samples

  train_dataset = Dataset(data_dir=train_dir, samples=train_samples, 
                          featurizers=[], tasks=bace_tasks,
                          use_user_specified_features=True)
  valid_dataset = Dataset(data_dir=valid_dir, samples=valid_samples, 
                          featurizers=[], tasks=bace_tasks,
                          use_user_specified_features=True)
  test_dataset = Dataset(data_dir=test_dir, samples=test_samples, 
                         featurizers=[], tasks=bace_tasks,
                         use_user_specified_features=True)
  crystal_dataset = Dataset(data_dir=crystal_dir,
                            samples=crystal_featurized_samples, 
                            featurizers=[], tasks=bace_tasks,
                            use_user_specified_features=True)
    valid_dataset, test_dataset = test_dataset, valid_dataset
  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
@@ -116,7 +107,7 @@ def load_bace(mode="regression", transform=True, split="20-80"):
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform:
  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
@@ -130,14 +121,9 @@ def load_bace(mode="regression", transform=True, split="20-80"):
    input_transformers, output_transformers = [], []
  
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset, crystal_dataset]:
    for transformer in transformers:
      transformer.transform(train_dataset)
  for transformer in transformers:
      transformer.transform(valid_dataset)
  for transformer in transformers:
      transformer.transform(test_dataset)
  for transformer in transformers:
      transformer.transform(crystal_dataset)
        transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
+10 −31
Original line number Diff line number Diff line
@@ -8,24 +8,11 @@ from __future__ import unicode_literals
import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
@@ -33,10 +20,10 @@ def load_muv(base_dir, reload=True):
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  #base_dir = "/scratch/users/rbharath/muv_multitask_analysis"
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
@@ -44,8 +31,6 @@ def load_muv(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
@@ -66,26 +51,20 @@ def load_muv(base_dir, reload=True):

  featurizer = DataFeaturizer(tasks=all_MUV_tasks,
                              smiles_field="smiles",
                              compound_featurizers=featurizers,
                              featurizers=featurizers,
                              verbosity=verbosity)
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir,
      samples_dir, shard_size=8192,
      reload=reload)

  dataset = Dataset(data_dir=data_dir, samples=featurized_samples, 
                    featurizers=featurizers, tasks=all_MUV_tasks,
                    verbosity=verbosity, reload=reload)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  input_transformers = []
  output_transformers = []
  weight_transformers = [
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  transformers = input_transformers + output_transformers + weight_transformers
  if not reload:
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_MUV_tasks, featurized_samples, dataset, transformers
  return all_MUV_tasks, dataset, transformers
+37 −56
Original line number Diff line number Diff line
@@ -8,31 +8,18 @@ from __future__ import unicode_literals
import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
@@ -43,8 +30,6 @@ def load_pcba(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
@@ -58,7 +43,8 @@ def load_pcba(base_dir, reload=True):
  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  all_PCBA_tasks = ['PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
@@ -86,26 +72,21 @@ def load_pcba(base_dir, reload=True):

  featurizer = DataFeaturizer(tasks=all_PCBA_tasks,
                              smiles_field="smiles",
                              compound_featurizers=featurizers,
                              featurizers=featurizers,
                              verbosity=verbosity)
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir,
      samples_dir, shard_size=8192,
      reload=reload)

  dataset = Dataset(data_dir=data_dir, samples=featurized_samples, 
                    featurizers=featurizers, tasks=all_PCBA_tasks,
                    verbosity=verbosity, reload=reload)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  input_transformers = []
  output_transformers = []
  weight_transformers = [
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  transformers = input_transformers + output_transformers + weight_transformers
  if not reload:

  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_PCBA_tasks, featurized_samples, dataset, transformers
  return all_PCBA_tasks, dataset, transformers
+0 −5
Original line number Diff line number Diff line
@@ -54,11 +54,6 @@ class SklearnModel(Model):
    """
    Makes predictions on batch of data.
    """
    ################################################ DEBUG
    print("SklearnModel.predict_on_batch()")
    print("X.shape, np.main(X), np.amax(X)")
    print(X.shape, np.amin(X), np.amax(X))
    ################################################ DEBUG
    return self.raw_model.predict(X)

  def predict_proba_on_batch(self, X):
Loading