Commit 724a4fac authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #272 from apappu97/master

moving in sider sweet and toxcast
parents 96c7ab7b 9e27e19d
Loading
Loading
Loading
Loading
+34 KiB

File added.

No diff preview for this file type.

+85 −0
Original line number Diff line number Diff line
"""
SIDER dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
from deepchem.datasets import DiskDataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

def load_sider(base_dir, reload=True, frac_train=.8):
  """Load SIDER, featurize it, and split it into train/valid datasets.

  Parameters
  ----------
  base_dir: str
    Directory holding the featurized dataset and the train/valid splits.
    Created if missing; deleted first when `reload` is False.
  reload: bool
    If True, reuse a featurized dataset already stored under `base_dir`.
    If False, wipe `base_dir` and featurize from scratch.
  frac_train: float
    Fraction of the examples, taken in file order without shuffling, that
    goes to the training set. The remainder becomes the validation set.

  Returns
  -------
  tuple
    `(tasks, (train_dataset, valid_dataset), transformers)` where `tasks` is
    the list of task names actually kept in the two datasets.
  """
  verbosity = "high"
  # Only the first `num_tasks` task columns are kept in the returned datasets.
  num_tasks = 17

  # The base_dir holds the results of all analysis. When the caller asks for
  # a fresh run, remove anything left over from a previous one.
  if not reload and os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directories to store the featurized dataset and the two splits.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SIDER dataset (the CSV ships next to this module).
  print("About to load SIDER dataset.")
  dataset_file = os.path.join(
      current_dir, "./sider.csv.gz")
  raw_df = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(raw_df.columns.values))
  print("Number of examples in dataset: %s" % str(raw_df.shape[0]))

  # Featurize SIDER dataset
  print("About to featurize SIDER dataset.")
  featurizer = CircularFingerprint(size=1024)
  # Every column after the first one is treated as a task.
  SIDER_tasks = raw_df.columns.values[1:].tolist()

  loader = DataLoader(tasks=SIDER_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  # Featurize only when asked to, or when no cached copy exists.
  regen = not reload or not os.path.exists(data_dir)
  if regen:
    dataset = loader.featurize(dataset_file, data_dir)
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  # A dataset loaded from the cache is not transformed a second time.
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  # Slice bounds must be integers; frac_train * len(...) is a float.
  num_train = int(frac_train * len(dataset))
  SIDER_tasks = SIDER_tasks[:num_tasks]
  print("Using following tasks")
  print(SIDER_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SIDER_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SIDER_tasks)

  return SIDER_tasks, (train_dataset, valid_dataset), transformers
+77 −0
Original line number Diff line number Diff line
"""
Script that trains Keras multitask models on SIDER dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
from sider_datasets import load_sider
from deepchem.datasets import Dataset
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.utils.evaluate import Evaluator
from deepchem.models.keras_models.fcnet import MultiTaskDNN
from deepchem.models.keras_models import KerasModel

# Set some global variables up top
np.random.seed(123)
reload = True
verbosity = "high"

# Featurized data is kept here so that `reload` can reuse it between runs.
base_data_dir = "/tmp/sider_keras"

# Model output goes to a scratch directory that is recreated on every run.
base_dir = "/tmp/sider_analysis"
model_dir = os.path.join(base_dir, "model")
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(model_dir)

# Load SIDER data. load_sider returns the train/valid pair already split, so
# it is loaded once and unpacked rather than loaded a second time.
sider_tasks, sider_datasets, transformers = load_sider(
    base_data_dir, reload=reload)
train_dataset, valid_dataset = sider_datasets
print("len(train_dataset)")
print(len(train_dataset))
print("len(valid_dataset)")
print(len(valid_dataset))
# Must match the fingerprint size used by load_sider.
n_features = 1024


# Build model
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")

learning_rates = [0.0003, 0.001, 0.003]
hidden_units = [1000, 500]
dropouts = [.5, .25]
num_hidden_layers = [1, 2]

# hyperparameter sweep here
# NOTE(review): `hidden_unit` (and `num_hidden_layers`) are not passed to
# MultiTaskDNN because its keyword names are not visible from this script;
# wire them in once confirmed. Every configuration also saves to the same
# model_dir, so only the last saved model survives.
for learning_rate in learning_rates:
  for hidden_unit in hidden_units:
    for dropout in dropouts:
      print("learning_rate=%s, hidden_unit=%s, dropout=%s"
            % (learning_rate, hidden_unit, dropout))
      keras_model = MultiTaskDNN(len(sider_tasks), n_features, "classification",
                                 dropout=dropout, learning_rate=learning_rate,
                                 decay=1e-4)
      model = KerasModel(keras_model, model_dir, verbosity=verbosity)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
      train_scores = train_evaluator.compute_model_performance([classification_metric])

      print("Train scores")
      print(train_scores)

      valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
      valid_scores = valid_evaluator.compute_model_performance([classification_metric])

      print("Validation scores")
      print(valid_scores)
+91 −0
Original line number Diff line number Diff line
"""
Script that trains Sklearn multitask models on the sider dataset
@Author Bharath Ramsundar, Aneesh Pappu
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
from sklearn.ensemble import RandomForestClassifier
from deepchem.datasets import Dataset
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import Evaluator
from sider_datasets import load_sider
from deepchem.splits import RandomSplitter

# Set some global variables up top
reload = False
verbosity = "high"

base_data_dir = "/tmp/sider_rf"

# load_sider returns a (train, valid) pair of datasets, not a single dataset,
# so the pair is unpacked here instead of being handed to a splitter.
sider_tasks, sider_datasets, transformers = load_sider(
    base_data_dir, reload=reload)
train_dataset, valid_dataset = sider_datasets

# WARNING: the analysis directory is deleted and recreated on every run.
base_dir = "/tmp/sider_analysis"
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(base_dir)
model_dir = os.path.join(base_dir, "model")

train_dataset.set_verbosity(verbosity)
valid_dataset.set_verbosity(verbosity)

# Fit random forest models, one per task; every task is a classification task.
sider_task_types = {task: "classification" for task in sider_tasks}


classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = {
    "batch_size": None,
    "data_shape": train_dataset.get_data_shape(),
}

if os.path.exists(model_dir):
  shutil.rmtree(model_dir)
os.makedirs(model_dir)

def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
  """Return a SklearnModel that wraps a class-balanced random forest.

  The forest uses 500 trees and `n_jobs=-1`; all other arguments are passed
  straight through to SklearnModel.
  """
  forest = RandomForestClassifier(
      n_estimators=500,
      class_weight="balanced",
      n_jobs=-1)
  return SklearnModel(
      tasks, task_types, model_params, model_dir,
      model_instance=forest,
      verbosity=verbosity)
# Wrap the single-task builder so that one model is built per SIDER task.
model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict, model_dir,
                              model_builder, verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

# Print the computed scores, not just the heading.
print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
 No newline at end of file
+53.4 KiB

File added.

No diff preview for this file type.

Loading