Commit 6c588b20 authored by Aneesh's avatar Aneesh
Browse files

nci data and rf & dnn scripts

parent 6ab9fefc
Loading
Loading
Loading
Loading
+84 −0
Original line number Diff line number Diff line
"""
NCI dataset loader.
Original Author - Bharath Ramsundar
Author - Aneesh Pappu
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import sys
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

def load_nci(base_dir, reload=True):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  #reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("deleting dir in datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/nci.csv.gz")
  print(dataset_file)
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  #was sorted list originally in muv_datasets.py, but csv is ordered so removed
  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                          'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                          'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
                          'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
                          'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
                          'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
                          'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
                          'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                          'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                          'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                          'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'MDA-N', 'BT-549'])

  featurizer = DataFeaturizer(tasks=all_nci_tasks,
                              smiles_field="smiles",
                              featurizers=featurizers,
                              verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)

  return all_nci_tasks, dataset, transformers

examples/nci/nci_rf.py

0 → 100644
+108 −0
Original line number Diff line number Diff line
"""
Script that trains Sklearn multitask models on nci dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.datasets import Dataset
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import Evaluator
from deepchem.datasets.nci_datasets import load_nci

np.random.seed(123)

# Set some global variables up top

reload = False
verbosity = "high"

#base_data_dir = "/scratch/users/rbharath/muv"
base_data_dir = "/home/apappu/deepchem/examples/nci/dataset"

nci_tasks, dataset, transformers = load_nci(
    base_data_dir, reload=reload)
print("len(dataset)")
print(len(dataset))

#base_dir = "/scratch/users/rbharath/muv_analysis"
base_dir = "/home/apappu/deepchem/examples/nci/"
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
if not os.path.exists(base_dir):
  os.makedirs(base_dir)
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")

print("About to perform train/valid/test split.")
num_train = .8 * len(dataset)
X, y, w, ids = dataset.to_numpy()
num_tasks = 17
nci_tasks = nci_tasks[:num_tasks]
print("Using following tasks")
print(nci_tasks)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, nci_tasks)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, nci_tasks)

# Fit Logistic Regression models
nci_task_types = {task: "classification" for task in nci_tasks}


classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = {
    "batch_size": None,
    "data_shape": train_dataset.get_data_shape(),
}

if os.path.exists(model_dir):
  shutil.rmtree(model_dir)
os.makedirs(model_dir)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
  return SklearnModel(tasks, task_types, model_params, model_dir,
                      #model_instance=LogisticRegression(class_weight="balanced"),
                      model_instance=RandomForestClassifier(
                          class_weight="balanced",
                          n_estimators=500),
                      verbosity=verbosity)
model = SingletaskToMultitask(nci_tasks, nci_task_types, params_dict, model_dir,
                              model_builder, verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)