Commit 69b0aa24 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

getting sweetlead working

parent 2d77aa3a
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ from deepchem.molnet.load_function.qm8_datasets import load_qm8
from deepchem.molnet.load_function.qm9_datasets import load_qm9
from deepchem.molnet.load_function.sampl_datasets import load_sampl
from deepchem.molnet.load_function.sider_datasets import load_sider
from deepchem.molnet.load_function.sweetlead_datasets import load_sweet
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast

+61 −0
Original line number Diff line number Diff line
"""
SWEET dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_sweet(featurizer='ECFP', split='index', reload=True, frac_train=.8):
  """Load the SWEETLEAD dataset, featurize it and split it three ways.

  The file ``sweet.csv.gz`` is looked up in the DeepChem data directory and
  downloaded when it is missing. Its ``smiles`` column is featurized, a
  ``BalancingTransformer`` is applied, and the result is split into
  train/valid/test sets with the splitter's default fractions.

  Parameters
  ----------
  featurizer: str
    Name of the featurization. Only ``'ECFP'`` is supported.
  split: str
    Splitter to use: ``'index'``, ``'random'``, ``'scaffold'`` or ``'task'``.
  reload: bool
    When true, the three splits and the transformers are saved under
    ``<data_dir>/sweetlead/<featurizer>/<split>``. This function only writes
    that directory; it does not read a previously saved copy back.
  frac_train: float
    Accepted for interface compatibility. It is currently not forwarded to
    the splitter.

  Returns
  -------
  tuple
    ``(SWEET_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  ValueError
    If ``featurizer`` is anything other than ``'ECFP'``.
  KeyError
    If ``split`` is not one of the supported splitter names.
  """
  # This module defines no module-level logger, so obtain one locally.
  import logging
  logger = logging.getLogger(__name__)

  # Load Sweetlead dataset
  logger.info("About to load Sweetlead dataset.")

  # Reject unsupported arguments before any download or featurization work.
  if featurizer != 'ECFP':
    raise ValueError("Other featurizations not supported")
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'task': dc.splits.TaskSplitter()
  }
  splitter = splitters[split]

  data_dir = dc.utils.get_data_dir()
  dataset_file = os.path.join(data_dir, "sweet.csv.gz")
  if not os.path.exists(dataset_file):
    dc.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/sweet.csv.gz'
    )

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  # Kept in its own name so the string `featurizer` is still available for
  # building the save directory below.
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  SWEET_tasks = ["task"]

  loader = dc.data.CSVLoader(
      tasks=SWEET_tasks, smiles_field="smiles", featurizer=fingerprint)
  dataset = loader.featurize(dataset_file)

  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  train, valid, test = splitter.train_valid_test_split(dataset)

  if reload:
    save_dir = os.path.join(
        data_dir, "sweetlead/" + featurizer + "/" + str(split))
    dc.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                       transformers)

  return SWEET_tasks, (train, valid, test), transformers
+25 −28
Original line number Diff line number Diff line
@@ -17,48 +17,45 @@ from deepchem import metrics
# NOTE(review): this span appears to be a rendered diff hunk without +/-
# markers, so several statements show up twice: once live and once in a
# commented-out form. Which of each pair is the current version cannot be
# determined from this view -- confirm against the real file before editing.
# NOTE(review): `sys`, `os`, `np` and `dc` are used below but are not imported
# in the visible lines; presumably they are imported above this span -- confirm.
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.splits import StratifiedSplitter, RandomSplitter
from sweetlead_datasets import load_sweet
#from sweetlead_datasets import load_sweet

# Extend the import search path with the sibling example directories so the
# `tox_datasets` / `sider_datasets` imports below can be resolved.
sys.path.append('./../toxcast')
sys.path.append('./../sider')
#sys.path.append('./../toxcast')
#sys.path.append('./../sider')
#
#from tox_datasets import load_tox
#from sider_datasets import load_sider

from tox_datasets import load_tox
from sider_datasets import load_sider
#"""
#Load toxicity models now
#"""

"""
Load toxicity models now
"""

# Set some global variables up top
reload = False
verbosity = "high"
## Set some global variables up top
#reload = False
#verbosity = "high"
#
#base_tox_data_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_data"

# Hard-coded absolute path on the original author's machine.
base_tox_data_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_data"

tox_tasks, tox_dataset, tox_transformers = load_tox(
    base_tox_data_dir, reload=reload)
# Rebinds the same three names, replacing the values from the load_tox call
# directly above.
tox_tasks, tox_dataset, tox_transformers = dc.molnet.load_toxcast()

#removes directory if present -- warning
base_tox_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_analysis"
#base_tox_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_analysis"

# Per-split and model output directories underneath base_tox_dir.
tox_train_dir = os.path.join(base_tox_dir, "train_dataset")
tox_valid_dir = os.path.join(base_tox_dir, "valid_dataset")
tox_test_dir = os.path.join(base_tox_dir, "test_dataset")
tox_model_dir = os.path.join(base_tox_dir, "model")
#tox_train_dir = os.path.join(base_tox_dir, "train_dataset")
#tox_valid_dir = os.path.join(base_tox_dir, "valid_dataset")
#tox_test_dir = os.path.join(base_tox_dir, "test_dataset")
#tox_model_dir = os.path.join(base_tox_dir, "model")

tox_splitter = StratifiedSplitter()
#tox_splitter = StratifiedSplitter()

#default split is 80-10-10 train-valid-test split
# The three directories defined above are passed positionally after the dataset.
tox_train_dataset, tox_valid_dataset, tox_test_dataset = tox_splitter.train_valid_test_split(
  tox_dataset, tox_train_dir, tox_valid_dir, tox_test_dir)

# Fit Logistic Regression models
# Every task in tox_tasks is labelled as a classification task.
tox_task_types = {task: "classification" for task in tox_tasks}
## Fit Logistic Regression models
#tox_task_types = {task: "classification" for task in tox_tasks}


# ROC-AUC metric combined with np.mean, in classification mode.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
# Rebinds classification_metric; this form omits the `verbosity` argument.
classification_metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")
params_dict = {
    "batch_size": None,
    "data_shape": tox_train_dataset.get_data_shape(),
@@ -113,7 +110,7 @@ Load sweetlead dataset now. Pass in dataset object and appropriate transformers

base_sweet_data_dir = "/home/apappu/deepchem-models/toxcast_models/sweetlead/sweet_data"

sweet_dataset, sweet_transformers = load_sweet(
sweet_dataset, sweet_transformers = dc.molnet.load_sweet(
    base_sweet_data_dir, reload=reload)

sider_predictions = sider_model.predict(sweet_dataset, sweet_transformers)
+0 −41
Original line number Diff line number Diff line
"""
SWEET dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_sweet(base_dir, frac_train=.8):
  """Load, featurize, balance and index-split the SWEET dataset.

  Reads ``sweet.csv.gz`` from the directory containing this module,
  featurizes its ``smiles`` column with 1024-bit circular fingerprints,
  applies a ``BalancingTransformer`` and splits the result with an
  ``IndexSplitter`` using the splitter's default fractions.

  Parameters
  ----------
  base_dir: str
    Accepted for interface compatibility. It is currently not used.
  frac_train: float
    Accepted for interface compatibility. It is currently not forwarded to
    the splitter.

  Returns
  -------
  tuple
    ``(SWEET_tasks, (train, valid, test), transformers)``.
  """
  # Local import: pandas is only needed here to read the CSV header.
  import pandas as pd

  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Load SWEET dataset
  dataset_file = os.path.join(
      current_dir, "./sweet.csv.gz")

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = dc.feat.CircularFingerprint(size=1024)
  # Task names are every header column after the first; only the header row
  # is parsed (nrows=0).
  # NOTE(review): assumes the first column is the SMILES column -- confirm
  # against the data file.
  header = pd.read_csv(dataset_file, nrows=0)
  SWEET_tasks = header.columns.values[1:].tolist()

  loader = dc.data.CSVLoader(
      tasks=SWEET_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitter = dc.splits.IndexSplitter()
  train, valid, test = splitter.train_valid_test_split(dataset)

  return SWEET_tasks, (train, valid, test), transformers