Commit 7efa6437 authored by miaecle's avatar miaecle
Browse files

style change

parent 3edcd2fc
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line

from deepchem.molnet.load_function.chembl_datasets import load_chembl
from deepchem.molnet.load_function.clintox_datasets import load_clintox
from deepchem.molnet.load_function.delaney_datasets import load_delaney
+58 −37
Original line number Diff line number Diff line
@@ -10,41 +10,60 @@ import deepchem as dc

from deepchem.molnet.load_function.chembl_tasks import chembl_tasks

def load_chembl(shard_size=2000, featurizer="ECFP", set="5thresh", split="random"):

def load_chembl(shard_size=2000,
                featurizer="ECFP",
                set="5thresh",
                split="random"):

  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_path = os.path.join(
      data_dir, "chembl_%s.csv.gz" % set)
  dataset_path = os.path.join(data_dir, "chembl_%s.csv.gz" % set)
  if not os.path.exists(dataset_path):
    os.system('wget -P ' + data_dir + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_5thresh.csv.gz')
    os.system('wget -P ' + data_dir + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_test.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_train.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_valid.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_test.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_train.csv.gz')
    os.system('wget -P ' + os.path.join(data_dir, 'chembl_year_sets') + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_valid.csv.gz')
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_5thresh.csv.gz'
    )
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_test.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_train.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_valid.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_test.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_train.csv.gz'
    )
    os.system(
        'wget -P ' + os.path.join(data_dir, 'chembl_year_sets') +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_valid.csv.gz'
    )

  print("About to load ChEMBL dataset.")
  if split == "year":
    train_files = os.path.join(data_dir,
        "./chembl_year_sets/chembl_%s_ts_train.csv.gz" % set)
    valid_files = os.path.join(data_dir,
        "./chembl_year_sets/chembl_%s_ts_valid.csv.gz" % set)
    test_files = os.path.join(data_dir,
        "./chembl_year_sets/chembl_%s_ts_test.csv.gz" % set)
    train_files = os.path.join(
        data_dir, "./chembl_year_sets/chembl_%s_ts_train.csv.gz" % set)
    valid_files = os.path.join(
        data_dir, "./chembl_year_sets/chembl_%s_ts_valid.csv.gz" % set)
    test_files = os.path.join(
        data_dir, "./chembl_year_sets/chembl_%s_ts_test.csv.gz" % set)

  # Featurize ChEMBL dataset
  print("About to featurize ChEMBL dataset.")
@@ -60,34 +79,36 @@ def load_chembl(shard_size=2000, featurizer="ECFP", set="5thresh", split="random"):

  if split == "year":
    print("Featurizing train datasets")
    train_dataset = loader.featurize(
        train_files, shard_size=shard_size)
    train_dataset = loader.featurize(train_files, shard_size=shard_size)
    print("Featurizing valid datasets")
    valid_dataset = loader.featurize(
        valid_files, shard_size=shard_size)
    valid_dataset = loader.featurize(valid_files, shard_size=shard_size)
    print("Featurizing test datasets")
    test_dataset = loader.featurize(
        test_files, shard_size=shard_size)
    test_dataset = loader.featurize(test_files, shard_size=shard_size)
  else:
    dataset = loader.featurize(dataset_path, shard_size=shard_size)
  # Initialize transformers
  print("About to transform data")
  if split == "year":
    transformers = [
        dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=train_dataset)
    ]
    for transformer in transformers:
      train = transformer.transform(train_dataset)
      valid = transformer.transform(valid_dataset)
      test = transformer.transform(test_dataset)
  else:
    transformers = [
        dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
        dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
    ]
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
      'scaffold': dc.splits.ScaffoldSplitter()
  }

  if split in splitters:
    splitter = splitters[split]
+142 −138

File changed.

Preview size limit exceeded, changes collapsed.

+13 −9
Original line number Diff line number Diff line
@@ -19,11 +19,12 @@ def load_clintox(featurizer='ECFP', split='index'):
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(
      data_dir, "clintox.csv.gz")
  dataset_file = os.path.join(data_dir, "clintox.csv.gz")
  if not os.path.exists(dataset_file):
    os.system('wget -P ' + data_dir + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clintox.csv.gz')
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clintox.csv.gz'
    )

  print("About to load clintox dataset.")
  dataset = dc.utils.save.load_from_disk(dataset_file)
@@ -48,15 +49,18 @@ def load_clintox(featurizer='ECFP', split='index'):
  # Transform clintox dataset
  print("About to transform clintox dataset.")
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  # Split clintox dataset
  print("About to split clintox dataset.")
  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
      'scaffold': dc.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)

+16 −12
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from __future__ import unicode_literals
import os
import deepchem as dc


def load_delaney(featurizer='ECFP', split='index'):
  """Load delaney datasets."""
  # Featurize Delaney dataset
@@ -17,12 +18,13 @@ def load_delaney(featurizer='ECFP', split='index'):
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(
      data_dir, "delaney-processed.csv")
  dataset_file = os.path.join(data_dir, "delaney-processed.csv")

  if not os.path.exists(dataset_file):
    os.system('wget -P ' + data_dir + 
    ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/delaney-processed.csv'
    )

  delaney_tasks = ['measured log solubility in mols per litre']
  if featurizer == 'ECFP':
@@ -34,20 +36,22 @@ def load_delaney(featurizer='ECFP', split='index'):

  loader = dc.data.CSVLoader(
      tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(
      dataset_file, shard_size=8192)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers 
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
      'scaffold': dc.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return delaney_tasks, (train, valid, test), transformers
Loading