Commit d9869f00 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #484 from miaecle/molnet

dc.molnet update
parents 49ae1606 5f1c379b
Loading
Loading
Loading
Loading
+324 −215

File changed.

Preview size limit exceeded, changes collapsed.

+8 −0
Original line number Diff line number Diff line
@@ -2,16 +2,23 @@ from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
from deepchem.molnet.load_function.chembl_datasets import load_chembl
from deepchem.molnet.load_function.clearance_datasets import load_clearance
from deepchem.molnet.load_function.clintox_datasets import load_clintox
from deepchem.molnet.load_function.delaney_datasets import load_delaney
from deepchem.molnet.load_function.hiv_datasets import load_hiv
from deepchem.molnet.load_function.hopv_datasets import load_hopv
from deepchem.molnet.load_function.kaggle_datasets import load_kaggle
from deepchem.molnet.load_function.lipo_datasets import load_lipo
from deepchem.molnet.load_function.muv_datasets import load_muv
from deepchem.molnet.load_function.nci_datasets import load_nci
from deepchem.molnet.load_function.pcba_datasets import load_pcba
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid
from deepchem.molnet.load_function.ppb_datasets import load_ppb
from deepchem.molnet.load_function.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
from deepchem.molnet.load_function.qm8_datasets import load_qm8
from deepchem.molnet.load_function.qm9_datasets import load_qm9
from deepchem.molnet.load_function.sampl_datasets import load_sampl
from deepchem.molnet.load_function.sider_datasets import load_sider
@@ -19,3 +26,4 @@ from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast

from deepchem.molnet.run_benchmark import run_benchmark
from deepchem.molnet.run_benchmark_low_data import run_benchmark_low_data
+127 −0
Original line number Diff line number Diff line
def _featurizer_table():
  """Build the (dataset, model) -> [featurizer_name, size] lookup table.

  Every value is a fresh two-element list. The first element is the
  featurizer name ('ECFP', 'GraphConv', 'grid' or None). The second is an
  int, or a pair of ints for the qm* datasets.
  NOTE(review): the second element looks like the input feature dimension
  expected by the model -- confirm against the benchmark runner.
  """
  fingerprint_models = ('logreg', 'tf', 'tf_robust', 'rf', 'irv')
  extra_graph_models = ('siamese', 'attn', 'res')
  # Only these classification datasets carry the three extra graph models.
  with_extra_graph_models = ('muv', 'sider', 'tox21')

  table = {}

  # Classification datasets.
  for name in ('bace_c', 'bbbp', 'clintox', 'hiv', 'muv', 'pcba', 'sider',
               'tox21', 'toxcast'):
    for model in fingerprint_models:
      table[(name, model)] = ['ECFP', 1024]
    table[(name, 'graphconv')] = ['GraphConv', 75]
    if name in with_extra_graph_models:
      for model in extra_graph_models:
        table[(name, model)] = ['GraphConv', 75]

  # Regression datasets featurized from molecules.
  for name in ('bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'lipo',
               'nci', 'ppb', 'sampl'):
    table[(name, 'tf_regression')] = ['ECFP', 1024]
    table[(name, 'rf_regression')] = ['ECFP', 1024]
    table[(name, 'graphconvreg')] = ['GraphConv', 75]

  # Regression datasets with a fixed, dataset-specific entry.
  for name, entry in (('kaggle', (None, 14293)), ('pdbbind', ('grid', 2052))):
    for model in ('tf_regression', 'rf_regression'):
      table[(name, model)] = list(entry)

  # qm* datasets: a single model each, sized by a square [n, n] pair.
  for name, side in (('qm7', 23), ('qm7b', 23), ('qm8', 26), ('qm9', 29)):
    table[(name, 'tf_regression_ft')] = [None, [side, side]]

  return table


# Supported (dataset, model) combinations and the featurizer each one uses.
CheckFeaturizer = _featurizer_table()

def _split_table():
  """Build the dataset -> list of accepted split names lookup table.

  Each value is a fresh list, so entries never share a list object.
  """
  no_index = ('random', 'scaffold')
  standard = ('index', 'random', 'scaffold')
  stratified = ('index', 'random', 'stratified')

  rows = (
      ('bace_c', no_index),
      ('bace_r', no_index),
      ('bbbp', no_index),
      ('chembl', standard + ('year',)),
      ('clearance', standard),
      ('clintox', standard),
      ('delaney', standard),
      ('hiv', standard + ('butina',)),
      ('hopv', standard + ('butina',)),
      # kaggle is already split, so no splitter is required
      ('kaggle', ('index',)),
      ('lipo', standard),
      ('muv', standard + ('task',)),
      ('nci', standard),
      ('pcba', standard),
      ('pdbbind', ('index', 'random')),
      ('ppb', standard),
      ('qm7', stratified),
      ('qm7b', stratified),
      ('qm8', stratified),
      ('qm9', stratified),
      ('sampl', standard),
      ('sider', standard + ('task',)),
      ('tox21', standard + ('butina', 'task')),
      ('toxcast', standard),
  )

  table = {}
  for name, splits in rows:
    table[name] = list(splits)
  return table


# Split names accepted for each dataset.
CheckSplit = _split_table()
+113 −0
Original line number Diff line number Diff line
"""
bace dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem
from deepchem.molnet.load_function.bace_features import bace_user_specified_features


def load_bace_regression(featurizer=None, split='random'):
  """Load the bace dataset as a single-task regression problem ("pIC50").

  The CSV is expected at ``$DEEPCHEM_DATA_DIR/bace.csv`` (``/tmp/bace.csv``
  when the variable is unset) and is fetched with ``wget`` when it is not
  already there.

  Parameters
  ----------
  featurizer: str, None, or featurizer object
    'ECFP' selects ``CircularFingerprint(size=1024)``, 'GraphConv' selects
    ``ConvMolFeaturizer()`` and 'Raw' selects ``RawFeaturizer()``. None
    selects ``UserDefinedFeaturizer(bace_user_specified_features)``. Any
    other value is handed to ``CSVLoader`` unchanged.
  split: str
    One of 'index', 'random' or 'scaffold'.

  Returns
  -------
  tuple
    ``(bace_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  KeyError
    If ``split`` is not a supported name. This is checked before any
    download or featurization is started.
  OSError
    If a download is needed and the ``wget`` executable cannot be started.
  """
  # Function-scope import keeps this block independent of the module header.
  import subprocess

  # Resolve the splitter first so an unsupported name fails fast instead of
  # after the download, featurization and transformation work.
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]

  # Featurize bace dataset
  print("About to featurize bace dataset.")
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    # An argument list (no shell) passes data_dir to wget as one argument
    # even when it contains spaces or shell metacharacters.
    subprocess.call([
        'wget', '-P', data_dir,
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    ])

  bace_tasks = ["pIC50"]
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer is None:
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)

  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  # NOTE(review): the transformer is built from the whole dataset before it
  # is split -- confirm that using test-set rows here is intended.
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  train, valid, test = splitter.train_valid_test_split(dataset)
  return bace_tasks, (train, valid, test), transformers


def load_bace_classification(featurizer=None, split='random'):
  """Load the bace dataset as a single-task classification problem ("Class").

  The CSV is expected at ``$DEEPCHEM_DATA_DIR/bace.csv`` (``/tmp/bace.csv``
  when the variable is unset) and is fetched with ``wget`` when it is not
  already there.

  Parameters
  ----------
  featurizer: str, None, or featurizer object
    'ECFP' selects ``CircularFingerprint(size=1024)``, 'GraphConv' selects
    ``ConvMolFeaturizer()`` and 'Raw' selects ``RawFeaturizer()``. None
    selects ``UserDefinedFeaturizer(bace_user_specified_features)``. Any
    other value is handed to ``CSVLoader`` unchanged.
  split: str
    One of 'index', 'random' or 'scaffold'.

  Returns
  -------
  tuple
    ``(bace_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  KeyError
    If ``split`` is not a supported name. This is checked before any
    download or featurization is started.
  OSError
    If a download is needed and the ``wget`` executable cannot be started.
  """
  # Function-scope import keeps this block independent of the module header.
  import subprocess

  # Resolve the splitter first so an unsupported name fails fast instead of
  # after the download, featurization and transformation work.
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]

  # Featurize bace dataset
  print("About to featurize bace dataset.")
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    # An argument list (no shell) passes data_dir to wget as one argument
    # even when it contains spaces or shell metacharacters.
    subprocess.call([
        'wget', '-P', data_dir,
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    ])

  bace_tasks = ["Class"]
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer is None:
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)

  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  # NOTE(review): the transformer is built from the whole dataset before it
  # is split -- confirm that using test-set rows here is intended.
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  train, valid, test = splitter.train_valid_test_split(dataset)
  return bace_tasks, (train, valid, test), transformers
+221 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading