Commit 8545a632 authored by miaecle's avatar miaecle
Browse files

refining module

parent 72025959
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@ from __future__ import division
from __future__ import unicode_literals

from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
from deepchem.molnet.load_function.chembl_datasets import load_chembl
from deepchem.molnet.load_function.clearance_datasets import load_clearance
from deepchem.molnet.load_function.clintox_datasets import load_clintox
@@ -24,5 +25,4 @@ from deepchem.molnet.load_function.sider_datasets import load_sider
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast


from deepchem.molnet.run_benchmark import run_benchmark
+63 −56
Original line number Diff line number Diff line
CheckFeaturizer = {
('tox21', 'logreg'):      ['ECFP', 1024],
('tox21', 'tf'):          ['ECFP', 1024],
('tox21', 'tf_robust'):   ['ECFP', 1024],
('tox21', 'rf'):          ['ECFP', 1024],
('tox21', 'irv'):         ['ECFP', 1024],
('tox21', 'graphconv'):   ['GraphConv', 75],
('bace_c', 'logreg'):     ['ECFP', 1024],
('bace_c', 'tf'):         ['ECFP', 1024],
('bace_c', 'tf_robust'):  ['ECFP', 1024],
('bace_c', 'rf'):         ['ECFP', 1024],
('bace_c', 'irv'):        ['ECFP', 1024],
('bace_c', 'graphconv'):  ['GraphConv', 75],
('bbbp', 'logreg'):       ['ECFP', 1024],
('bbbp', 'tf'):           ['ECFP', 1024],
('bbbp', 'tf_robust'):    ['ECFP', 1024],
('bbbp', 'rf'):           ['ECFP', 1024],
('bbbp', 'irv'):          ['ECFP', 1024],
('bbbp', 'graphconv'):    ['GraphConv', 75],
('clintox', 'logreg'):    ['ECFP', 1024],
('clintox', 'tf'):        ['ECFP', 1024],
('clintox', 'tf_robust'): ['ECFP', 1024],
('clintox', 'rf'):        ['ECFP', 1024],
('clintox', 'irv'):       ['ECFP', 1024],
('clintox', 'graphconv'): ['GraphConv', 75],
('hiv', 'logreg'):        ['ECFP', 1024],
('hiv', 'tf'):            ['ECFP', 1024],
('hiv', 'tf_robust'):     ['ECFP', 1024],
('hiv', 'rf'):            ['ECFP', 1024],
('hiv', 'irv'):           ['ECFP', 1024],
('hiv', 'graphconv'):     ['GraphConv', 75],
('muv', 'logreg'):        ['ECFP', 1024],
('muv', 'tf'):            ['ECFP', 1024],
('muv', 'tf_robust'):     ['ECFP', 1024],
@@ -17,64 +35,52 @@ CheckFeaturizer = {
('pcba', 'rf'):           ['ECFP', 1024],
('pcba', 'irv'):          ['ECFP', 1024],
('pcba', 'graphconv'):    ['GraphConv', 75],
('toxcast', 'logreg'):    ['ECFP', 1024],
('toxcast', 'tf'):        ['ECFP', 1024],
('toxcast', 'tf_robust'): ['ECFP', 1024],
('toxcast', 'rf'):        ['ECFP', 1024],
('toxcast', 'irv'):       ['ECFP', 1024],
('toxcast', 'graphconv'): ['GraphConv', 75],
('sider', 'logreg'):      ['ECFP', 1024],
('sider', 'tf'):          ['ECFP', 1024],
('sider', 'tf_robust'):   ['ECFP', 1024],
('sider', 'rf'):          ['ECFP', 1024],
('sider', 'irv'):         ['ECFP', 1024],
('sider', 'graphconv'):   ['GraphConv', 75],
('clintox', 'logreg'):    ['ECFP', 1024],
('clintox', 'tf'):        ['ECFP', 1024],
('clintox', 'tf_robust'): ['ECFP', 1024],
('clintox', 'rf'):        ['ECFP', 1024],
('clintox', 'irv'):       ['ECFP', 1024],
('clintox', 'graphconv'): ['GraphConv', 75],
('hiv', 'logreg'):        ['ECFP', 1024],
('hiv', 'tf'):            ['ECFP', 1024],
('hiv', 'tf_robust'):     ['ECFP', 1024],
('hiv', 'rf'):            ['ECFP', 1024],
('hiv', 'irv'):           ['ECFP', 1024],
('hiv', 'graphconv'):     ['GraphConv', 75],
('bace_c', 'logreg'):     ['ECFP', 1024],
('bace_c', 'tf'):         ['ECFP', 1024],
('bace_c', 'tf_robust'):  ['ECFP', 1024],
('bace_c', 'rf'):         ['ECFP', 1024],
('bace_c', 'irv'):        ['ECFP', 1024],
('bace_c', 'graphconv'):  ['GraphConv', 75],
('tox21', 'logreg'):      ['ECFP', 1024],
('tox21', 'tf'):          ['ECFP', 1024],
('tox21', 'tf_robust'):   ['ECFP', 1024],
('tox21', 'rf'):          ['ECFP', 1024],
('tox21', 'irv'):         ['ECFP', 1024],
('tox21', 'graphconv'):   ['GraphConv', 75],
('toxcast', 'logreg'):    ['ECFP', 1024],
('toxcast', 'tf'):        ['ECFP', 1024],
('toxcast', 'tf_robust'): ['ECFP', 1024],
('toxcast', 'rf'):        ['ECFP', 1024],
('toxcast', 'irv'):       ['ECFP', 1024],
('toxcast', 'graphconv'): ['GraphConv', 75],

('delaney', 'tf_regression'):   ['ECFP', 1024],
('delaney', 'rf_regression'):   ['ECFP', 1024],
('delaney', 'graphconvreg'):    ['GraphConv', 75],
('sampl', 'tf_regression'):     ['ECFP', 1024],
('sampl', 'rf_regression'):     ['ECFP', 1024],
('sampl', 'graphconvreg'):      ['GraphConv', 75],
('nci', 'tf_regression'):       ['ECFP', 1024],
('nci', 'rf_regression'):       ['ECFP', 1024],
('nci', 'graphconvreg'):        ['GraphConv', 75],
('chembl', 'tf_regression'):    ['ECFP', 1024],
('chembl', 'rf_regression'):    ['ECFP', 1024],
('chembl', 'graphconvreg'):     ['GraphConv', 75],
('bace_r', 'tf_regression'):    ['ECFP', 1024],
('bace_r', 'rf_regression'):    ['ECFP', 1024],
('bace_r', 'graphconvreg'):     ['GraphConv', 75],
('chembl', 'tf_regression'):    ['ECFP', 1024],
('chembl', 'rf_regression'):    ['ECFP', 1024],
('chembl', 'graphconvreg'):     ['GraphConv', 75],
('clearance', 'tf_regression'): ['ECFP', 1024],
('clearance', 'rf_regression'): ['ECFP', 1024],
('clearance', 'graphconvreg'):  ['GraphConv', 75],
('delaney', 'tf_regression'):   ['ECFP', 1024],
('delaney', 'rf_regression'):   ['ECFP', 1024],
('delaney', 'graphconvreg'):    ['GraphConv', 75],
('hopv', 'tf_regression'):      ['ECFP', 1024],
('hopv', 'rf_regression'):      ['ECFP', 1024],
('hopv', 'graphconvreg'):       ['GraphConv', 75],
('lipo', 'tf_regression'):      ['ECFP', 1024],
('lipo', 'rf_regression'):      ['ECFP', 1024],
('lipo', 'graphconvreg'):       ['GraphConv', 75],
('nci', 'tf_regression'):       ['ECFP', 1024],
('nci', 'rf_regression'):       ['ECFP', 1024],
('nci', 'graphconvreg'):        ['GraphConv', 75],
('ppb', 'tf_regression'):       ['ECFP', 1024],
('ppb', 'rf_regression'):       ['ECFP', 1024],
('ppb', 'graphconvreg'):        ['GraphConv', 75],
('sampl', 'tf_regression'):     ['ECFP', 1024],
('sampl', 'rf_regression'):     ['ECFP', 1024],
('sampl', 'graphconvreg'):      ['GraphConv', 75],

('kaggle', 'tf_regression'):  [None, 14293],
('kaggle', 'rf_regression'):  [None, 14293],
@@ -87,27 +93,28 @@ CheckFeaturizer = {
}

CheckSplit = {
'tox21':  ['index', 'random', 'scaffold', 'butina'],
'muv':    ['index', 'random', 'scaffold'], 
'pcba':   ['index', 'random', 'scaffold'],
'sider':  ['index', 'random', 'scaffold'],
'toxcast':['index', 'random', 'scaffold'], 
'clintox':['index', 'random', 'scaffold'], 
'hiv':    ['index', 'random', 'scaffold', 'butina'],
'bace_c': ['index', 'random'],
'delaney':['index', 'random', 'scaffold'],
'sampl':  ['index', 'random', 'scaffold'],
'nci':    ['index', 'random', 'scaffold'],
'chembl': ['index', 'random', 'scaffold', 'year'],
'bace_r': ['index', 'random'],
'bbbp':   ['index', 'random', 'scaffold'],
'chembl': ['index', 'random', 'scaffold', 'year'],
'clearance': ['index', 'random', 'scaffold'],
'clintox':['index', 'random', 'scaffold'], 
'delaney':['index', 'random', 'scaffold'],
'hiv':    ['index', 'random', 'scaffold', 'butina'],
'hopv':    ['index', 'random', 'scaffold', 'butina'],
'lipo':    ['index', 'random', 'scaffold'],
'ppb':    ['index', 'random', 'scaffold'],
'kaggle': ['index'], # already splitted, no splitter required
'lipo':    ['index', 'random', 'scaffold'],
'muv':    ['index', 'random', 'scaffold'], 
'nci':    ['index', 'random', 'scaffold'],
'pcba':   ['index', 'random', 'scaffold'],
'pdbbind':['index', 'random'],
'ppb':    ['index', 'random', 'scaffold'],
'qm7':    ['index', 'random', 'stratified'],
'qm7b':   ['index', 'random', 'stratified'],
'qm8':    ['index', 'random', 'stratified'],
'qm9':    ['index', 'random', 'stratified']
'qm9':    ['index', 'random', 'stratified'],
'sampl':  ['index', 'random', 'scaffold'],
'sider':  ['index', 'random', 'scaffold'],
'tox21':  ['index', 'random', 'scaffold', 'butina'],
'toxcast':['index', 'random', 'scaffold']
}
+55 −0
Original line number Diff line number Diff line
"""
Blood-Brain Barrier Penetration dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem


def load_bbbp(featurizer='ECFP', split='index'):
  """Load the blood-brain barrier penetration (BBBP) dataset.

  Downloads BBBP.csv into the data directory when it is not already there,
  featurizes it, applies a balancing transformer and splits the result.

  Parameters
  ----------
  featurizer: string or featurizer object
      'ECFP', 'GraphConv' or 'Raw' select a built-in deepchem featurizer;
      any other value is handed to the CSV loader unchanged.
  split: string
      which splitter to use: 'index', 'random' or 'scaffold'.

  Returns
  -------
  bbbp_tasks: list of string
      names of the tasks in the dataset (a single task, "p_np").
  (train, valid, test): tuple
      the three datasets returned by the splitter.
  transformers: list
      transformers that were applied to the dataset before splitting.

  Raises
  ------
  KeyError
      if `split` is not one of the supported splitter names.
  subprocess.CalledProcessError
      if the wget download exits with a non-zero status.
  """
  # Imported locally so the block does not depend on a file-level import.
  import subprocess

  # Resolve the splitter first: an unsupported name should fail immediately
  # instead of after the download and featurization have already run.
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]

  # Featurize bbb dataset
  print("About to featurize bbbp dataset.")
  data_dir = os.environ.get("DEEPCHEM_DATA_DIR", "/tmp")

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    # Pass an argument list rather than a shell string: data_dir comes from
    # the environment and may contain spaces or shell metacharacters.
    # check_call surfaces a failed download here instead of as a confusing
    # missing-file error during featurization.
    subprocess.check_call([
        'wget', '-P', data_dir,
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'
    ])

  bbbp_tasks = ["p_np"]
  # Map the well-known featurizer names to deepchem objects; anything else
  # (e.g. an already constructed featurizer) is used as given.
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

  loader = deepchem.data.CSVLoader(
      tasks=bbbp_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  train, valid, test = splitter.train_valid_test_split(dataset)
  return bbbp_tasks, (train, valid, test), transformers
+41 −35
Original line number Diff line number Diff line
@@ -24,16 +24,18 @@ def run_benchmark(datasets,
                  featurizer=None,
                  n_features=0,
                  out_path='.',
                  test=False):
                  hyper_parameters=None,
                  test=False,
                  seed=123):
  """
  Run benchmark test on designated datasets with deepchem(or user-defined) model
  
  Parameters
  ----------
  datasets: list of string
      choice of which datasets to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci, clintox, hiv, pdbbind, chembl,
      qm7, qm7b, qm9, sampl
      choice of which datasets to use, should be: bace_c, bace_r, bbbp, chembl,
      clearance, clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba, 
      pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast 
  model: string or user-defined model structure
      choice of which model to use, deepchem provides implementation of
      logistic regression, random forest, multitask network, 
@@ -56,25 +58,26 @@ def run_benchmark(datasets,
  """
  for dataset in datasets:
    if dataset in [
        'muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox', 'hiv', 'bace_c'
        'bace_c', 'bbbp', 'clintox', 'hiv', 'muv', 'pcba', 'sider', 
        'tox21',  'toxcast'
    ]:
      mode = 'classification'
      if metric == None:
        metric = [
            deepchem.metrics.Metric(deepchem.metrics.roc_auc_score, np.mean)
        ]
        metric = 'auc'
    elif dataset in [
        'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'qm8', 'qm9',
        'sampl', 'bace_r', 'clearance', 'hopv', 'lipo', 'ppb'
        'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo', 
        'nci', 'pdbbind', 'ppb',  'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
    ]:
      mode = 'regression'
      if metric == None:
        metric = [
            deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score, np.mean)
        ]
        metric = 'r2'
    else:
      raise ValueError('Dataset not supported')

    metric_all = {'auc': deepchem.metrics.Metric(deepchem.metrics.roc_auc_score, np.mean),
                  'r2': deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score, np.mean)}
    metric = [metric_all[metric]]

    if featurizer == None and isinstance(model, str):
      # Assigning featurizer if not user defined
      pair = (dataset, model)
@@ -88,29 +91,30 @@ def run_benchmark(datasets,
      continue

    loading_functions = {
        'tox21': deepchem.molnet.load_tox21,
        'bace_c': deepchem.molnet.load_bace_classification,
        'bace_r': deepchem.molnet.load_bace_regression,
        'bbbp': deepchem.molnet.load_bbbp,
        'chembl': deepchem.molnet.load_chembl,
        'clearance': deepchem.molnet.load_clearance,
        'clintox': deepchem.molnet.load_clintox,
        'delaney': deepchem.molnet.load_delaney,
        'hiv': deepchem.molnet.load_hiv,
        'hopv': deepchem.molnet.load_hopv,
        'kaggle': deepchem.molnet.load_kaggle,
        'lipo': deepchem.molnet.load_lipo,
        'muv': deepchem.molnet.load_muv,
        'pcba': deepchem.molnet.load_pcba,
        'nci': deepchem.molnet.load_nci,
        'sider': deepchem.molnet.load_sider,
        'toxcast': deepchem.molnet.load_toxcast,
        'kaggle': deepchem.molnet.load_kaggle,
        'delaney': deepchem.molnet.load_delaney,
        'pcba': deepchem.molnet.load_pcba,
        'pdbbind': deepchem.molnet.load_pdbbind_grid,
        'chembl': deepchem.molnet.load_chembl,
        'ppb': deepchem.molnet.load_ppb,
        'qm7': deepchem.molnet.load_qm7_from_mat,
        'qm7b': deepchem.molnet.load_qm7b_from_mat,
        'qm8': deepchem.molnet.load_qm8,
        'qm9': deepchem.molnet.load_qm9,
        'sampl': deepchem.molnet.load_sampl,
        'clintox': deepchem.molnet.load_clintox,
        'hiv': deepchem.molnet.load_hiv,
        'bace_c': deepchem.molnet.load_bace_classification,
        'bace_r': deepchem.molnet.load_bace_regression,
        'clearance': deepchem.molnet.load_clearance,
        'hopv': deepchem.molnet.load_hopv,
        'lipo': deepchem.molnet.load_lipo,
        'ppb': deepchem.molnet.load_ppb
        'sider': deepchem.molnet.load_sider,
        'tox21': deepchem.molnet.load_tox21,
        'toxcast': deepchem.molnet.load_toxcast
    }

    print('-------------------------------------')
@@ -126,9 +130,7 @@ def run_benchmark(datasets,
          featurizer=featurizer)

    train_dataset, valid_dataset, test_dataset = all_dataset
    if dataset in ['pdbbind']:
      n_features = train_dataset.get_data_shape()[0]
      print(n_features)

    time_start_fitting = time.time()
    train_score = {}
    valid_score = {}
@@ -144,8 +146,10 @@ def run_benchmark(datasets,
            transformers,
            n_features,
            metric,
            model=model,
            test=test)
            model,
            test=test,
            hyper_parameters=hyper_parameters,
            seed=seed)
      elif mode == 'regression':
        train_score, valid_score, test_score = benchmark_regression(
            train_dataset,
@@ -155,8 +159,10 @@ def run_benchmark(datasets,
            transformers,
            n_features,
            metric,
            model=model,
            test=test)
            model,
            test=test,
            hyper_parameters=hyper_parameters,
            seed=seed)
    else:
      model.fit(train_dataset)
      train_score['user_defined'] = model.evaluate(train_dataset, metric,
+61 −900

File changed.

Preview size limit exceeded, changes collapsed.

Loading