Commit 72025959 authored by miaecle's avatar miaecle
Browse files

adding new datasets

parent 48701c9a
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -2,20 +2,27 @@ from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
from deepchem.molnet.load_function.chembl_datasets import load_chembl
from deepchem.molnet.load_function.clearance_datasets import load_clearance
from deepchem.molnet.load_function.clintox_datasets import load_clintox
from deepchem.molnet.load_function.delaney_datasets import load_delaney
from deepchem.molnet.load_function.hiv_datasets import load_hiv
from deepchem.molnet.load_function.hopv_datasets import load_hopv
from deepchem.molnet.load_function.kaggle_datasets import load_kaggle
from deepchem.molnet.load_function.lipo_datasets import load_lipo
from deepchem.molnet.load_function.muv_datasets import load_muv
from deepchem.molnet.load_function.nci_datasets import load_nci
from deepchem.molnet.load_function.pcba_datasets import load_pcba
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid
from deepchem.molnet.load_function.ppb_datasets import load_ppb
from deepchem.molnet.load_function.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
from deepchem.molnet.load_function.qm8_datasets import load_qm8
from deepchem.molnet.load_function.qm9_datasets import load_qm9
from deepchem.molnet.load_function.sampl_datasets import load_sampl
from deepchem.molnet.load_function.sider_datasets import load_sider
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast


from deepchem.molnet.run_benchmark import run_benchmark
+113 −0
Original line number Diff line number Diff line
CheckFeaturizer = {
('tox21', 'logreg'):      ['ECFP', 1024],
('tox21', 'tf'):          ['ECFP', 1024],
('tox21', 'tf_robust'):   ['ECFP', 1024],
('tox21', 'rf'):          ['ECFP', 1024],
('tox21', 'irv'):         ['ECFP', 1024],
('tox21', 'graphconv'):   ['GraphConv', 75],
('muv', 'logreg'):        ['ECFP', 1024],
('muv', 'tf'):            ['ECFP', 1024],
('muv', 'tf_robust'):     ['ECFP', 1024],
('muv', 'rf'):            ['ECFP', 1024],
('muv', 'irv'):           ['ECFP', 1024],
('muv', 'graphconv'):     ['GraphConv', 75],
('pcba', 'logreg'):       ['ECFP', 1024],
('pcba', 'tf'):           ['ECFP', 1024],
('pcba', 'tf_robust'):    ['ECFP', 1024],
('pcba', 'rf'):           ['ECFP', 1024],
('pcba', 'irv'):          ['ECFP', 1024],
('pcba', 'graphconv'):    ['GraphConv', 75],
('toxcast', 'logreg'):    ['ECFP', 1024],
('toxcast', 'tf'):        ['ECFP', 1024],
('toxcast', 'tf_robust'): ['ECFP', 1024],
('toxcast', 'rf'):        ['ECFP', 1024],
('toxcast', 'irv'):       ['ECFP', 1024],
('toxcast', 'graphconv'): ['GraphConv', 75],
('sider', 'logreg'):      ['ECFP', 1024],
('sider', 'tf'):          ['ECFP', 1024],
('sider', 'tf_robust'):   ['ECFP', 1024],
('sider', 'rf'):          ['ECFP', 1024],
('sider', 'irv'):         ['ECFP', 1024],
('sider', 'graphconv'):   ['GraphConv', 75],
('clintox', 'logreg'):    ['ECFP', 1024],
('clintox', 'tf'):        ['ECFP', 1024],
('clintox', 'tf_robust'): ['ECFP', 1024],
('clintox', 'rf'):        ['ECFP', 1024],
('clintox', 'irv'):       ['ECFP', 1024],
('clintox', 'graphconv'): ['GraphConv', 75],
('hiv', 'logreg'):        ['ECFP', 1024],
('hiv', 'tf'):            ['ECFP', 1024],
('hiv', 'tf_robust'):     ['ECFP', 1024],
('hiv', 'rf'):            ['ECFP', 1024],
('hiv', 'irv'):           ['ECFP', 1024],
('hiv', 'graphconv'):     ['GraphConv', 75],
('bace_c', 'logreg'):     ['ECFP', 1024],
('bace_c', 'tf'):         ['ECFP', 1024],
('bace_c', 'tf_robust'):  ['ECFP', 1024],
('bace_c', 'rf'):         ['ECFP', 1024],
('bace_c', 'irv'):        ['ECFP', 1024],
('bace_c', 'graphconv'):  ['GraphConv', 75],

('delaney', 'tf_regression'):   ['ECFP', 1024],
('delaney', 'rf_regression'):   ['ECFP', 1024],
('delaney', 'graphconvreg'):    ['GraphConv', 75],
('sampl', 'tf_regression'):     ['ECFP', 1024],
('sampl', 'rf_regression'):     ['ECFP', 1024],
('sampl', 'graphconvreg'):      ['GraphConv', 75],
('nci', 'tf_regression'):       ['ECFP', 1024],
('nci', 'rf_regression'):       ['ECFP', 1024],
('nci', 'graphconvreg'):        ['GraphConv', 75],
('chembl', 'tf_regression'):    ['ECFP', 1024],
('chembl', 'rf_regression'):    ['ECFP', 1024],
('chembl', 'graphconvreg'):     ['GraphConv', 75],
('bace_r', 'tf_regression'):    ['ECFP', 1024],
('bace_r', 'rf_regression'):    ['ECFP', 1024],
('bace_r', 'graphconvreg'):     ['GraphConv', 75],
('clearance', 'tf_regression'): ['ECFP', 1024],
('clearance', 'rf_regression'): ['ECFP', 1024],
('clearance', 'graphconvreg'):  ['GraphConv', 75],
('hopv', 'tf_regression'):      ['ECFP', 1024],
('hopv', 'rf_regression'):      ['ECFP', 1024],
('hopv', 'graphconvreg'):       ['GraphConv', 75],
('lipo', 'tf_regression'):      ['ECFP', 1024],
('lipo', 'rf_regression'):      ['ECFP', 1024],
('lipo', 'graphconvreg'):       ['GraphConv', 75],
('ppb', 'tf_regression'):       ['ECFP', 1024],
('ppb', 'rf_regression'):       ['ECFP', 1024],
('ppb', 'graphconvreg'):        ['GraphConv', 75],

('kaggle', 'tf_regression'):  [None, 14293],
('kaggle', 'rf_regression'):  [None, 14293],
('pdbbind', 'tf_regression'): ['grid', 2052],
('pdbbind', 'rf_regression'): ['grid', 2052],
('qm7', 'tf_regression_ft'):  [None, [23, 23]],
('qm7b', 'tf_regression_ft'): [None, [23, 23]],
('qm8', 'tf_regression_ft'):  [None, [26, 26]],
('qm9', 'tf_regression_ft'):  [None, [29, 29]]
}

CheckSplit = {
'tox21':  ['index', 'random', 'scaffold', 'butina'],
'muv':    ['index', 'random', 'scaffold'], 
'pcba':   ['index', 'random', 'scaffold'],
'sider':  ['index', 'random', 'scaffold'],
'toxcast':['index', 'random', 'scaffold'], 
'clintox':['index', 'random', 'scaffold'], 
'hiv':    ['index', 'random', 'scaffold', 'butina'],
'bace_c': ['index', 'random'],
'delaney':['index', 'random', 'scaffold'],
'sampl':  ['index', 'random', 'scaffold'],
'nci':    ['index', 'random', 'scaffold'],
'chembl': ['index', 'random', 'scaffold', 'year'],
'bace_r': ['index', 'random'],
'clearance': ['index', 'random', 'scaffold'],
'hopv':    ['index', 'random', 'scaffold', 'butina'],
'lipo':    ['index', 'random', 'scaffold'],
'ppb':    ['index', 'random', 'scaffold'],
'kaggle': ['index'], # already splitted, no splitter required
'pdbbind':['index', 'random'],
'qm7':    ['index', 'random', 'stratified'],
'qm7b':   ['index', 'random', 'stratified'],
'qm8':    ['index', 'random', 'stratified'],
'qm9':    ['index', 'random', 'stratified']
}
+106 −0
Original line number Diff line number Diff line
"""
bace dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem
from deepchem.molnet.load_function.bace_features import bace_user_specified_features

def load_bace_regression(featurizer=None, split='random'):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv '
    )

  bace_tasks = ["pIC50"]
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == None:
    featurizer = deepchem.feat.UserDefinedFeaturizer(bace_user_specified_features)
    
  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)
  
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers 
  transformers = [
      deepchem.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)  
  return bace_tasks, (train, valid, test), transformers

def load_bace_classification(featurizer=None, split='random'):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv '
    )

  bace_tasks = ["Class"]
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == None:
    featurizer = deepchem.feat.UserDefinedFeaturizer(bace_user_specified_features)
    
  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)
  
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers 
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)  
  return bace_tasks, (train, valid, test), transformers
 No newline at end of file
+1 −0
Original line number Diff line number Diff line
bace_user_specified_features = ['MW','AlogP','HBA','HBD','RB','HeavyAtomCount','ChiralCenterCount','ChiralCenterCountAllPossible','RingCount','PSA','Estate','MR','Polar','sLi_Key','ssBe_Key','ssssBem_Key','sBH2_Key','ssBH_Key','sssB_Key','ssssBm_Key','sCH3_Key','dCH2_Key','ssCH2_Key','tCH_Key','dsCH_Key','aaCH_Key','sssCH_Key','ddC_Key','tsC_Key','dssC_Key','aasC_Key','aaaC_Key','ssssC_Key','sNH3_Key','sNH2_Key','ssNH2_Key','dNH_Key','ssNH_Key','aaNH_Key','tN_Key','sssNH_Key','dsN_Key','aaN_Key','sssN_Key','ddsN_Key','aasN_Key','ssssN_Key','daaN_Key','sOH_Key','dO_Key','ssO_Key','aaO_Key','aOm_Key','sOm_Key','sF_Key','sSiH3_Key','ssSiH2_Key','sssSiH_Key','ssssSi_Key','sPH2_Key','ssPH_Key','sssP_Key','dsssP_Key','ddsP_Key','sssssP_Key','sSH_Key','dS_Key','ssS_Key','aaS_Key','dssS_Key','ddssS_Key','ssssssS_Key','Sm_Key','sCl_Key','sGeH3_Key','ssGeH2_Key','sssGeH_Key','ssssGe_Key','sAsH2_Key','ssAsH_Key','sssAs_Key','dsssAs_Key','ddsAs_Key','sssssAs_Key','sSeH_Key','dSe_Key','ssSe_Key','aaSe_Key','dssSe_Key','ssssssSe_Key','ddssSe_Key','sBr_Key','sSnH3_Key','ssSnH2_Key','sssSnH_Key','ssssSn_Key','sI_Key','sPbH3_Key','ssPbH2_Key','sssPbH_Key','ssssPb_Key','sLi_Cnt','ssBe_Cnt','ssssBem_Cnt','sBH2_Cnt','ssBH_Cnt','sssB_Cnt','ssssBm_Cnt','sCH3_Cnt','dCH2_Cnt','ssCH2_Cnt','tCH_Cnt','dsCH_Cnt','aaCH_Cnt','sssCH_Cnt','ddC_Cnt','tsC_Cnt','dssC_Cnt','aasC_Cnt','aaaC_Cnt','ssssC_Cnt','sNH3_Cnt','sNH2_Cnt','ssNH2_Cnt','dNH_Cnt','ssNH_Cnt','aaNH_Cnt','tN_Cnt','sssNH_Cnt','dsN_Cnt','aaN_Cnt','sssN_Cnt','ddsN_Cnt','aasN_Cnt','ssssN_Cnt','daaN_Cnt','sOH_Cnt','dO_Cnt','ssO_Cnt','aaO_Cnt','aOm_Cnt','sOm_Cnt','sF_Cnt','sSiH3_Cnt','ssSiH2_Cnt','sssSiH_Cnt','ssssSi_Cnt','sPH2_Cnt','ssPH_Cnt','sssP_Cnt','dsssP_Cnt','ddsP_Cnt','sssssP_Cnt','sSH_Cnt','dS_Cnt','ssS_Cnt','aaS_Cnt','dssS_Cnt','ddssS_Cnt','ssssssS_Cnt','Sm_Cnt','sCl_Cnt','sGeH3_Cnt','ssGeH2_Cnt','sssGeH_Cnt','ssssGe_Cnt','sAsH2_Cnt','ssAsH_Cnt','sssAs_Cnt','dsssAs_Cnt','ddsAs_Cnt','sssssAs_Cnt','sSeH_Cnt','dSe_Cnt','ssSe_Cnt','aaSe_Cnt','dssSe_Cnt','ssssssSe_Cnt','ddssSe_Cnt','sBr_Cnt','sSnH3_Cnt','ssSnH2_Cnt','sssSnH_Cnt','ssssSn_Cnt','sI_Cnt','sPbH3_Cnt','ssPbH2_Cnt','sssPbH_Cnt','ssssPb_Cnt','sLi_Sum','ssBe_Sum','ssssBem_Sum','sBH2_Sum','ssBH_Sum','sssB_Sum','ssssBm_Sum','sCH3_Sum','dCH2_Sum','ssCH2_Sum','tCH_Sum','dsCH_Sum','aaCH_Sum','sssCH_Sum','ddC_Sum','tsC_Sum','dssC_Sum','aasC_Sum','aaaC_Sum','ssssC_Sum','sNH3_Sum','sNH2_Sum','ssNH2_Sum','dNH_Sum','ssNH_Sum','aaNH_Sum','tN_Sum','sssNH_Sum','dsN_Sum','aaN_Sum','sssN_Sum','ddsN_Sum','aasN_Sum','ssssN_Sum','daaN_Sum','sOH_Sum','dO_Sum','ssO_Sum','aaO_Sum','aOm_Sum','sOm_Sum','sF_Sum','sSiH3_Sum','ssSiH2_Sum','sssSiH_Sum','ssssSi_Sum','sPH2_Sum','ssPH_Sum','sssP_Sum','dsssP_Sum','ddsP_Sum','sssssP_Sum','sSH_Sum','dS_Sum','ssS_Sum','aaS_Sum','dssS_Sum','ddssS_Sum','ssssssS_Sum','Sm_Sum','sCl_Sum','sGeH3_Sum','ssGeH2_Sum','sssGeH_Sum','ssssGe_Sum','sAsH2_Sum','ssAsH_Sum','sssAs_Sum','dsssAs_Sum','ddsAs_Sum','sssssAs_Sum','sSeH_Sum','dSe_Sum','ssSe_Sum','aaSe_Sum','dssSe_Sum','ssssssSe_Sum','ddssSe_Sum','sBr_Sum','sSnH3_Sum','ssSnH2_Sum','sssSnH_Sum','ssssSn_Sum','sI_Sum','sPbH3_Sum','ssPbH2_Sum','sssPbH_Sum','ssssPb_Sum','sLi_Avg','ssBe_Avg','ssssBem_Avg','sBH2_Avg','ssBH_Avg','sssB_Avg','ssssBm_Avg','sCH3_Avg','dCH2_Avg','ssCH2_Avg','tCH_Avg','dsCH_Avg','aaCH_Avg','sssCH_Avg','ddC_Avg','tsC_Avg','dssC_Avg','aasC_Avg','aaaC_Avg','ssssC_Avg','sNH3_Avg','sNH2_Avg','ssNH2_Avg','dNH_Avg','ssNH_Avg','aaNH_Avg','tN_Avg','sssNH_Avg','dsN_Avg','aaN_Avg','sssN_Avg','ddsN_Avg','aasN_Avg','ssssN_Avg','daaN_Avg','sOH_Avg','dO_Avg','ssO_Avg','aaO_Avg','aOm_Avg','sOm_Avg','sF_Avg','sSiH3_Avg','ssSiH2_Avg','sssSiH_Avg','ssssSi_Avg','sPH2_Avg','ssPH_Avg','sssP_Avg','dsssP_Avg','ddsP_Avg','sssssP_Avg','sSH_Avg','dS_Avg','ssS_Avg','aaS_Avg','dssS_Avg','ddssS_Avg','ssssssS_Avg','Sm_Avg','sCl_Avg','sGeH3_Avg','ssGeH2_Avg','sssGeH_Avg','ssssGe_Avg','sAsH2_Avg','ssAsH_Avg','sssAs_Avg','dsssAs_Avg','ddsAs_Avg','sssssAs_Avg','sSeH_Avg','dSe_Avg','ssSe_Avg','aaSe_Avg','dssSe_Avg','ssssssSe_Avg','ddssSe_Avg','sBr_Avg','sSnH3_Avg','ssSnH2_Avg','sssSnH_Avg','ssssSn_Avg','sI_Avg','sPbH3_Avg','ssPbH2_Avg','sssPbH_Avg','ssssPb_Avg','First Zagreb (ZM1)','First Zagreb index by valence vertex degrees (ZM1V)','Second Zagreb (ZM2)','Second Zagreb index by valence vertex degrees (ZM2V)','Polarity (Pol)','Narumi Simple Topological (NST)','Narumi Harmonic Topological (NHT)','Narumi Geometric Topological (NGT)','Total structure connectivity (TSC)','Wiener (W)','Mean Wiener (MW)','Xu (Xu)','Quadratic (QIndex)','Radial centric (RC)','Mean Square Distance Balaban (MSDB)','Superpendentic (SP)','Harary (Har)','Log of product of row sums (LPRS)','Pogliani (Pog)','Schultz Molecular Topological (SMT)','Schultz Molecular Topological by valence vertex degrees (SMTV)','Mean Distance Degree Deviation (MDDD)','Ramification (Ram)','Gutman Molecular Topological (GMT)','Gutman MTI by valence vertex degrees (GMTV)','Average vertex distance degree (AVDD)','Unipolarity (UP)','Centralization (CENT)','Variation (VAR)','Molecular electrotopological variation (MEV)','Maximal electrotopological positive variation (MEPV)','Maximal electrotopological negative variation (MENV)','Eccentric connectivity (ECCc)','Eccentricity (ECC)','Average eccentricity (AECC)','Eccentric (DECC)','Valence connectivity index chi-0 (vX0)','Valence connectivity index chi-1 (vX1)','Valence connectivity index chi-2 (vX2)','Valence connectivity index chi-3 (vX3)','Valence connectivity index chi-4 (vX4)','Valence connectivity index chi-5 (vX5)','Average valence connectivity index chi-0 (AvX0)','Average valence connectivity index chi-1 (AvX1)','Average valence connectivity index chi-2 (AvX2)','Average valence connectivity index chi-3 (AvX3)','Average valence connectivity index chi-4 (AvX4)','Average valence connectivity index chi-5 (AvX5)','Quasi Wiener (QW)','First Mohar (FM)','Second Mohar (SM)','Spanning tree number (STN)','Kier benzene-likeliness index (KBLI)','Topological charge index of order 1 (TCI1)','Topological charge index of order 2 (TCI2)','Topological charge index of order 3 (TCI3)','Topological charge index of order 4 (TCI4)','Topological charge index of order 5 (TCI5)','Topological charge index of order 6 (TCI6)','Topological charge index of order 7 (TCI7)','Topological charge index of order 8 (TCI8)','Topological charge index of order 9 (TCI9)','Topological charge index of order 10 (TCI10)','Mean topological charge index of order 1 (MTCI1)','Mean topological charge index of order 2 (MTCI2)','Mean topological charge index of order 3 (MTCI3)','Mean topological charge index of order 4 (MTCI4)','Mean topological charge index of order 5 (MTCI5)','Mean topological charge index of order 6 (MTCI6)','Mean topological charge index of order 7 (MTCI7)','Mean topological charge index of order 8 (MTCI8)','Mean topological charge index of order 9 (MTCI9)','Mean topological charge index of order 10 (MTCI10)','Global topological charge (GTC)','Hyper-distance-path index (HDPI)','Reciprocal hyper-distance-path index (RHDPI)','Square reciprocal distance sum (SRDS)','Modified Randic connectivity (MRC)','Balaban centric (BC)','Lopping centric (LC)','Kier Hall electronegativity (KHE)','Sum of topological distances between N..N (STD(N N))','Sum of topological distances between N..O (STD(N O))','Sum of topological distances between N..S (STD(N S))','Sum of topological distances between N..P (STD(N P))','Sum of topological distances between N..F (STD(N F))','Sum of topological distances between N..Cl (STD(N Cl))','Sum of topological distances between N..Br (STD(N Br))','Sum of topological distances between N..I (STD(N I))','Sum of topological distances between O..O (STD(O O))','Sum of topological distances between O..S (STD(O S))','Sum of topological distances between O..P (STD(O P))','Sum of topological distances between O..F (STD(O F))','Sum of topological distances between O..Cl (STD(O Cl))','Sum of topological distances between O..Br (STD(O Br))','Sum of topological distances between O..I (STD(O I))','Sum of topological distances between S..S (STD(S S))','Sum of topological distances between S..P (STD(S P))','Sum of topological distances between S..F (STD(S F))','Sum of topological distances between S..Cl (STD(S Cl))','Sum of topological distances between S..Br (STD(S Br))','Sum of topological distances between S..I (STD(S I))','Sum of topological distances between P..P (STD(P P))','Sum of topological distances between P..F (STD(P F))','Sum of topological distances between P..Cl (STD(P Cl))','Sum of topological distances between P..Br (STD(P Br))','Sum of topological distances between P..I (STD(P I))','Sum of topological distances between F..F (STD(F F))','Sum of topological distances between F..Cl (STD(F Cl))','Sum of topological distances between F..Br (STD(F Br))','Sum of topological distances between F..I (STD(F I))','Sum of topological distances between Cl..Cl (STD(Cl Cl))','Sum of topological distances between Cl..Br (STD(Cl Br))','Sum of topological distances between Cl..I (STD(Cl I))','Sum of topological distances between Br..Br (STD(Br Br))','Sum of topological distances between Br..I (STD(Br I))','Sum of topological distances between I..I (STD(I I))','Wiener-type index from Z weighted distance matrix - Barysz matrix (WhetZ)','Wiener-type index from electronegativity weighted distance matrix (Whete)','Wiener-type index from mass weighted distance matrix (Whetm)','Wiener-type index from van der waals weighted distance matrix (Whetv)','Wiener-type index from polarizability weighted distance matrix (Whetp)','Balaban-type index from Z weighted distance matrix - Barysz matrix (JhetZ)','Balaban-type index from electronegativity weighted distance matrix (Jhete)','Balaban-type index from mass weighted distance matrix (Jhetm)','Balaban-type index from van der waals weighted distance matrix (Jhetv)','Balaban-type index from polarizability weighted distance matrix (Jhetp)','Topological diameter (TD)','Topological radius (TR)','Petitjean 2D shape (PJ2DS)','Balaban distance connectivity index (J)','Solvation connectivity index chi-0 (SCIX0)','Solvation connectivity index chi-1 (SCIX1)','Solvation connectivity index chi-2 (SCIX2)','Solvation connectivity index chi-3 (SCIX3)','Solvation connectivity index chi-4 (SCIX4)','Solvation connectivity index chi-5 (SCIX5)','Connectivity index chi-0 (CIX0)','Connectivity chi-1 [Randic connectivity] (CIX1)','Connectivity index chi-2 (CIX2)','Connectivity index chi-3 (CIX3)','Connectivity index chi-4 (CIX4)','Connectivity index chi-5 (CIX5)','Average connectivity index chi-0 (ACIX0)','Average connectivity index chi-1 (ACIX1)','Average connectivity index chi-2 (ACIX2)','Average connectivity index chi-3 (ACIX3)','Average connectivity index chi-4 (ACIX4)','Average connectivity index chi-5 (ACIX5)','reciprocal distance Randic-type index (RDR)','reciprocal distance square Randic-type index (RDSR)','1-path Kier alpha-modified shape index (KAMS1)','2-path Kier alpha-modified shape index (KAMS2)','3-path Kier alpha-modified shape index (KAMS3)','Kier flexibility (KF)','path/walk 2 - Randic shape index (RSIpw2)','path/walk 3 - Randic shape index (RSIpw3)','path/walk 4 - Randic shape index (RSIpw4)','path/walk 5 - Randic shape index (RSIpw5)','E-state topological parameter (ETP)','Ring Count 3 (RNGCNT3)','Ring Count 4 (RNGCNT4)','Ring Count 5 (RNGCNT5)','Ring Count 6 (RNGCNT6)','Ring Count 7 (RNGCNT7)','Ring Count 8 (RNGCNT8)','Ring Count 9 (RNGCNT9)','Ring Count 10 (RNGCNT10)','Ring Count 11 (RNGCNT11)','Ring Count 12 (RNGCNT12)','Ring Count 13 (RNGCNT13)','Ring Count 14 (RNGCNT14)','Ring Count 15 (RNGCNT15)','Ring Count 16 (RNGCNT16)','Ring Count 17 (RNGCNT17)','Ring Count 18 (RNGCNT18)','Ring Count 19 (RNGCNT19)','Ring Count 20 (RNGCNT20)','Atom Count (ATMCNT)','Bond Count (BNDCNT)','Atoms in Ring System (ATMRNGCNT)','Bonds in Ring System (BNDRNGCNT)','Cyclomatic number (CYCLONUM)','Number of ring systems (NRS)','Normalized number of ring systems (NNRS)','Ring Fusion degree (RFD)','Ring perimeter (RNGPERM)','Ring bridge count (RNGBDGE)','Molecule cyclized degree (MCD)','Ring Fusion density (RFDELTA)','Ring complexity index (RCI)','Van der Waals surface area (VSA)','MR1 (MR1)','MR2 (MR2)','MR3 (MR3)','MR4 (MR4)','MR5 (MR5)','MR6 (MR6)','MR7 (MR7)','MR8 (MR8)','ALOGP1 (ALOGP1)','ALOGP2 (ALOGP2)','ALOGP3 (ALOGP3)','ALOGP4 (ALOGP4)','ALOGP5 (ALOGP5)','ALOGP6 (ALOGP6)','ALOGP7 (ALOGP7)','ALOGP8 (ALOGP8)','ALOGP9 (ALOGP9)','ALOGP10 (ALOGP10)','PEOE1 (PEOE1)','PEOE2 (PEOE2)','PEOE3 (PEOE3)','PEOE4 (PEOE4)','PEOE5 (PEOE5)','PEOE6 (PEOE6)','PEOE7 (PEOE7)','PEOE8 (PEOE8)','PEOE9 (PEOE9)','PEOE10 (PEOE10)','PEOE11 (PEOE11)','PEOE12 (PEOE12)','PEOE13 (PEOE13)','PEOE14 (PEOE14)']
+58 −0
Original line number Diff line number Diff line
"""
clearance dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem


def load_clearance(featurizer='ECFP', split='random'):
  """Load clearance datasets."""
  # Featurize clearance dataset
  print("About to featurize clearance dataset.")
  print("About to load clearance dataset.")
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"

  dataset_file = os.path.join(data_dir, "clearance.csv")
  if not os.path.exists(dataset_file):
    os.system(
        'wget -P ' + data_dir +
        ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clearance.csv'
    )

  clearance_tasks = ['exp']
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

  loader = deepchem.data.CSVLoader(
      tasks=clearance_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers 
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return clearance_tasks, (train, valid, test), transformers
Loading