Commit b6580cf7 authored by ZHENQIN WU's avatar ZHENQIN WU
Browse files

qm7 add in

parent 7f6cf0e1
Loading
Loading
Loading
Loading
+56 −22
Original line number Diff line number Diff line
@@ -18,8 +18,8 @@ Giving regression performances of:
    MultitaskDNN(tf_regression),
    Random forest(rf_regression),
    Graph convolution regression(graphconvreg)
on datasets: delaney(ESOL), nci, kaggle, pdbbind, qm7, 
             chembl, sampl(FreeSolv)
on datasets: delaney(ESOL), nci, kaggle, pdbbind, 
             qm7, qm7b, chembl, sampl(FreeSolv)

time estimation listed in README file

@@ -51,7 +51,7 @@ from kaggle.kaggle_datasets import load_kaggle
from delaney.delaney_datasets import load_delaney
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from qm7.qm7_datasets import load_qm7
from qm7.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox

@@ -73,7 +73,7 @@ def benchmark_loading_datasets(hyper_parameters,
  dataset: string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci, clintox, pdbbind, chembl,
      qm7, sampl
      qm7, qm7b, sampl
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
      irv, graphconv, tf_regression, rf_regression, graphconvreg
@@ -86,7 +86,7 @@ def benchmark_loading_datasets(hyper_parameters,
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
    mode = 'classification'
  elif dataset in [
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'sampl'
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'sampl'
  ]:
    mode = 'regression'
  else:
@@ -121,9 +121,19 @@ def benchmark_loading_datasets(hyper_parameters,
    if not model in ['tf_regression', 'rf_regression']:
      return

  if dataset in ['qm7']:
    featurizer = None  # qm7 is already featurized
    if split in ['scaffold', 'butina']:
      return  # qm7 accept index and random splitter
    if not model in ['tf_regression']:
      return

  if split in ['year']:
    if not dataset in ['chembl']:
      return
  if split in ['indice', 'stratified']:
    if not dataset in ['qm7', 'qm7b']:
      return
  elif not split in [None, 'index', 'random', 'scaffold', 'butina']:
    raise ValueError('Splitter function not supported')

@@ -138,7 +148,8 @@ def benchmark_loading_datasets(hyper_parameters,
      'delaney': load_delaney,
      'pdbbind': load_pdbbind_grid,
      'chembl': load_chembl,
      'qm7': load_qm7,
      'qm7': load_qm7_from_mat,
      'qm7b': load_qm7b_from_mat,
      'sampl': load_sampl,
      'clintox': load_clintox
  }
@@ -157,10 +168,16 @@ def benchmark_loading_datasets(hyper_parameters,
        featurizer=featurizer)

  train_dataset, valid_dataset, test_dataset = all_dataset
  fit_transformers = []
  if dataset in ['qm7', 'qm7b']:
    fit_transformers = [dc.trans.CoulombFitTransformer(train_dataset)]

  time_finish_loading = time.time()
  # time_finish_loading-time_start is the time(s) used for dataset loading
  if dataset in ['kaggle', 'pdbbind', 'qm7']:
  if dataset in ['kaggle', 'pdbbind']:
    n_features = train_dataset.get_data_shape()[0]
  elif dataset in ['qm7', 'qm7b']:
    n_features = list(train_dataset.get_data_shape())
    # dataset has customized features

  # running model
@@ -191,6 +208,7 @@ def benchmark_loading_datasets(hyper_parameters,
          n_features,
          metric=metric,
          model=model,
          fit_transformers=fit_transformers,
          test=test)
    time_finish_fitting = time.time()

@@ -551,6 +569,7 @@ def benchmark_regression(train_dataset,
                         n_features,
                         metric='r2',
                         model='tf_regression',
                         fit_transformers=[],
                         seed=123,
                         test=False):
  """
@@ -610,8 +629,23 @@ def benchmark_regression(train_dataset,
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']

    if len(fit_transformers) > 0:
      model_tf_regression = dc.models.TensorflowMultiTaskFitTransformRegressor(
          n_tasks=len(tasks),
          n_features=n_features,
          layer_sizes=layer_sizes,
          weight_init_stddevs=weight_init_stddevs,
          bias_init_consts=bias_init_consts,
          dropouts=dropouts,
          penalty=penalty,
          penalty_type=penalty_type,
          batch_size=batch_size,
          learning_rate=learning_rate,
          fit_transformers=fit_transformers,
          n_eval=10,
          seed=seed)
    # Building tensorflow MultiTaskDNN model
    else:
      model_tf_regression = dc.models.TensorflowMultiTaskRegressor(
          len(tasks),
          n_features,
@@ -752,7 +786,7 @@ if __name__ == '__main__':
      dest='dataset_args',
      default=[],
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, clintox')
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, clintox')
  parser.add_argument(
      '-t',
      action='store_true',
@@ -778,7 +812,7 @@ if __name__ == '__main__':
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7'
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
    ]

  #input hyperparameters
+13 −7
Original line number Diff line number Diff line
@@ -13,9 +13,12 @@ import scipy.io
import csv

def load_qm7_from_mat(featurizer=None, split='stratified'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./qm7.mat")
  
  if not os.path.exists('qm7.mat'): os.system('wget http://www.quantum-machine.org/data/qm7.mat')
  dataset = scipy.io.loadmat('qm7.mat')
  if not os.path.exists(dataset_file): os.system('wget -P '+current_dir+' http://www.quantum-machine.org/data/qm7.mat')
  dataset = scipy.io.loadmat(dataset_file)
  
  X = dataset['X']
  y = dataset['T']
@@ -51,13 +54,16 @@ def load_qm7_from_mat(featurizer=None, split='stratified'):
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  qm7_tasks = np.arange(y.shape[1])
  qm7_tasks = np.arange(y.shape[0])
  return qm7_tasks, (train_dataset, valid_dataset, test_dataset), transformers

def load_qm7b_from_mat(featurizer=None, split='stratified'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./qm7b.mat")
  
  if not os.path.exists('qm7b.mat'): os.system('wget http://www.quantum-machine.org/data/qm7b.mat')
  dataset = scipy.io.loadmat('qm7b.mat')
  if not os.path.exists(dataset_file): os.system('wget -P '+current_dir+' http://www.quantum-machine.org/data/qm7b.mat')
  dataset = scipy.io.loadmat(dataset_file)
 
  X = dataset['X']
  y = dataset['T']