Commit 6693d1a7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #880 from miaecle/temp3

MoleculeNet update
parents a0a586fd 364340a6
Loading
Loading
Loading
Loading
+95 −23
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ from __future__ import unicode_literals

import numpy as np
import tempfile
import os
from deepchem.hyper.grid_search import HyperparamOpt
from deepchem.utils.evaluate import Evaluator
from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression
@@ -17,21 +18,26 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
  Gaussian Process Global Optimization(GPGO)
  """

  def hyperparam_search(self,
  def hyperparam_search(
      self,
      params_dict,
      train_dataset,
      valid_dataset,
      output_transformers,
      metric,
      direction=True,
      n_features=1024,
      n_tasks=1,
      max_iter=20,
      search_range=4,
      hp_invalid_list=[
                            'seed', 'nb_epoch', 'penalty_type', 'dropouts',
                            'bypass_dropouts', 'n_pair_feat'
          'seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts',
          'n_pair_feat', 'fit_transformers', 'min_child_weight',
          'max_delta_step', 'subsample', 'colsample_bylevel',
          'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight',
          'base_score'
      ],
                        logdir=None):
      log_file='GPhypersearch.log'):
    """Perform hyperparams search using a gaussian process assumption

    params_dict include single-valued parameters being optimized,
@@ -55,6 +61,8 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      transformers for evaluation
    metric: list of dc.metrics.Metric
      metric used for evaluation
    direction: bool
      maximization(True) or minimization(False)
    n_features: int
      number of input features
    n_tasks: int
@@ -66,6 +74,8 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
                       initial values * search_range]
    hp_invalid_list: list
      names of parameters that should not be optimized
    log_file: string
      name of the log file; hyperparameters and results for each trial will be recorded in it

    Returns
    -------
@@ -95,7 +105,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
                        if hp_list_class[i] is list]

    # Number of parameters
    n_param = len(hp_list_single + sum([hp[1] for hp in hp_list_multiple]))
    n_param = len(hp_list_single)
    if len(hp_list_multiple) > 0:
      n_param = n_param + sum([hp[1] for hp in hp_list_multiple])
    # Range of optimization
    param_range = []
    for hp in hp_list_single:
@@ -125,6 +137,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    param_name = ['l' + format(i, '02d') for i in range(20)]
    param = dict(zip(param_name[:n_param], param_range))

    data_dir = os.environ['DEEPCHEM_DATA_DIR']
    log_file = os.path.join(data_dir, log_file)

    def f(l00=0,
          l01=0,
          l02=0,
@@ -177,6 +192,10 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):

      print(hyper_parameters)
      # Run benchmark
      with open(log_file, 'a') as f:
        # Record hyperparameters
        f.write(str(hyper_parameters))
        f.write('\n')
      if isinstance(self.model_class, str) or isinstance(
          self.model_class, unicode):
        try:
@@ -199,7 +218,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
              metric,
              self.model_class,
              hyper_parameters=hyper_parameters)
        return valid_scores[self.model_class][metric[0].name]
        score = valid_scores[self.model_class][metric[0].name]
      else:
        model_dir = tempfile.mkdtemp()
        model = self.model_class(hyper_parameters, model_dir)
@@ -207,17 +226,31 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        model.save()
        evaluator = Evaluator(model, valid_dataset, output_transformers)
        multitask_scores = evaluator.compute_model_performance([metric])
        return multitask_scores[metric.name]
        score = multitask_scores[metric.name]

      with open(log_file, 'a') as f:
        # Record performances
        f.write(str(score))
        f.write('\n')
      # GPGO maximizes performance by default; negate the score when minimizing
      if direction:
        return score
      else:
        return -score

    import pyGPGO
    cov = pyGPGO.covfunc.matern32()
    gp = pyGPGO.surrogates.GaussianProcess.GaussianProcess(cov)
    acq = pyGPGO.acquisition.Acquisition(mode='ExpectedImprovement')
    gpgo = pyGPGO.GPGO.GPGO(gp, acq, f, param)
    from pyGPGO.covfunc import matern32
    from pyGPGO.acquisition import Acquisition
    from pyGPGO.surrogates.GaussianProcess import GaussianProcess
    from pyGPGO.GPGO import GPGO
    cov = matern32()
    gp = GaussianProcess(cov)
    acq = Acquisition(mode='ExpectedImprovement')
    gpgo = GPGO(gp, acq, f, param)
    print("Max number of iteration: %i" % max_iter)
    gpgo.run(max_iter=max_iter)

    hp_opt, valid_performance_opt = gpgo.getResult()

    # Readout best hyper parameters
    i = 0
    for hp in hp_list_single:
@@ -233,4 +266,43 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        hyper_parameters[hp[0]] = map(int, hyper_parameters[hp[0]])
      i = i + hp[1]

    # Compare best model to default hyperparameters
    with open(log_file, 'a') as f:
      # Record hyperparameters
      f.write(str(params_dict))
      f.write('\n')
    if isinstance(self.model_class, str) or isinstance(self.model_class,
                                                       unicode):
      try:
        train_scores, valid_scores, _ = benchmark_classification(
            train_dataset,
            valid_dataset,
            valid_dataset, ['task_placeholder'] * n_tasks,
            output_transformers,
            n_features,
            metric,
            self.model_class,
            hyper_parameters=params_dict)
      except AssertionError:
        train_scores, valid_scores, _ = benchmark_regression(
            train_dataset,
            valid_dataset,
            valid_dataset, ['task_placeholder'] * n_tasks,
            output_transformers,
            n_features,
            metric,
            self.model_class,
            hyper_parameters=params_dict)
      score = valid_scores[self.model_class][metric[0].name]
      with open(log_file, 'a') as f:
        # Record performances
        f.write(str(score))
        f.write('\n')
      if not direction:
        score = -score
      if score > valid_performance_opt:
        # Default hyperparameters perform better; return them
        return params_dict, score

    # Return GP-optimized hyperparameters
    return hyper_parameters, valid_performance_opt
+2 −0
Original line number Diff line number Diff line
@@ -886,6 +886,8 @@ class MessagePassing(Layer):
      out = tf.pad(atom_features, ((0, 0), (0, pad_length)), mode='CONSTANT')
    elif n_atom_features > self.n_hidden:
      raise ValueError("Too large initial feature vector")
    else:
      out = atom_features

    for i in range(self.T):
      message = self.message_function.forward(out, atom_to_pair)
+3 −0
Original line number Diff line number Diff line
@@ -1095,3 +1095,6 @@ class MPNNTensorGraph(TensorGraph):
        # Only fetch the first set of unique samples
        results.append(result[:n_valid_samples])
      return np.concatenate(results, axis=0)

  def predict_on_generator(self, generator, transformers=[]):
    return self.predict_proba_on_generator(generator, transformers)
 No newline at end of file
+13 −2
Original line number Diff line number Diff line
@@ -120,6 +120,7 @@ CheckFeaturizer = {
    ('delaney', 'graphconvreg'): ['GraphConv', 75],
    ('delaney', 'dag_regression'): ['GraphConv', 75],
    ('delaney', 'weave_regression'): ['Weave', 75],
    ('delaney', 'mpnn'): ['Weave', [75, 14]],
    ('hopv', 'tf_regression'): ['ECFP', 1024],
    ('hopv', 'rf_regression'): ['ECFP', 1024],
    ('hopv', 'krr'): ['ECFP', 1024],
@@ -134,6 +135,7 @@ CheckFeaturizer = {
    ('lipo', 'graphconvreg'): ['GraphConv', 75],
    ('lipo', 'dag_regression'): ['GraphConv', 75],
    ('lipo', 'weave_regression'): ['Weave', 75],
    ('lipo', 'mpnn'): ['Weave', [75, 14]],
    ('nci', 'tf_regression'): ['ECFP', 1024],
    ('nci', 'rf_regression'): ['ECFP', 1024],
    ('nci', 'krr'): ['ECFP', 1024],
@@ -154,34 +156,43 @@ CheckFeaturizer = {
    ('sampl', 'graphconvreg'): ['GraphConv', 75],
    ('sampl', 'dag_regression'): ['GraphConv', 75],
    ('sampl', 'weave_regression'): ['Weave', 75],
    ('sampl', 'mpnn'): ['Weave', [75, 14]],
    ('kaggle', 'tf_regression'): [None, 14293],
    ('kaggle', 'rf_regression'): [None, 14293],
    ('kaggle', 'krr'): [None, 14293],
    ('pdbbind', 'tf_regression'): ['grid', 2052],
    ('pdbbind', 'rf_regression'): ['grid', 2052],
    ('pdbbind', 'krr'): ['grid', 2052],
    ('pdbbind', 'graphconvreg'): ['GraphConv', 75],
    ('qm7', 'tf_regression'): ['ECFP', 1024],
    ('qm7', 'rf_regression'): ['ECFP', 1024],
    ('qm7', 'krr'): ['ECFP', 1024],
    ('qm7', 'krr_ft'): ['CoulombMatrix', 1024],
    ('qm7', 'graphconvreg'): ['GraphConv', 75],
    ('qm7', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm7', 'ani'): ['BPSymmetryFunction', [23, 4]],
    ('qm7b', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7b', 'krr_ft'): ['CoulombMatrix', 1024],
    ('qm7b', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm8', 'tf_regression'): ['ECFP', 1024],
    ('qm8', 'rf_regression'): ['ECFP', 1024],
    ('qm8', 'krr'): ['ECFP', 1024],
    ('qm8', 'graphconvreg'): ['GraphConv', 75],
    ('qm8', 'tf_regression_ft'): ['CoulombMatrix', [26, 26]],
    ('qm8', 'krr_ft'): ['CoulombMatrix', 1024],
    ('qm8', 'dtnn'): ['CoulombMatrix', [26, 26]],
    ('qm8', 'ani'): ['BPSymmetryFunction', [26, 4]],
    ('qm8', 'mpnn'): ['MP', [70, 8]],
    ('qm9', 'tf_regression'): ['ECFP', 1024],
    ('qm9', 'rf_regression'): ['ECFP', 1024],
    ('qm9', 'krr'): ['ECFP', 1024],
    ('qm9', 'graphconvreg'): ['GraphConv', 75],
    ('qm9', 'tf_regression_ft'): ['CoulombMatrix', [29, 29]],
    ('qm9', 'krr_ft'): ['CoulombMatrix', 1024],
    ('qm9', 'dtnn'): ['CoulombMatrix', [29, 29]],
    ('qm9', 'ani'): ['BPSymmetryFunction', [29, 4]]
    ('qm9', 'ani'): ['BPSymmetryFunction', [29, 4]],
    ('qm9', 'mpnn'): ['MP', [70, 8]]
}

CheckSplit = {
@@ -199,7 +210,7 @@ CheckSplit = {
    'muv': ['index', 'random', 'scaffold', 'task'],
    'nci': ['index', 'random', 'scaffold'],
    'pcba': ['index', 'random', 'scaffold'],
    'pdbbind': ['index', 'random'],
    'pdbbind': ['index', 'random', 'time'],
    'ppb': ['index', 'random', 'scaffold'],
    'qm7': ['index', 'random', 'stratified'],
    'qm7b': ['index', 'random', 'stratified'],
+12 −8
Original line number Diff line number Diff line
@@ -21,8 +21,8 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
  """Featurizes pdbbind according to provided featurization"""
  tasks = ["-logKd/Ki"]
  data_dir = deepchem.utils.get_data_dir()
  data_dir = os.path.join(data_dir, "pdbbind")
  dataset_dir = os.path.join(data_dir, "%s_%s" % (subset, feat))
  pdbbind_dir = os.path.join(data_dir, "pdbbind")
  dataset_dir = os.path.join(pdbbind_dir, "%s_%s" % (subset, feat))

  if not os.path.exists(dataset_dir):
    deepchem.utils.download_url(
@@ -34,12 +34,14 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/refined_grid.tar.gz'
    )
    if not os.path.exists(pdbbind_dir):
      os.system('mkdir ' + pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'core_grid.tar.gz'), data_dir)
        os.path.join(data_dir, 'core_grid.tar.gz'), pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'full_grid.tar.gz'), data_dir)
        os.path.join(data_dir, 'full_grid.tar.gz'), pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'refined_grid.tar.gz'), data_dir)
        os.path.join(data_dir, 'refined_grid.tar.gz'), pdbbind_dir)

  return deepchem.data.DiskDataset(dataset_dir), tasks

@@ -54,7 +56,8 @@ def load_pdbbind_grid(split="random",

    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter()
        'random': deepchem.splits.RandomSplitter(),
        'time': deepchem.splits.TimeSplitterPDBbind(dataset.ids)
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)
@@ -105,11 +108,12 @@ def load_pdbbind_grid(split="random",

    for transformer in transformers:
      dataset = transformer.transform(dataset)

    df = pd.read_csv(dataset_file)
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter()
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'time': deepchem.splits.TimeSplitterPDBbind(np.array(df['id']))
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)
Loading