Commit b880f6bc authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent ecd815fd
Loading
Loading
Loading
Loading
+1 −5
Original line number Diff line number Diff line
@@ -96,7 +96,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):

  >>> import sklearn
  >>> import deepchem as dc
  >>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(lambda **p: dc.models.GraphConvModel(**p))
  >>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(lambda **p: dc.models.GraphConvModel(n_tasks=1, **p))

  Here's a more sophisticated example that shows how to optimize only
  some parameters of a model. In this case, we have some parameters we
@@ -222,10 +222,6 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      valid_scores: float
        valid set performances
      """
      ############################
      print("placeholders: %s" % str(placeholders))
      print("param_range: %s" % str(param_range))
      ############################
      hyper_parameters = {}
      for hp in param_keys:
        if param_range[hp][0] == "int":
+19 −0
Original line number Diff line number Diff line
@@ -34,6 +34,25 @@ class GridHyperparamOpt(HyperparamOpt):
  >>> import deepchem as dc
  >>> optimizer = dc.hyper.GridHyperparamOpt(lambda **p: dc.models.GraphConvModel(**p))

  Here's a more sophisticated example that shows how to optimize only
  some parameters of a model. In this case, we have some parameters we
  want to optimize, and others which we don't. To handle this type of
  search, we create a `model_builder` which hard codes some arguments
  (in this case, `n_tasks` and `n_features` which are properties of a
  dataset and not hyperparameters to search over).

  >>> def model_builder(**model_params):
  ...   n_layers = model_params['layers']
  ...   layer_width = model_params['width']
  ...   dropout = model_params['dropout']
  ...   return dc.models.MultitaskClassifier(
  ...     n_tasks=5,
  ...     n_features=100,
  ...     layer_sizes=[layer_width]*n_layers,
  ...     dropouts=dropout
  ...   )
  >>> optimizer = dc.hyper.GridHyperparamOpt(model_builder)

  """

  def hyperparam_search(self,
+0 −31
Original line number Diff line number Diff line
@@ -97,37 +97,6 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    assert valid_score["pearson_r2_score"] == max(all_results.values())
    assert valid_score["pearson_r2_score"] > 0

  def test_regression_overfit(self):
    """Check that MultitaskRegressor drives training MSE near zero on a trivial dataset."""
    num_samples, num_features, num_tasks = 10, 3, 1

    # Deterministic toy dataset whose targets are all zero, so a model
    # that trains at all should overfit it easily.
    np.random.seed(123)
    sample_ids = np.arange(num_samples)
    features = np.random.rand(num_samples, num_features)
    targets = np.zeros((num_samples, num_tasks))
    weights = np.ones((num_samples, num_tasks))
    dataset = dc.data.NumpyDataset(features, targets, weights, sample_ids)

    mse_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
    # TODO(rbharath): This breaks with optimizer="momentum". Why?
    model = dc.models.MultitaskRegressor(
        num_tasks,
        num_features,
        dropouts=[0.],
        weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
        batch_size=num_samples,
        learning_rate=0.003)

    # Train long enough to overfit the ten samples.
    model.fit(dataset, nb_epoch=100)

    # Training-set error should be tiny after overfitting.
    scores = model.evaluate(dataset, [mse_metric])
    assert scores[mse_metric.name] < .1

  @flaky
  def test_multitask_example(self):
    """Test a simple example of optimizing a multitask model with a gaussian process search."""
+11 −12
Original line number Diff line number Diff line
@@ -6,21 +6,20 @@ import deepchem as dc
import sklearn

# Load delaney dataset
# NOTE(review): this text is a commit-diff dump — the next two statements are
# presumably the pre- and post-change versions of the same load_delaney call;
# only the featurizer="GraphConv" form would belong to the new revision.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney()
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer="GraphConv")
train, valid, test = delaney_datasets

# Fit models
# Pearson R^2 on the validation set is the objective the optimizer maximizes.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
# NOTE(review): presumably the removed (old) optimizer; a later assignment in
# this dump builds GaussianProcessHyperparamOpt(rf_model_builder) instead.
optimizer = dc.hyper.GaussianProcessHyperparamOpt(
    lambda **p: dc.models.GraphConvModel(
    n_tasks=len(delaney_tasks), mode="regression", **p))


def rf_model_builder(**model_params):
  """Build a DeepChem SklearnModel wrapping a RandomForestRegressor.

  `model_dir` is split off and handed to SklearnModel; every other keyword
  argument is forwarded to the random-forest constructor. Raises KeyError
  if `model_dir` is not supplied, matching the original contract.
  """
  params = dict(model_params)
  model_dir = params.pop('model_dir')
  forest = sklearn.ensemble.RandomForestRegressor(**params)
  return dc.models.SklearnModel(forest, model_dir)


# NOTE(review): diff residue — both the old search space
# ({"n_estimators": 30}) and the new one ({"dropout": 0.5}) appear; the
# second assignment would win if this text were executed verbatim.
params_dict = {"n_estimators": 30}
optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
params_dict = {"dropout": 0.5}
# NOTE(review): the two argument lines below are the pre-/post-diff versions
# of the same call; together they are not syntactically valid Python.
best_model, best_params, all_results = optimizer.hyperparam_search(
    params_dict, train, valid, transformers, metric)
    params_dict, train, valid, transformers, metric, max_iter=2, search_range=2)

# Report the best model's validation performance.
valid_score = best_model.evaluate(valid, [metric], transformers)
print("valid_score")
print(valid_score)
+7 −12
Original line number Diff line number Diff line
@@ -8,24 +8,19 @@ import logging
# Emit INFO-level logs so hyperparameter-search progress is visible.
logging.basicConfig(level=logging.INFO)

# Load delaney dataset
# NOTE(review): commit-diff dump — the next two statements are presumably the
# pre- and post-change versions of the same load_delaney call; only the
# featurizer="GraphConv" form would be current.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney()
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer="GraphConv")
train, valid, test = delaney_datasets

# Fit models
# Pearson R^2 on the validation set is the objective being maximized.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
# NOTE(review): presumably the removed (old) optimizer; a later assignment in
# this dump builds GaussianProcessHyperparamOpt(rf_model_builder) instead.
optimizer = dc.hyper.GaussianProcessHyperparamOpt(
    lambda **p: dc.models.GraphConvModel(
    n_tasks=len(delaney_tasks), mode="regression", **p))

def rf_model_builder(**model_params):
  """Construct a SklearnModel around a RandomForestRegressor.

  All keyword arguments except `model_dir` become RandomForestRegressor
  constructor arguments; `model_dir` (required, KeyError if absent) is
  passed through to SklearnModel as the save location.
  """
  model_dir = model_params['model_dir']
  forest_kwargs = {
      key: value for key, value in model_params.items() if key != 'model_dir'
  }
  sklearn_model = sklearn.ensemble.RandomForestRegressor(**forest_kwargs)
  return dc.models.SklearnModel(sklearn_model, model_dir)


# NOTE(review): diff residue — old ({"n_estimators": 30}) and new
# ({"dropout": 0.5}) parameter dicts both appear here; the second assignment
# would win if this text were executed verbatim.
params_dict = {"n_estimators": 30}
optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
params_dict = {"dropout": 0.5}
# NOTE(review): the two argument lines below are the pre-/post-diff versions
# of the same call — not valid Python if run verbatim.
best_model, best_params, all_results = optimizer.hyperparam_search(
    params_dict, train, valid, transformers, metric, logdir="/tmp")
    params_dict, train, valid, transformers, metric, max_iter=2, search_range=2)

# Report the best model's validation performance.
valid_score = best_model.evaluate(valid, [metric], transformers)
print("valid_score")
Loading