Commit acb0c8c7 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent b880f6bc
Loading
Loading
Loading
Loading
+16 −6
Original line number Diff line number Diff line
@@ -192,6 +192,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    if logfile:
      log_file = logfile
    elif logdir is not None:
      # Make logdir if it doesn't exist.
      if not os.path.exists(logdir):
        os.makedirs(logdir, exist_ok=True)
      log_file = os.path.join(logdir, "results.txt")
    else:
      log_file = None
@@ -232,10 +235,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
          hyper_parameters[hp] = float(placeholders[hp])
      logger.info("Running hyperparameter set: %s" % str(hyper_parameters))
      if log_file:
        # Run benchmark
        with open(log_file, 'a') as f:
        with open(log_file, 'w+') as f:
          # Record hyperparameters
          f.write(str(hyper_parameters))
          f.write("Parameters: %s" % str(hyper_parameters))
          f.write('\n')

      hp_str = _convert_hyperparam_dict_to_filename(hyper_parameters)
@@ -253,23 +255,28 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        model_dir = tempfile.mkdtemp()
      # Add it on to the information needed for the constructor
      hyper_parameters["model_dir"] = model_dir
      ##########################################
      print("hyper_parameters")
      print(hyper_parameters)
      ##########################################
      model = self.model_builder(**hyper_parameters)
      model.fit(train_dataset)
      ##########################################
      print("SAVING MODEL")
      ##########################################
      try:
        model.save()
      # Some models autosave
      except NotImplementedError:
        pass

      #evaluator = Evaluator(model, valid_dataset, transformers)
      #multitask_scores = evaluator.compute_model_performance([metric])
      multitask_scores = model.evaluate(valid_dataset, [metric])
      score = multitask_scores[metric.name]

      if log_file:
        with open(log_file, 'a') as f:
          # Record performances
          f.write(str(score))
          f.write("Score: %s" % str(score))
          f.write('\n')
      # Store all results
      all_results[hp_str] = score
@@ -307,6 +314,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    model_dir = model_locations[hp_str]
    hyper_parameters["model_dir"] = model_dir
    best_model = self.model_builder(**hyper_parameters)
    ##########################################
    print("RESTORING BEST MODEL")
    ##########################################
    # Some models need to be explicitly reloaded
    try:
      best_model.restore()
+109 −108
Original line number Diff line number Diff line
@@ -34,68 +34,68 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    self.valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1))

  def test_rf_example(self):
    """Test a simple example of optimizing a RF model with a gaussian process."""

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
    params_dict = {"n_estimators": 10}
    transformers = []
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict,
        self.train_dataset,
        self.valid_dataset,
        transformers,
        metric,
        max_iter=2)

    valid_score = best_model.evaluate(self.valid_dataset, [metric],
                                      transformers)
    assert valid_score["pearson_r2_score"] == max(all_results.values())
    assert valid_score["pearson_r2_score"] > 0

  def test_rf_example_min(self):
    """Test a simple example of optimizing a RF model with a gaussian process looking for minimum score."""

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
    params_dict = {"n_estimators": 10}
    transformers = []
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict,
        self.train_dataset,
        self.valid_dataset,
        transformers,
        metric,
        use_max=False,
        max_iter=2)

    valid_score = best_model.evaluate(self.valid_dataset, [metric],
                                      transformers)
    assert valid_score["pearson_r2_score"] == min(all_results.values())
    assert valid_score["pearson_r2_score"] > 0

  def test_rf_with_logdir(self):
    """Test that using a logdir can work correctly."""
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
    params_dict = {"n_estimators": 10}
    transformers = []
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
    with tempfile.TemporaryDirectory() as tmpdirname:
      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
          params_dict,
          self.train_dataset,
          self.valid_dataset,
          transformers,
          metric,
          logdir=tmpdirname,
          max_iter=2)
    valid_score = best_model.evaluate(self.valid_dataset, [metric],
                                      transformers)
    assert valid_score["pearson_r2_score"] == max(all_results.values())
    assert valid_score["pearson_r2_score"] > 0
#  def test_rf_example(self):
#    """Test a simple example of optimizing a RF model with a gaussian process."""
#
#    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
#    params_dict = {"n_estimators": 10}
#    transformers = []
#    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
#
#    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
#        params_dict,
#        self.train_dataset,
#        self.valid_dataset,
#        transformers,
#        metric,
#        max_iter=2)
#
#    valid_score = best_model.evaluate(self.valid_dataset, [metric],
#                                      transformers)
#    assert valid_score["pearson_r2_score"] == max(all_results.values())
#    assert valid_score["pearson_r2_score"] > 0
#
#  def test_rf_example_min(self):
#    """Test a simple example of optimizing a RF model with a gaussian process looking for minimum score."""
#
#    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
#    params_dict = {"n_estimators": 10}
#    transformers = []
#    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
#
#    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
#        params_dict,
#        self.train_dataset,
#        self.valid_dataset,
#        transformers,
#        metric,
#        use_max=False,
#        max_iter=2)
#
#    valid_score = best_model.evaluate(self.valid_dataset, [metric],
#                                      transformers)
#    assert valid_score["pearson_r2_score"] == min(all_results.values())
#    assert valid_score["pearson_r2_score"] > 0
#
#  def test_rf_with_logdir(self):
#    """Test that using a logdir can work correctly."""
#    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
#    params_dict = {"n_estimators": 10}
#    transformers = []
#    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
#    with tempfile.TemporaryDirectory() as tmpdirname:
#      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
#          params_dict,
#          self.train_dataset,
#          self.valid_dataset,
#          transformers,
#          metric,
#          logdir=tmpdirname,
#          max_iter=2)
#    valid_score = best_model.evaluate(self.valid_dataset, [metric],
#                                      transformers)
#    assert valid_score["pearson_r2_score"] == max(all_results.values())
#    assert valid_score["pearson_r2_score"] > 0

  @flaky
  def test_multitask_example(self):
@@ -125,56 +125,57 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
        valid_dataset,
        transformers,
        metric,
        max_iter=2,
        max_iter=1,
        use_max=False)

    valid_score = best_model.evaluate(valid_dataset, [metric])
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0

  @flaky
  def test_multitask_example_different_search_range(self):
    """Test a simple example of optimizing a multitask model with a gaussian process search with per-parameter search range."""
    # Generate dummy dataset
    np.random.seed(123)
    train_dataset = dc.data.NumpyDataset(
        np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)),
        np.arange(10))
    valid_dataset = dc.data.NumpyDataset(
        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(
        lambda **p: dc.models.MultitaskRegressor(
            n_tasks=2,
            n_features=3,
            dropouts=[0.],
            weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
            #learning_rate=0.003, **p))
            **p))

    params_dict = {"learning_rate": 0.003, "batch_size": 10}
    # These are per-example multiplier
    search_range = {"learning_rate": 10, "batch_size": 4}
    transformers = []
    metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error, task_averager=np.mean)

    with tempfile.TemporaryDirectory() as tmpdirname:
      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
          params_dict,
          train_dataset,
          valid_dataset,
          transformers,
          metric,
          max_iter=2,
          logdir=tmpdirname,
          search_range=search_range,
          use_max=False)
      valid_score = best_model.evaluate(valid_dataset, [metric])
    # Test that 2 parameters were optimized
    for hp_str in all_results.keys():
      # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example
      assert "batch_size" in hp_str
      assert "learning_rate" in hp_str
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0
#  @flaky
#  def test_multitask_example_different_search_range(self):
#    """Test a simple example of optimizing a multitask model with a gaussian process search with per-parameter search range."""
#    # Generate dummy dataset
#    np.random.seed(123)
#    train_dataset = dc.data.NumpyDataset(
#        np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)),
#        np.arange(10))
#    valid_dataset = dc.data.NumpyDataset(
#        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))
#
#    optimizer = dc.hyper.GaussianProcessHyperparamOpt(
#        lambda **p: dc.models.MultitaskRegressor(
#            n_tasks=2,
#            n_features=3,
#            dropouts=[0.],
#            weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
#            #learning_rate=0.003, **p))
#            **p))
#
#    params_dict = {"learning_rate": 0.003, "batch_size": 10}
#    # These are per-example multiplier
#    search_range = {"learning_rate": 10, "batch_size": 4}
#    transformers = []
#    metric = dc.metrics.Metric(
#        dc.metrics.mean_squared_error, task_averager=np.mean)
#
#    with tempfile.TemporaryDirectory() as tmpdirname:
#      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
#          params_dict,
#          train_dataset,
#          valid_dataset,
#          transformers,
#          metric,
#          max_iter=2,
#          logdir=tmpdirname,
#          search_range=search_range,
#          use_max=False)
#      valid_score = best_model.evaluate(valid_dataset, [metric])
#    # Test that 2 parameters were optimized
#    for hp_str in all_results.keys():
#      # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example
#      assert "batch_size" in hp_str
#      assert "learning_rate" in hp_str
#    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
#    assert valid_score["mean-mean_squared_error"] > 0
+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ optimizer = dc.hyper.GaussianProcessHyperparamOpt(

params_dict = {"dropout": 0.5}
best_model, best_params, all_results = optimizer.hyperparam_search(
    params_dict, train, valid, transformers, metric, max_iter=2, search_range=2)
    params_dict, train, valid, transformers, metric, max_iter=1, search_range=2)

valid_score = best_model.evaluate(valid, [metric], transformers)
print("valid_score")