Commit 1b474de8 authored by Bharath Ramsundar
Browse files

Changes

parent 8b5708d4
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -1051,7 +1051,6 @@ class DiskDataset(Dataset):
      Location on disk of an existing `DiskDataset`.
    """
    self.data_dir = data_dir
    self.legacy_metadata = legacy_metadata

    logger.info("Loading dataset from disk.")
    self.tasks, self.metadata_df = self.load_metadata()
@@ -1114,7 +1113,7 @@ class DiskDataset(Dataset):
    DiskDataset._save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    logger.info("TIMING: dataset construction took %0.3f s" % (time2 - time1))
    return DiskDataset(data_dir, legacy_metadata)
    return DiskDataset(data_dir)

  def load_metadata(self) -> Tuple[List[str], pd.DataFrame]:
    """Helper method that loads metadata from disk."""
@@ -2193,23 +2192,23 @@ class DiskDataset(Dataset):
      for shard_num in range(n_rows):
        row = self.metadata_df.iloc[shard_num]
        if row['X_shape'] is not None:
          shard_X_shape = make_tuple(row['X_shape'])
          shard_X_shape = make_tuple(str(row['X_shape']))
        else:
          shard_X_shape = tuple()
        if n_tasks > 0:
          if row['y_shape'] is not None:
            shard_y_shape = make_tuple(row['y_shape'])
            shard_y_shape = make_tuple(str(row['y_shape']))
          else:
            shard_y_shape = tuple()
          if row['w_shape'] is not None:
            shard_w_shape = make_tuple(row['w_shape'])
            shard_w_shape = make_tuple(str(row['w_shape']))
          else:
            shard_w_shape = tuple()
        else:
          shard_y_shape = tuple()
          shard_w_shape = tuple()
        if row['ids_shape'] is not None:
          shard_ids_shape = make_tuple(row['ids_shape'])
          shard_ids_shape = make_tuple(str(row['ids_shape']))
        else:
          shard_ids_shape = tuple()
        if shard_num == 0:
+59 −57
Original line number Diff line number Diff line
@@ -122,63 +122,65 @@ def test_sklearn_multitask_regression():
  assert score > .5


#def test_sklearn_classification():
#  """Test that sklearn models can learn on simple classification datasets."""
#  np.random.seed(123)
#  dataset = sklearn.datasets.load_digits(n_class=2)
#  X, y = dataset.data, dataset.target

#  frac_train = .7
#  n_samples = len(X)
#  n_train = int(frac_train*n_samples)
#  X_train, y_train = X[:n_train], y[:n_train]
#  X_test, y_test = X[n_train:], y[n_train:]
#  train_dataset = dc.data.NumpyDataset(X_train, y_train)
#  test_dataset = dc.data.NumpyDataset(X_test, y_test)

#  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
#  sklearn_model = LogisticRegression()
#  model = dc.models.SklearnModel(sklearn_model)

#  # Fit trained model
#  model.fit(train_dataset)
#  model.save()

#  # Eval model on test
#  scores = model.evaluate(test_dataset, [classification_metric])
#  assert scores[classification_metric.name] > .5

#def test_sklearn_multitask_classification():
#  """Test that sklearn models can learn on simple multitask classification."""
#  np.random.seed(123)
#  n_tasks = 4
#  tasks = range(n_tasks)
#  dataset = sklearn.datasets.load_digits(n_class=2)
#  X, y = dataset.data, dataset.target
#  y = np.reshape(y, (len(y), 1))
#  y = np.hstack([y] * n_tasks)
#
#  frac_train = .7
#  n_samples = len(X)
#  n_train = int(frac_train*n_samples)
#  X_train, y_train = X[:n_train], y[:n_train]
#  X_test, y_test = X[n_train:], y[n_train:]
#  train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
#  test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

#  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
#  def model_builder(model_dir):
#    sklearn_model = LogisticRegression()
#    return dc.models.SklearnModel(sklearn_model, model_dir)
#  model = dc.models.SingletaskToMultitask(tasks, model_builder)

#  # Fit trained model
#  model.fit(train_dataset)
#  model.save()
#  # Eval model on test
#  scores = model.evaluate(test_dataset, [classification_metric])
#  for score in scores[classification_metric.name]:
#    assert score > .5
def test_sklearn_classification():
  """Check that a wrapped sklearn classifier learns two-class digits.

  A `LogisticRegression` wrapped in `dc.models.SklearnModel` is fit on the
  leading 70% of the samples and evaluated on the remainder; the ROC AUC
  reported for the held-out rows must be above 0.5.
  """
  np.random.seed(123)
  digits = sklearn.datasets.load_digits(n_class=2)
  features = digits.data
  labels = digits.target

  # Sequential (unshuffled) split: first 70% trains, the rest is held out.
  split = int(.7 * len(features))
  train_dataset = dc.data.NumpyDataset(features[:split], labels[:split])
  test_dataset = dc.data.NumpyDataset(features[split:], labels[split:])

  metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
  model = dc.models.SklearnModel(LogisticRegression())

  # Train, then exercise the save path as well.
  model.fit(train_dataset)
  model.save()

  # Score on the held-out rows.
  results = model.evaluate(test_dataset, [metric])
  assert results[metric.name] > .5


def test_sklearn_multitask_classification():
  """Test that sklearn models can learn on simple multitask classification.

  The two-class digits labels are copied into `n_tasks` identical columns,
  the data is split 70/30 in order into two `DiskDataset`s, and a
  `SingletaskToMultitask` model is fit using a builder that wraps a new
  `LogisticRegression` for each model directory it is given. The ROC AUC
  reported on the held-out rows must exceed 0.5.
  """
  np.random.seed(123)
  n_tasks = 4
  tasks = range(n_tasks)
  dataset = sklearn.datasets.load_digits(n_class=2)
  X, y = dataset.data, dataset.target
  # Turn the label vector into a column and repeat it once per task, so
  # every task shares the same targets.
  y = np.reshape(y, (len(y), 1))
  y = np.hstack([y] * n_tasks)

  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
  test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

  def model_builder(model_dir):
    """Wrap a fresh LogisticRegression in a SklearnModel for `model_dir`."""
    sklearn_model = LogisticRegression()
    return dc.models.SklearnModel(sklearn_model, model_dir)

  model = dc.models.SingletaskToMultitask(tasks, model_builder)

  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on test
  scores = model.evaluate(test_dataset, [classification_metric])
  # Key the lookup on the metric's own name rather than a hard-coded string,
  # so the assertion keeps working if the metric is renamed or swapped.
  assert scores[classification_metric.name] > 0.5


def test_xgboost_regression():