Commit 3dd366c2 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Some bugfixes

parent ef843889
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -637,6 +637,12 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
      if y[ind, task] == "":
        missing[ind, task] = 1
  x_list = list(df[feature_type].values)
  ############################################################## DEBUG
  print("x_list")
  print(x_list)
  print("[type(elt) for elt in x_list]")
  print([type(elt) for elt in x_list])
  ############################################################## DEBUG
  valid_inds = np.array([1 if elt.size > 0 else 0 for elt in x_list], dtype=bool)
  x_list = [elt for (is_valid, elt) in zip(valid_inds, x_list) if is_valid]
  x = np.squeeze(np.array(x_list))
+8 −1
Original line number Diff line number Diff line
@@ -430,8 +430,15 @@ class DataFeaturizer(object):
    ############################################################## DEBUG
    df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(pd.to_numeric)
    X_shard = df.as_matrix(columns=featurizer.feature_fields)
    df[featurizer.__class__.__name__] = X_shard.tolist()
    ############################################################## DEBUG
    #df[featurizer.__class__.__name__] = X_shard.tolist()
    df[featurizer.__class__.__name__] = [np.array(elt) for elt in X_shard.tolist()]
    ############################################################## DEBUG
    print("X_shard")
    print(X_shard)
    print("type(X_shard)")
    print(type(X_shard))
    print("[type(elt) for elt in X_shard.tolist()]")
    print([type(elt) for elt in X_shard.tolist()])
    #for ind, row in df.iterrows():
    #  # pandas rows are tuples (row_num, row_data)
    #  feature_list = []
+58 −54
Original line number Diff line number Diff line
@@ -95,7 +95,7 @@ class TestModelAPI(TestAPI):
                                smiles_field=self.smiles_field,
                                featurizer=featurizer,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)
    dataset = featurizer.featurize(input_file, self.data_dir, debug=True)

    splitter = SpecifiedSplitter(input_file, "split")
    train_dataset, test_dataset = splitter.train_test_split(
@@ -230,60 +230,64 @@ class TestModelAPI(TestAPI):
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

  def test_singletask_keras_mlp_USF_regression_API(self):
    """Exercise the singletask Keras MLP regression API on user-specified features.

    The "evals" column of gbd3k.pkl.gz is used as the feature field and "u0"
    as the single regression task. The featurized data is scaffold-split,
    passed through the X/y transformers, used to fit a SingleTaskDNN, and the
    fitted model is then evaluated on both the train and the test split.
    """
    from deepchem.models.keras_models.fcnet import SingleTaskDNN
    user_featurizer = UserDefinedFeaturizer(["evals"])
    tasks = ["u0"]
    task_types = {task: "regression" for task in tasks}
    model_params = dict(
        nb_hidden=10, activation="relu", dropout=0.5, learning_rate=0.01,
        momentum=0.9, nesterov=False, decay=1e-4, batch_size=5,
        nb_epoch=2, init="glorot_uniform", nb_layers=1, batchnorm=False)

    # Featurize the pickled input frame into the shared data directory.
    input_file = os.path.join(self.current_dir, "gbd3k.pkl.gz")
    data_featurizer = DataFeaturizer(tasks=tasks,
                                     smiles_field=self.smiles_field,
                                     featurizer=user_featurizer,
                                     verbosity="low")
    full_dataset = data_featurizer.featurize(input_file, self.data_dir)

    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        full_dataset, self.train_dir, self.test_dir)

    # Input (X) transformers come first, followed by the output (y)
    # transformer; all of them are constructed from the training split.
    transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset),
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]

    for split in (train_dataset, test_dataset):
      for transformer in transformers:
        transformer.transform(split)

    # The data shape is read only after the transformers have been applied.
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [
      Metric(metric_fn)
      for metric_fn in (metrics.r2_score,
                        metrics.mean_squared_error,
                        metrics.mean_absolute_error)]

    # Fit the model and persist it to the model directory.
    model = SingleTaskDNN(tasks, task_types, model_params, self.model_dir)
    model.fit(train_dataset)
    model.save()

    # Evaluate on the train split first, then on the test split.
    for split in (train_dataset, test_dataset):
      evaluator = Evaluator(model, split, transformers, verbosity=True)
      evaluator.compute_model_performance(regression_metrics)
  #### TODO(rbharath): This test is being disabled since deepchem no longer
  #### accepts this format of input. Decide whether this test should be deleted
  #### altogether or replaced.
  #def test_singletask_keras_mlp_USF_regression_API(self):
  #  """Test of singletask MLP User Specified Features regression API."""
  #  from deepchem.models.keras_models.fcnet import SingleTaskDNN
  #  featurizer = UserDefinedFeaturizer(["evals"])
  #  tasks = ["u0"]
  #  task_type = "regression"
  #  task_types = {task: task_type for task in tasks}
  #  model_params = {"nb_hidden": 10, "activation": "relu",
  #                  "dropout": .5, "learning_rate": .01,
  #                  "momentum": .9, "nesterov": False,
  #                  "decay": 1e-4, "batch_size": 5,
  #                  "nb_epoch": 2, "init": "glorot_uniform",
  #                  "nb_layers": 1, "batchnorm": False}

  #  input_file = os.path.join(self.current_dir, "gbd3k.pkl.gz")
  #  featurizer = DataFeaturizer(tasks=tasks,
  #                              smiles_field=self.smiles_field,
  #                              featurizer=featurizer,
  #                              verbosity="low")
  #  dataset = featurizer.featurize(input_file, self.data_dir)

  #  splitter = ScaffoldSplitter()
  #  train_dataset, test_dataset = splitter.train_test_split(
  #      dataset, self.train_dir, self.test_dir)

  #  input_transformers = [
  #    NormalizationTransformer(transform_X=True, dataset=train_dataset),
  #    ClippingTransformer(transform_X=True, dataset=train_dataset)]
  #  output_transformers = [
  #    NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  #  transformers = input_transformers + output_transformers

  #  for dataset in [train_dataset, test_dataset]:
  #    for transformer in transformers:
  #      transformer.transform(dataset)

  #  model_params["data_shape"] = train_dataset.get_data_shape()
  #  regression_metrics = [Metric(metrics.r2_score),
  #                        Metric(metrics.mean_squared_error),
  #                        Metric(metrics.mean_absolute_error)]

  #  model = SingleTaskDNN(tasks, task_types, model_params, self.model_dir)

  #  # Fit trained model
  #  model.fit(train_dataset)
  #  model.save()

  #  # Eval model on train
  #  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  #  _ = evaluator.compute_model_performance(regression_metrics)

  #  # Eval model on test
  #  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  #  _ = evaluator.compute_model_performance(regression_metrics)


  def test_multitask_keras_mlp_ECFP_classification_API(self):