Commit 4a1b7527 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Still more test fixes

parent 979ba0d5
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -27,7 +27,6 @@ class Dataset(object):
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
  """
  def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None, 
               #use_user_specified_features=False,
               raw_data=None, verbosity=None, reload=False):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
@@ -424,7 +423,8 @@ def _df_to_numpy(df, feature_types, tasks):
  # perform common train/test split across all tasks
  n_samples = df.shape[0]
  n_tasks = len(tasks)
  y = np.hstack([np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  y = np.hstack([
      np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  w = np.ones((n_samples, n_tasks))
  missing = np.zeros_like(y).astype(int)
  tensors = []
@@ -457,6 +457,11 @@ def _df_to_numpy(df, feature_types, tasks):
      missing[ind, :] = 1
      continue
    tensors.append(features)
  ################################################## DEBUG
  #print("_df_to_numpy")
  #print("tensors, n_samples, feature_types")
  #print(tensors, n_samples, feature_types)
  ################################################## DEBUG
  x = np.stack(tensors)
  sorted_ids = df["mol_id"]

+8 −0
Original line number Diff line number Diff line
@@ -233,3 +233,11 @@ class Featurizer(object):
        for j in xrange(n_confs):
          x[i, j] = mol_features[j]
    return x

class UserDefinedFeaturizer(Featurizer):
  """Featurizer that points at feature fields the user already computed."""

  def __init__(self, feature_fields):
    """Record which fields hold the user-supplied features.

    Args:
      feature_fields: Names of the fields to use as features; stored
        unchanged on `self.feature_fields`.
    """
    self.feature_fields = feature_fields
+61 −53
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_pandas_from_disk
from deepchem.featurizers import Featurizer, ComplexFeaturizer
from deepchem.featurizers import UserDefinedFeaturizer
from deepchem.datasets import Dataset

def _process_field(val):
@@ -37,6 +38,36 @@ def _process_field(val):
  else:
    raise ValueError("Field of unrecognized type: %s" % str(val))

def load_data(input_file):
  """Read a raw dataframe from `input_file`.

  The input type reported by `_get_input_type` selects the loader: "sdf"
  inputs go through `_load_sdf_file`, every other type goes through
  `_load_csv_file`.
  """
  if _get_input_type(input_file) == "sdf":
    return _load_sdf_file(input_file)
  return _load_csv_file(input_file)

def _load_sdf_file(input_file):
  """Load an SDF file and its companion task table into one dataframe.

  Task columns are read from `input_file + ".csv"`; structures are read from
  `input_file` itself with RDKit. Entries the supplier yields as None are
  dropped.

  Args:
    input_file: Path to the .sdf file; "<input_file>.csv" is read as well.

  Returns:
    DataFrame with columns "mol_id", "smiles" and "mol" followed by the
    columns of the task table, one row per readable molecule, with a
    contiguous 0..k-1 index.
  """
  # Tasks are stored in .sdf.csv file
  # NOTE(review): assumes the task table carries the default positional index
  # (row i describes molecule i of the SDF file) -- confirm against
  # load_pandas_from_disk.
  raw_df = load_pandas_from_disk(input_file+".csv")
  # Structures are stored in .sdf file
  print("Reading structures from %s." % input_file)
  suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
  df_rows = [[ind, Chem.MolToSmiles(mol), mol]
             for ind, mol in enumerate(suppl) if mol is not None]
  # Index every molecule by its position in the SDF file. The concat below
  # aligns on index, so with a default 0..k-1 index a single unreadable
  # molecule would shift all later molecules onto the wrong task rows.
  positions = pd.Index([row[0] for row in df_rows], dtype="int64")
  mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'),
                        index=positions)
  raw_df = pd.concat([mol_df, raw_df], axis=1, join='inner')
  # Restore the contiguous index the result has when every molecule parses.
  return raw_df.reset_index(drop=True)

def _load_csv_file(input_file):
  """Read `input_file` into a dataframe via `load_pandas_from_disk`."""
  return load_pandas_from_disk(input_file)

def _get_input_type(input_file):
  """Get type of input file. Must be csv/pkl.gz/sdf file."""
  filename, file_extension = os.path.splitext(input_file)
@@ -84,8 +115,8 @@ class DataFeaturizer(object):
  dataframe object to disk as output.
  """

  def __init__(self, tasks, smiles_field, split_field=None,
               id_field=None, threshold=None, user_specified_features=None,
  def __init__(self, tasks, smiles_field,
               id_field=None, threshold=None,
               protein_pdb_field=None, ligand_pdb_field=None,
               ligand_mol2_field=None, mol_field=None,
               featurizers=[],
@@ -97,7 +128,6 @@ class DataFeaturizer(object):
    self.verbosity = verbosity
    self.tasks = tasks
    self.smiles_field = smiles_field
    self.split_field = split_field
    if id_field is None:
      self.id_field = smiles_field
    else:
@@ -107,51 +137,29 @@ class DataFeaturizer(object):
    self.ligand_pdb_field = ligand_pdb_field
    self.ligand_mol2_field = ligand_mol2_field
    self.mol_field = mol_field
    self.user_specified_features = user_specified_features
    self.user_specified_features = None
    for featurizer in featurizers:
      if isinstance(featurizer, UserDefinedFeaturizer):
        self.user_specified_features = featurizer.feature_fields 
    self.featurizers = featurizers
    self.log_every_n = log_every_n

  def _load_sdf_file(self, input_file):
    """Load an SDF file and its companion task table into one dataframe.

    Task columns are read from `input_file + ".csv"`; structures are read from
    `input_file` itself with RDKit. Entries the supplier yields as None are
    dropped.

    Args:
      input_file: Path to the .sdf file; "<input_file>.csv" is read as well.

    Returns:
      DataFrame with columns "mol_id", "smiles" and "mol" followed by the
      columns of the task table, one row per readable molecule, with a
      contiguous 0..k-1 index.
    """
    # Tasks are stored in .sdf.csv file
    # NOTE(review): assumes the task table carries the default positional
    # index (row i describes molecule i of the SDF file) -- confirm against
    # load_pandas_from_disk.
    raw_df = load_pandas_from_disk(input_file+".csv")
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
    df_rows = [[ind, Chem.MolToSmiles(mol), mol]
               for ind, mol in enumerate(suppl) if mol is not None]
    # Index every molecule by its position in the SDF file. The concat below
    # aligns on index, so with a default 0..k-1 index a single unreadable
    # molecule would shift all later molecules onto the wrong task rows.
    positions = pd.Index([row[0] for row in df_rows], dtype="int64")
    mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'),
                          index=positions)
    raw_df = pd.concat([mol_df, raw_df], axis=1, join='inner')
    # Restore the contiguous index the result has when every molecule parses.
    return raw_df.reset_index(drop=True)

  def _load_csv_file(self, input_file):
    """Read `input_file` into a dataframe via `load_pandas_from_disk`."""
    return load_pandas_from_disk(input_file)

  def featurize(self, input_file, data_dir, shard_size=8192, worker_pool=None,
                reload=False):
    """Featurize provided file and write to specified location."""
    # If we are not to reload data, or data has not already been featurized.
    input_type = _get_input_type(input_file)

    if not reload:
      log("Loading raw samples now.", self.verbosity)

      if input_type == "sdf":
        raw_df = self._load_sdf_file(input_file)
      else:
        raw_df = self._load_csv_file(input_file)

      raw_df = load_data(input_file)
      fields = raw_df.keys()
      log("Loaded raw data frame from file.", self.verbosity)
      log("About to preprocess samples.", self.verbosity)

      def process_raw_sample_helper(row, fields, input_type):
        return self._process_raw_sample(input_type, row, fields)
      input_type = _get_input_type(input_file)
      process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                  fields=fields,
                                                  input_type=input_type)
@@ -180,7 +188,9 @@ class DataFeaturizer(object):
        for featurizer in self.featurizers:
          log("Currently featurizing feature_type: %s"
              % featurizer.__class__.__name__, self.verbosity)
          if isinstance(featurizer, Featurizer):
          if isinstance(featurizer, UserDefinedFeaturizer):
            self._add_user_specified_features(df, featurizer)
          elif isinstance(featurizer, Featurizer):
            self._featurize_mol(df, featurizer, field=field,
                                worker_pool=worker_pool)
          elif isinstance(featurizer, ComplexFeaturizer):
@@ -224,7 +234,12 @@ class DataFeaturizer(object):
    return data

  def _standardize_df(self, ori_df):
    """Copy specified columns to new df with standard column names."""
    """Copy specified columns to new df with standard column names.

    TODO(rbharath): I think this function is now unnecessary (since the
                    dataframes are only temporary and not on disk). Should
                    be able to remove this function.
    """
    df = pd.DataFrame(ori_df[[self.id_field]])
    df.columns = ["mol_id"]
    df["smiles"] = ori_df[[self.smiles_field]]
@@ -235,15 +250,12 @@ class DataFeaturizer(object):
        df[feature] = ori_df[[feature]]
    if self.mol_field is not None:
      df["mol"] = ori_df[[self.mol_field]]
    if self.split_field is not None:
      df["split"] = ori_df[[self.split_field]]
    if self.protein_pdb_field is not None:
      df["protein_pdb"] = ori_df[[self.protein_pdb_field]]
    if self.ligand_pdb_field is not None:
      df["ligand_pdb"] = ori_df[[self.ligand_pdb_field]]
    if self.ligand_mol2_field is not None:
      df["ligand_mol2"] = ori_df[[self.ligand_mol2_field]]
    self._add_user_specified_features(df, ori_df)
    return df

  def _featurize_complexes(self, df, featurizer, parallel=True,
@@ -300,9 +312,9 @@ class DataFeaturizer(object):
        if ind % self.log_every_n == 0:
          log("Featurizing sample %d" % ind, self.verbosity)
        ###################################### DEBUG
        print("DataFeaturizer._featurize_mol")
        print("mol, self.verbosity")
        print(mol, self.verbosity)
        #print("DataFeaturizer._featurize_mol")
        #print("mol, self.verbosity")
        #print(mol, self.verbosity)
        ###################################### DEBUG
        features.append(featurizer.featurize([mol], verbosity=self.verbosity))
    else:
@@ -321,14 +333,12 @@ class DataFeaturizer(object):

    df[featurizer.__class__.__name__] = features

  def _add_user_specified_features(self, df, ori_df):
  def _add_user_specified_features(self, df, featurizer):
    """Merge user specified features. 

      Merge features included in dataset provided by user
      into final features dataframe

      TODO(rbharath): Needs to be handled consistently with other featurizations.

      Three types of featurization here:

        1) Molecule featurization
@@ -337,15 +347,13 @@ class DataFeaturizer(object):
        2) Complex featurization
          -) PDB files for interacting molecules.
        3) User specified featurizations.
           TODO(rbharath): These should not be passed to Dataset!
    """
    if self.user_specified_features is not None:
    log("Aggregating User-Specified Features", self.verbosity)
    features_data = []
      for ind, row in ori_df.iterrows():
    for ind, row in df.iterrows():
      # pandas rows are tuples (row_num, row_data)
      feature_list = []
        for feature_name in self.user_specified_features:
      for feature_name in featurizer.feature_fields:
        feature_list.append(row[feature_name])
      features_data.append(np.array(feature_list))
      df["user-specified-features"] = features_data
    df[featurizer.__class__.__name__] = features_data
+110 −45
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ import os
import unittest
import tempfile
import shutil
from deepchem.featurizers import UserDefinedFeaturizer 
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.basic import RDKitDescriptors
@@ -30,6 +31,8 @@ from deepchem.metrics import Metric
from sklearn.ensemble import RandomForestRegressor
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import SpecifiedSplitter

class TestModelAPI(TestAPI):
  """
@@ -74,21 +77,30 @@ class TestModelAPI(TestAPI):
  def test_singletask_sklearn_rf_user_specified_regression_API(self):
    """Test of singletask RF USF regression API."""
    splittype = "specified"
    split_field = "split"
    featurizers = []
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    featurizers = [UserDefinedFeaturizer(["user-specified1", "user-specified2"])]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "user_specified_example.csv"
    user_specified_features = ["user-specified1", "user-specified2"]
    train_dataset, test_dataset, _, transformers, = self._featurize_train_test_split(
        splittype, featurizers, 
        input_transformers, output_transformers, input_file, tasks,
        user_specified_features=user_specified_features,
        split_field=split_field)
    input_file = os.path.join(self.current_dir, "user_specified_example.csv")
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = SpecifiedSplitter(input_file, "split")
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -114,16 +126,32 @@ class TestModelAPI(TestAPI):
    """Test of singletask RF ECFP regression API: sharded edition."""
    splittype = "scaffold"
    featurizers = [CircularFingerprint(size=1024)]
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    model_params = {}
    tasks = ["label"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "../../../datasets/pdbbind_core_df.pkl.gz"
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, featurizers, input_transformers, output_transformers,
        input_file, tasks, shard_size=50)
    input_file = os.path.join(
        self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")

    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    #train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
    #    splittype, featurizers, input_transformers, output_transformers,
    #    input_file, tasks, shard_size=50)
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    # We set shard size above to force the creation of multiple shards of the data.
    # pdbbind_core has ~200 examples.
    model_params["data_shape"] = train_dataset.get_data_shape()
@@ -151,16 +179,34 @@ class TestModelAPI(TestAPI):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizers = [RDKitDescriptors()]
    input_transformers = [NormalizationTransformer, ClippingTransformer]
    output_transformers = [NormalizationTransformer]
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    model_params = {}
    input_file = "example.csv"
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, featurizers, input_transformers, output_transformers,
        input_file, tasks)
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    #train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
    #    splittype, featurizers, input_transformers, output_transformers,
    #    input_file, tasks)
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -185,12 +231,7 @@ class TestModelAPI(TestAPI):
  def test_singletask_keras_mlp_USF_regression_API(self):
    """Test of singletask MLP User Specified Features regression API."""
    from deepchem.models.keras_models.fcnet import SingleTaskDNN
    splittype = "scaffold"
    featurizers = []
    input_transformers = [NormalizationTransformer, ClippingTransformer]
    output_transformers = [NormalizationTransformer]
    feature_types = ["user_specified_features"]
    user_specified_features = ["evals"]
    featurizers = [UserDefinedFeaturizer(["evals"])]
    tasks = ["u0"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
@@ -201,16 +242,28 @@ class TestModelAPI(TestAPI):
                    "nb_epoch": 2, "init": "glorot_uniform",
                    "nb_layers": 1, "batchnorm": False}

    input_file = "gbd3k.pkl.gz"
    protein_pdb_field = None
    ligand_pdb_field = None
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, featurizers,
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        user_specified_features=user_specified_features)
    input_file = os.path.join(self.current_dir, "gbd3k.pkl.gz")
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -290,14 +343,26 @@ class TestModelAPI(TestAPI):
    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    input_file = "example_classification.csv"
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params = {
      "batch_size": 2,
+14 −6
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ from rdkit import Chem
from deepchem.utils import ScaffoldGenerator
from deepchem.utils.save import log
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import load_data

def generate_scaffold(smiles, include_chirality=False):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
@@ -65,8 +66,7 @@ class Splitter(object):
    valid_dir = None
    train_samples, _, test_samples = self.train_valid_test_split(
        samples, train_dir, valid_dir, test_dir,
        frac_train=frac_train, frac_test=1-frac_train, frac_valid=0.,
        reload=False)
        frac_train=frac_train, frac_test=1-frac_train, frac_valid=0.)
    return train_samples, test_samples

  def split(self, samples, frac_train=None, frac_valid=None, frac_test=None,
@@ -165,18 +165,26 @@ class SpecifiedSplitter(Splitter):
  """
  Class that splits data according to user specification.
  """

  def __init__(self, input_file, split_field, verbosity=None):
    """Read the per-row split assignments from `input_file`.

    The file is loaded with `load_data`, and the values of its `split_field`
    column are kept on `self.splits`.
    """
    self.verbosity = verbosity
    self.splits = load_data(input_file)[split_field].values

  def split(self, dataset, frac_train=.8, frac_valid=.1, frac_test=.1,
            log_every_n=1000):
    """
    Splits internal compounds into train/validation/test by user-specification.
    """
    train_inds, valid_inds, test_inds = [], [], []
    for ind, row in samples.compounds_df.iterrows():
      if row["split"].lower() == "train":
    for ind, split in enumerate(self.splits):
      split = split.lower()
      if split == "train":
        train_inds.append(ind)
      elif row["split"].lower() in ["valid", "validation"]:
      elif split in ["valid", "validation"]:
        valid_inds.append(ind)
      elif row["split"].lower() == "test":
      elif split == "test":
        test_inds.append(ind)
      else:
        raise ValueError("Missing required split information.")