Commit 6ab9fefc authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge pull request #196 from rbharath/grabbag

Grabbag of small changes. See description below.
parents 6c2aa460 c452aa55
Loading
Loading
Loading
Loading
+17 −5
Original line number Diff line number Diff line
@@ -221,11 +221,10 @@ class Dataset(object):
  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    ######################################################### DEBUG
    if n_samples > 0:
      y = np.reshape(y, (n_samples, -1))
    ######################################################### DEBUG
    #y = np.reshape(y, (n_samples, -1))
      if w is not None:
        w = np.reshape(w, (n_samples, -1))
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
@@ -236,13 +235,26 @@ class Dataset(object):
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)

  @staticmethod
  def merge(merge_dir, datasets):
    """Merges provided datasets into a merged dataset.

    Every dataset is converted to in-memory arrays with `to_numpy()`. The
    feature, label and weight arrays are stacked row-wise with `np.vstack`,
    the ids are joined with `np.concatenate`, and the result is handed to
    `Dataset.from_numpy` together with `merge_dir`.

    Parameters
    ----------
    merge_dir:
      Passed to `Dataset.from_numpy` as the directory of the merged dataset.
    datasets: iterable
      Datasets to merge. Each must provide `to_numpy()` and
      `get_task_names()`. Must contain at least one dataset.

    Returns
    -------
    The dataset returned by `Dataset.from_numpy`.

    Raises
    ------
    ValueError
      If `datasets` is empty.
    """
    # Materialize once so that generators are accepted and emptiness can be
    # detected up front instead of failing later on an unbound loop variable.
    datasets = list(datasets)
    if not datasets:
      raise ValueError("Cannot merge an empty collection of datasets.")

    Xs, ys, ws, all_ids = [], [], [], []
    for dataset in datasets:
      X, y, w, ids = dataset.to_numpy()
      Xs.append(X)
      ys.append(y)
      ws.append(w)
      all_ids.append(ids)
    # Task names are taken from the last dataset.
    # NOTE(review): assumes every dataset shares the same tasks -- confirm.
    tasks = datasets[-1].get_task_names()
    X, y, w, ids = (
        np.vstack(Xs), np.vstack(ys), np.vstack(ws), np.concatenate(all_ids))
    return Dataset.from_numpy(merge_dir, X, y, w, ids, tasks)

  def select(self, select_dir, indices):
    """Builds a new dataset holding only the rows of self at `indices`.

    The dataset is loaded with `to_numpy()`, each of the returned arrays is
    indexed with `indices` (cast to an integer array), and the selection is
    written out through `Dataset.from_numpy` using `select_dir` and this
    dataset's task names.
    """
    row_idx = np.array(indices).astype(int)
    X, y, w, ids = self.to_numpy()
    task_names = self.get_task_names()
    return Dataset.from_numpy(
        select_dir, X[row_idx], y[row_idx], w[row_idx], ids[row_idx],
        task_names)
+126 −0
Original line number Diff line number Diff line
"""
PDBBind dataset loader.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import pandas as pd
import shutil
from rdkit import Chem
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
from deepchem.featurizers.grid_featurizer import GridFeaturizer
#from deepchem.featurizers.nnscore_utils import hydrogenate_and_compute_partial_charges

def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe.

  Each non-comment, non-blank line of `labels_file` is split on whitespace
  into the eight columns listed below. Lines starting with "#" (after
  stripping surrounding whitespace) and empty lines are skipped. Any
  whitespace-separated text beyond the eighth field is kept as part of the
  last column ("ligand name") rather than producing extra fields.

  Parameters
  ----------
  labels_file: str
    Path to the index file to read.

  Returns
  -------
  pandas.DataFrame
    One row per data line; all values are left as strings.
  """
  columns = ["PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
             "ignore-this-field", "reference", "ligand name"]
  contents = []
  with open(labels_file) as f:
    for line in f:
      line = line.strip()
      # Blank lines would otherwise become empty rows in the dataframe.
      if not line or line.startswith("#"):
        continue
      # Cap the number of splits so the final column absorbs trailing text.
      contents.append(line.split(None, len(columns) - 1))
  return pd.DataFrame(contents, columns=columns)

def compute_pdbbind_feature(compound_featurizers, complex_featurizers,
                            pdb_subdir, pdb_code):
  """Compute features for a given complex.

  The protein and ligand are expected at `<pdb_subdir>/<pdb_code>_protein.pdb`
  and `<pdb_subdir>/<pdb_code>_ligand.sdf`. Complex featurizers are called
  first (with the two file paths), then compound featurizers (with the first
  molecule read from the ligand SDF); all results are concatenated in that
  order.

  Parameters
  ----------
  compound_featurizers: list
    Objects exposing `featurize(list_of_mols)`.
  complex_featurizers: list
    Objects exposing `featurize_complexes(ligand_files, protein_files)`.
  pdb_subdir: str
    Directory holding the files for this complex.
  pdb_code: str
    Code used to build the two file names.

  Returns
  -------
  np.ndarray
    Concatenation of all computed features, or an empty array when no
    featurizer was supplied.
  """
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

  all_features = []
  for complex_featurizer in complex_featurizers:
    all_features.append(complex_featurizer.featurize_complexes(
        [ligand_file], [protein_file]))

  if compound_featurizers:
    # Only parse the ligand when a compound featurizer actually needs it.
    # The builtin next() works on both Python 2 and Python 3 iterators.
    # NOTE(review): the supplier may yield None for an unparsable record;
    # that value is passed through to the featurizers unchanged.
    rdkit_mol = next(Chem.SDMolSupplier(str(ligand_file)))
    for compound_featurizer in compound_featurizers:
      all_features.append(
          np.squeeze(compound_featurizer.featurize([rdkit_mol])))

  # np.concatenate rejects an empty sequence; return an empty vector so the
  # caller can detect "no features" via len().
  if not all_features:
    return np.array([])
  return np.concatenate(all_features)
    
def load_pdbbind(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split

  Labels are read from `<pdbbind_dir>/INDEX_core_data.2013`. For every PDB
  code listed there, the complex under `<pdbbind_dir>/website-core-set/<code>`
  is featurized with a grid featurizer and a circular fingerprint, and the
  results are stored through `Dataset.from_numpy` in `<base_dir>/dataset`.

  Parameters
  ----------
  pdbbind_dir: str
    Directory containing the index file and the `website-core-set` folder.
  base_dir: str
    Directory holding the results; created if missing.
  reload: bool
    When False, an existing `base_dir` is deleted before anything is written.

  Returns
  -------
  tuple
    `(tasks, dataset, transformers)` where `tasks` is `["-logKd/Ki"]` and
    `transformers` is an empty list.
  """
  # The base_dir holds the results of all analysis. It is wiped only when the
  # caller explicitly passes reload=False.
  if not reload and os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  # Directory to store the featurized dataset.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  # Labels are parsed as strings, so convert them explicitly.
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      "salt_bridge"], ecfp_power=9, splif_power=9,
      parallel=True, flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  for pdb_code in ids:
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if len(computed_feature) == 0:
      # NOTE(review): the fallback width is hard-coded to 1024; np.vstack
      # below needs it to match the width of the other rows -- confirm.
      computed_feature = np.zeros(1024)
    features.append(computed_feature)
  X = np.vstack(features)
  w = np.ones_like(y)

  print("About to call Dataset.from_numpy()")
  print("X.shape, y.shape, w.shape, ids.shape")
  print(X.shape, y.shape, w.shape, ids.shape)

  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
+51 −0
Original line number Diff line number Diff line
"""
Testing singletask/multitask dataset merging
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import os
import shutil
import tempfile
import numpy as np
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.datasets import Dataset

class TestMerge(TestAPI):
  """
  Test singletask/multitask dataset merging.
  """
  def test_move_load(self):
    """Test that merging two datasets yields their combined length."""
    base_dir = self.base_dir
    here = os.path.dirname(os.path.realpath(__file__))
    csv_path = os.path.join(here, "../../models/tests/example.csv")

    loader = DataFeaturizer(tasks=["log-solubility"],
                            smiles_field="smiles",
                            featurizers=[CircularFingerprint(size=1024)],
                            verbosity="high")

    # Featurize the same input file twice, into two separate directories.
    first = loader.featurize(
        csv_path, os.path.join(base_dir, "first_dataset"))
    second = loader.featurize(
        csv_path, os.path.join(base_dir, "second_dataset"))

    merged = Dataset.merge(
        os.path.join(base_dir, "merged_data"), [first, second])

    assert len(merged) == len(first) + len(second)
+66 −81
Original line number Diff line number Diff line
@@ -16,8 +16,7 @@ from functools import partial
from rdkit import Chem
from deepchem.utils.save import log
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_pandas_from_disk
from deepchem.utils.save import load_pickle_from_disk
from deepchem.featurizers import Featurizer, ComplexFeaturizer
from deepchem.featurizers import UserDefinedFeaturizer
from deepchem.datasets import Dataset
@@ -38,19 +37,25 @@ def _process_field(val):
  else:
    raise ValueError("Field of unrecognized type: %s" % str(val))

def load_data(input_file):
  """Loads data from disk."""
def load_data(input_file, shard_size=None):
  """Loads data from disk.
     
  For CSV files, supports sharded loading for large files.
  """
  input_type = _get_input_type(input_file)
  if input_type == "sdf":
    raw_df = _load_sdf_file(input_file)
  else:
    raw_df = _load_csv_file(input_file)
  return raw_df
    if shard_size is not None:
      raise ValueError("shard_size must be None for sdf input.")
    return _load_sdf_file(input_file)
  elif input_type == "csv":
    return _load_csv_file(input_file, shard_size)
  elif input_type == "pandas-pickle":
    return [load_pickle_from_disk(input_file)]

def _load_sdf_file(input_file):
  """Load SDF file into dataframe."""
  # Tasks are stored in .sdf.csv file
  raw_df = load_pandas_from_disk(input_file+".csv")
  raw_df = _load_csv_file(input_file+".csv", shard_size=None).next()
  # Structures are stored in .sdf file
  print("Reading structures from %s." % input_file)
  suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
@@ -61,12 +66,17 @@ def _load_sdf_file(input_file):
      df_rows.append([ind,smiles,mol])
  mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'))
  raw_df = pd.concat([mol_df, raw_df], axis=1, join='inner')
  return raw_df
  return [raw_df]

def _load_csv_file(input_file):
  """Loads CSV file into dataframe."""
  raw_df = load_pandas_from_disk(input_file)
  return raw_df
def _load_csv_file(filename, shard_size=None):
  """Load data as pandas dataframe."""
  # First line of user-specified CSV *must* be header.
  if shard_size is None:
    yield pd.read_csv(filename)
  else:
    for df in pd.read_csv(filename, chunksize=shard_size):
      df = df.replace(np.nan, str(""), regex=True)
      yield df

def _get_input_type(input_file):
  """Get type of input file. Must be csv/pkl.gz/sdf file."""
@@ -85,28 +95,6 @@ def _get_input_type(input_file):
  else:
    raise ValueError("Unrecognized extension %s" % file_extension)

def _get_fields(input_file):
  """Get the names of fields and field_types for input data.

  Dispatches on the type reported by `_get_input_type(input_file)`:
    - "csv": returns the first row produced by `csv.reader`.
    - "pandas-joblib": loads the object with `load_from_disk` and returns
      its `keys()`.
    - "pandas-pickle": loads the object with `load_pickle_from_disk` and
      returns its `keys()`.
    - "sdf": returns the first line of `input_file + ".csv"` as read by
      `readline()`.

  Raises
  ------
  ValueError
    If the input type is none of the above.
  """
  # If CSV input, assume that first row contains labels
  input_type = _get_input_type(input_file)
  if input_type == "csv":
    with open(input_file, "rb") as inp_file_obj:
      # NOTE(review): `.next()` is the Python 2 iterator method.
      return csv.reader(inp_file_obj).next()
  elif input_type == "pandas-joblib":
    df = load_from_disk(input_file)
    return df.keys()
  elif input_type == "pandas-pickle":
    df = load_pickle_from_disk(input_file)
    return df.keys()
  # If SDF input, assume that .sdf.csv file contains labels 
  elif input_type == "sdf":
    label_file = input_file + ".csv"
    print("Reading labels from %s" % label_file)
    with open(label_file, "rb") as inp_file_obj:
      # NOTE(review): this returns the raw, unsplit header line, unlike the
      # csv branch which returns a parsed row -- confirm callers expect that.
      return inp_file_obj.readline()
  else:
    raise ValueError("Unrecognized extension for %s" % input_file)

class DataFeaturizer(object):
  """
  Handles loading/featurizing of chemical samples (datapoints).
@@ -115,7 +103,7 @@ class DataFeaturizer(object):
  dataframe object to disk as output.
  """

  def __init__(self, tasks, smiles_field,
  def __init__(self, tasks, smiles_field=None,
               id_field=None, threshold=None,
               protein_pdb_field=None, ligand_pdb_field=None,
               ligand_mol2_field=None, mol_field=None,
@@ -148,61 +136,57 @@ class DataFeaturizer(object):
    """Featurize provided file and write to specified location."""
    log("Loading raw samples now.", self.verbosity)

    raw_df = load_data(input_file)
    fields = raw_df.keys()
    log("Loaded raw data frame from file.", self.verbosity)
    log("About to preprocess samples.", self.verbosity)

    if not os.path.exists(data_dir):
      os.makedirs(data_dir)

    def process_raw_sample_helper(row, fields, input_type):
      return self._process_raw_sample(input_type, row, fields)
    # Construct partial function to write datasets.
    write_fn = partial(
        Dataset.write_dataframe, data_dir=data_dir,
        featurizers=self.featurizers, tasks=self.tasks)
    input_type = _get_input_type(input_file)
    process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                fields=fields,
                                                input_type=input_type)

    metadata_rows = []
    for shard_num, raw_df_shard in enumerate(load_data(input_file, shard_size)):
      log("Loaded shard %d of size %s from file." % (shard_num+1, str(shard_size)),
          self.verbosity)
      log("About to featurize shard.", self.verbosity)

    nb_sample = raw_df.shape[0]
    interval_points = np.linspace(
        0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
      def process_helper(row, fields, input_type):
        return self._process_raw_sample(input_type, row, fields)
      process_fn = partial(process_helper, fields=raw_df_shard.keys(),
                           input_type=input_type)

    metadata_rows = []
    # Construct partial function to write datasets.
    write_dataframe_partial = partial(
        Dataset.write_dataframe, data_dir=data_dir,
        featurizers=self.featurizers, tasks=self.tasks)
      metadata_rows.append(self._featurize_shard(
          raw_df_shard, process_fn, write_fn, shard_num, input_type))

    for j in range(len(interval_points)-1):
      log("Sharding and standardizing into shard-%s / %s shards"
          % (str(j+1), len(interval_points)-1), self.verbosity)
      raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
      raw_df_shard = raw_df_shard.apply(
          process_raw_sample_helper_partial, axis=1, reduce=False)
    # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
    # creating a Dataset. Is there a more elegant solutions?
    dataset = Dataset(data_dir=data_dir,
                      metadata_rows=metadata_rows,
                      reload=reload, verbosity=self.verbosity)
    return dataset 

      df = self._standardize_df(raw_df_shard) 
  def _featurize_shard(self, raw_df_shard, process_fn, write_fn, shard_num, input_type):
    """Featurizes a shard of an input dataframe."""
    log("Applying processing transformation to shard.",
        self.verbosity)
    raw_df_shard = raw_df_shard.apply(
        process_fn, axis=1, reduce=False)
    log("About to standardize dataframe.")
    df_shard = self._standardize_df(raw_df_shard) 
  
    field = "mol" if input_type == "sdf" else "smiles"
    for featurizer in self.featurizers:
      log("Currently featurizing feature_type: %s"
          % featurizer.__class__.__name__, self.verbosity)
      if isinstance(featurizer, UserDefinedFeaturizer):
          self._add_user_specified_features(df, featurizer)
        self._add_user_specified_features(df_shard, featurizer)
      elif isinstance(featurizer, Featurizer):
          self._featurize_mol(df, featurizer, field=field,
                              worker_pool=worker_pool)
        self._featurize_mol(df_shard, featurizer, field=field)
      elif isinstance(featurizer, ComplexFeaturizer):
          self._featurize_complexes(df, featurizer,
                                    worker_pool=worker_pool)
      basename = "shard-%d" % j
      metadata_rows.append(write_dataframe_partial((basename, df)))

    dataset = Dataset(data_dir=data_dir,
                      metadata_rows=metadata_rows,
                      reload=reload, verbosity=self.verbosity)

    return dataset 
        self._featurize_complexes(df_shard, featurizer)
    basename = "shard-%d" % shard_num 
    return write_fn((basename, df_shard))

  def _shard_files_exist(self, feature_dir):
    """Checks if data shard files already exist."""
@@ -239,6 +223,7 @@ class DataFeaturizer(object):
    """
    df = pd.DataFrame(ori_df[[self.id_field]])
    df.columns = ["mol_id"]
    if self.smiles_field is not None:
      df["smiles"] = ori_df[[self.smiles_field]]
    for task in self.tasks:
      df[task] = ori_df[[task]]
+92 −79

File changed.

Preview size limit exceeded, changes collapsed.

Loading