Commit b9b21c60 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Some bugfixes

parent b68a9b85
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -229,6 +229,7 @@ class Dataset(object):
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
      y = np.reshape(y, (n_samples, -1))
      if w is not None:
        w = np.reshape(w, (n_samples, -1))
    n_tasks = y.shape[1]
    if ids is None:
+126 −0
Original line number Diff line number Diff line
"""
PDBBind dataset loader.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import pandas as pd
import shutil
from rdkit import Chem
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
from deepchem.featurizers.grid_featurizer import GridFeaturizer
#from deepchem.featurizers.nnscore_utils import hydrogenate_and_compute_partial_charges

def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe

  Parameters
  ----------
  labels_file: str
    Path to a whitespace-delimited PDBBind index file. Lines starting with
    "#" and blank lines are skipped.

  Returns
  -------
  pd.DataFrame
    One row per data line. All values are kept as the strings read from the
    file, under the columns "PDB code", "resolution", "release year",
    "-logKd/Ki", "Kd/Ki", "ignore-this-field", "reference", "ligand name".
  """
  columns = ("PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
             "ignore-this-field", "reference", "ligand name")
  contents = []
  with open(labels_file) as f:
    for line in f:
      line = line.strip()
      # Blank lines would otherwise become empty rows in the dataframe.
      if not line or line.startswith("#"):
        continue
      # Cap the split at the number of columns so that any whitespace in the
      # trailing ligand-name field (e.g. an appended note) stays in that
      # field instead of spilling into extra, unnamed columns.
      contents.append(line.split(None, len(columns) - 1))
  contents_df = pd.DataFrame(contents, columns=columns)
  return contents_df

def compute_pdbbind_feature(compound_featurizers, complex_featurizers,
                            pdb_subdir, pdb_code):
  """Compute features for a given complex

  Parameters
  ----------
  compound_featurizers: list
    Objects exposing `featurize(mols)`; each is called with a one-element
    list holding the first molecule read from the ligand SDF file.
  complex_featurizers: list
    Objects exposing `featurize_complexes(ligand_files, protein_files)`;
    each is called with one-element lists of file paths.
  pdb_subdir: str
    Directory holding "<pdb_code>_protein.pdb" and "<pdb_code>_ligand.sdf".
  pdb_code: str
    PDB identifier used to build the two file names.

  Returns
  -------
  np.ndarray
    Concatenation of all complex features followed by all (squeezed)
    compound features. An empty array is returned when no featurizer
    produced anything.
  """
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

  all_features = []
  for complex_featurizer in complex_featurizers:
    # NOTE(review): the result is appended as returned; this presumably is a
    # 1-D vector so that it concatenates with the squeezed compound features
    # below -- confirm against the featurizer implementation.
    features = complex_featurizer.featurize_complexes(
        [ligand_file], [protein_file])
    all_features.append(features)

  # Only parse the ligand when a compound featurizer actually needs it.
  if len(compound_featurizers) > 0:
    # The builtin next() works on both Python 2 and 3, unlike the
    # Python-2-only `.next()` method.
    rdkit_mol = next(Chem.SDMolSupplier(str(ligand_file)))
    for compound_featurizer in compound_featurizers:
      features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
      all_features.append(features)

  # np.concatenate raises on an empty sequence; hand back an empty vector so
  # callers can detect "no features" through its length.
  if not all_features:
    return np.array([])
  return np.concatenate(all_features)
    
def load_pdbbind(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split

  Parameters
  ----------
  pdbbind_dir: str
    Directory containing the "INDEX_core_data.2013" label file and a
    "website-core-set" directory with one sub-directory per PDB code.
  base_dir: str
    Working directory; created if missing. The dataset is written to
    "<base_dir>/dataset".
  reload: bool
    If False, an existing base_dir is deleted before it is recreated.

  Returns
  -------
  tuple
    (tasks, dataset, transformers), where tasks is ["-logKd/Ki"] and
    transformers is an empty list.
  """
  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload and os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  # Labels are read as strings, so convert them to floats here.
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      "salt_bridge"], ecfp_power=9, splif_power=9,
      parallel=True, flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  for pdb_code in ids:
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if len(computed_feature) == 0:
      # NOTE(review): 1024 matches the CircularFingerprint size above; with
      # the grid featurizer enabled the other rows are presumably longer, in
      # which case np.vstack below would fail -- confirm the intended width.
      computed_feature = np.zeros(1024)
    features.append(computed_feature)
  X = np.vstack(features)
  # Every sample gets unit weight.
  w = np.ones_like(y)

  print("About to call Dataset.from_numpy()")
  print("X.shape, y.shape, w.shape, ids.shape")
  print(X.shape, y.shape, w.shape, ids.shape)

  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
+29 −29
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ from functools import partial
from rdkit import Chem
from deepchem.utils.save import log
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_pickle_from_disk
from deepchem.featurizers import Featurizer, ComplexFeaturizer
from deepchem.featurizers import UserDefinedFeaturizer
from deepchem.datasets import Dataset
@@ -47,13 +47,15 @@ def load_data(input_file, shard_size=None):
    if shard_size is not None:
      raise ValueError("shard_size must be None for sdf input.")
    return _load_sdf_file(input_file)
  else:
  elif input_type == "csv":
    return _load_csv_file(input_file, shard_size)
  elif input_type == "pandas-pickle":
    return [load_pickle_from_disk(input_file)]

def _load_sdf_file(input_file):
  """Load SDF file into dataframe."""
  # Tasks are stored in .sdf.csv file
  raw_df = load_pandas_from_disk(input_file+".csv")
  raw_df = _load_csv_file(input_file+".csv")
  # Structures are stored in .sdf file
  print("Reading structures from %s." % input_file)
  suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
@@ -88,27 +90,27 @@ def _get_input_type(input_file):
  else:
    raise ValueError("Unrecognized extension %s" % file_extension)

def _get_fields(input_file):
  """Return the field names stored in ``input_file``.

  The input type is determined by ``_get_input_type``:

  - "csv": the first row, as parsed by ``csv.reader``.
  - "pandas-joblib" / "pandas-pickle": the keys of the loaded object.
  - "sdf": the raw first line of the companion "<input_file>.csv" file
    (returned unsplit).

  Raises ValueError for any other input type.
  """
  kind = _get_input_type(input_file)

  if kind == "csv":
    # The first CSV row is assumed to hold the labels.
    with open(input_file, "rb") as csv_handle:
      header_row = csv.reader(csv_handle).next()
    return header_row

  if kind == "pandas-joblib":
    return load_from_disk(input_file).keys()

  if kind == "pandas-pickle":
    return load_pickle_from_disk(input_file).keys()

  if kind == "sdf":
    # For SDF input the labels live in the accompanying .sdf.csv file.
    label_file = input_file + ".csv"
    print("Reading labels from %s" % label_file)
    with open(label_file, "rb") as label_handle:
      first_line = label_handle.readline()
    return first_line

  raise ValueError("Unrecognized extension for %s" % input_file)
#def _get_fields(input_file):
#  """Get the names of fields and field_types for input data."""
#  # If CSV input, assume that first row contains labels
#  input_type = _get_input_type(input_file)
#  if input_type == "csv":
#    with open(input_file, "rb") as inp_file_obj:
#      return csv.reader(inp_file_obj).next()
#  elif input_type == "pandas-joblib":
#    df = load_from_disk(input_file)
#    return df.keys()
#  elif input_type == "pandas-pickle":
#    df = load_pickle_from_disk(input_file)
#    return df.keys()
#  # If SDF input, assume that .sdf.csv file contains labels 
#  elif input_type == "sdf":
#    label_file = input_file + ".csv"
#    print("Reading labels from %s" % label_file)
#    with open(label_file, "rb") as inp_file_obj:
#      return inp_file_obj.readline()
#  else:
#    raise ValueError("Unrecognized extension for %s" % input_file)

class DataFeaturizer(object):
  """
@@ -162,7 +164,7 @@ class DataFeaturizer(object):

    metadata_rows = []
    for shard_num, raw_df_shard in enumerate(load_data(input_file, shard_size)):
      log("Loaded shard %d of size %d from file." % (shard_num+1, shard_size),
      log("Loaded shard %d of size %s from file." % (shard_num+1, str(shard_size)),
          self.verbosity)
      log("About to featurize shard.", self.verbosity)

@@ -197,11 +199,9 @@ class DataFeaturizer(object):
      if isinstance(featurizer, UserDefinedFeaturizer):
        self._add_user_specified_features(df_shard, featurizer)
      elif isinstance(featurizer, Featurizer):
        self._featurize_mol(df_shard, featurizer, field=field,
                            worker_pool=worker_pool)
        self._featurize_mol(df_shard, featurizer, field=field)
      elif isinstance(featurizer, ComplexFeaturizer):
        self._featurize_complexes(df_shard, featurizer,
                                  worker_pool=worker_pool)
        self._featurize_complexes(df_shard, featurizer)
    basename = "shard-%d" % shard_num 
    return write_fn((basename, df_shard))

+1 −1
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ class TestFeaturizedSamples(TestAPI):
                                featurizers=featurizers,
                                verbosity="low")

    dataset = featurizer.featurize(input_file, self.data_dir)
    dataset = featurizer.featurize(input_file, self.data_dir, shard_size=None)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()