Commit 8b6bcd56 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Loading functions

parent b89b0137
Loading
Loading
Loading
Loading
+26 −12
Original line number Diff line number Diff line
@@ -8,10 +8,13 @@ __author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
from deepchem.feat import Featurizer
from deepchem.feat import ComplexFeaturizer
from deepchem.utils import rdkit_util, pad_array
from deepchem.utils.rdkit_util import MoleculeLoadException


class AtomicCoordinates(Featurizer):
@@ -223,8 +226,14 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
        self.max_num_neighbors, self.neighbor_cutoff)

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    try:
      frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
      frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
    except MoleculeLoadException:
      # Currently handles loading failures by returning None
      # TODO: Is there a better handling procedure?
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None
    system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
    system_coords = rdkit_util.get_xyz_from_mol(system_mol)

@@ -232,6 +241,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
    frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
    system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

    try:
      frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
          frag1_coords, frag1_mol, self.frag1_num_atoms)

@@ -240,19 +250,23 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):

      system_coords, system_neighbor_list, system_z = self.featurize_mol(
          system_coords, system_mol, self.complex_num_atoms)
    except ValueError as e:
      logging.warning(
          "max_atoms was set too low. Some complexes too large and skipped")
      return None

    return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
           system_coords, system_neighbor_list, system_z

  def get_Z_matrix(self, mol, max_atoms):
    ######################################### DEBUG
    print("len(mol.GetAtoms())")
    print(len(mol.GetAtoms()))
    ######################################### DEBUG
    if len(mol.GetAtoms()) > max_atoms:
      raise ValueError("A molecule is larger than permitted by max_atoms. "
                       "Increase max_atoms and try again.")
    return pad_array(
        np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

  def featurize_mol(self, coords, mol, max_num_atoms):
    logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
    neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
                                          self.max_num_neighbors, None)
    z = self.get_Z_matrix(mol, max_num_atoms)
+29 −58
Original line number Diff line number Diff line
@@ -5,25 +5,12 @@ import types
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdGeometry, rdMolTransforms
from deepchem.utils.save import log
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"

def get_ligand_filetype(ligand_filename):
  """Returns the filetype of ligand."""
  if ".mol2" in ligand_filename:
    return "mol2"
  elif ".sdf" in ligand_filename:
    return "sdf"
  elif ".pdbqt" in ligand_filename:
    return "pdbqt"
  elif ".pdb" in ligand_filename:
    return "pdb"
  else:
    raise ValueError("Unrecognized_filename")

def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)
@@ -34,60 +21,43 @@ class ComplexFeaturizer(object):
  Abstract class for calculating features for mol/protein complexes.
  """

  def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True,
                          log_every_n=1000):
  def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
    """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mol_pdbs: list
      List of PDBs for molecules. Each PDB should be a list of lines of the
      PDB file.
    mols: list
      List of PDB filenames for molecules.
    protein_pdbs: list
      List of PDBs for proteins. Each PDB should be a list of lines of the
      PDB file.
      List of PDB filenames for proteins.

    Returns
    -------
    features: np.array
      Array of features
    failures: list
      Indices of complexes that failed to featurize.
    """
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (
          i, len(mol_files)) if i % log_every_n == 0 else None
      results.append(
          pool.apply_async(_featurize_complex, (self, mol_file, protein_pdb)))
    pool.close()
    features = []
    for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
      if verbose and i % log_every_n == 0:
        log("Featurizing %d / %d" % (i, len(mol_pdbs)))
      features.append(self._featurize_complex(mol_pdb, protein_pdb))
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)
    features = np.asarray(features)
    return features

  #def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
  #  """
  #  Calculate features for mol/protein complexes.

  #  Parameters
  #  ----------
  #  mols: list
  #    List of PDB filenames for molecules.
  #  protein_pdbs: list
  #    List of PDB filenames for proteins.
  #  """
  #  pool = multiprocessing.Pool()
  #  results = []
  #  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
  #    log_message = "Featurizing %d / %d" % (
  #        i, len(mol_files)) if i % log_every_n == 0 else None
  #    #ligand_ext = get_ligand_filetype(mol_file)
  #    #with open(mol_file) as mol_f:
  #    #  mol_lines = mol_f.readlines()
  #    #with open(protein_pdb) as protein_file:
  #    #  protein_pdb_lines = protein_file.readlines()
  #    results.append(
  #        pool.apply_async(
  #            _featurize_complex,
  #            (self, mol_file, protein_pdb)))
  #            #(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
  #  pool.close()
  #  features = []
  #  for result in results:
  #    features += result.get()
  #  features = np.asarray(features)
  #  return features
    return features, failures

  def _featurize_complex(self, mol_pdb, complex_pdb):
    """
@@ -102,6 +72,7 @@ class ComplexFeaturizer(object):
    """
    raise NotImplementedError('Featurizer is not defined.')


class Featurizer(object):
  """
  Abstract class for calculating a set of features for a molecule.
+41 −25
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Evan Feinberg, and Karl Leswing"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import os
import shutil
from warnings import warn
@@ -16,6 +17,7 @@ from collections import Counter
from rdkit import Chem
from rdkit.Chem import AllChem
from deepchem.utils.rdkit_util import load_molecule
from deepchem.utils.rdkit_util import MoleculeLoadException

import numpy as np
from scipy.spatial.distance import cdist
@@ -1256,7 +1258,11 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

    features_dict = self._transform(protein_pdb_file, ligand_file)
    shutil.rmtree(tempdir)
    # Handle case the transform failed
    if features_dict is not None:
      return list(features_dict.values())
    else:
      return None

  def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
    """
@@ -1285,10 +1291,16 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
              (self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
    pool.close()
    features = []
    for result in results:
      features += result.get()
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)
    features = np.asarray(features)
    return features
    return features, failures

  def _transform(self, protein_pdb, ligand_file):
    """Computes featurization of protein/ligand complex.
@@ -1303,6 +1315,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

    This function then computes a featurization with scheme specified by the user.
    """
    try:
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
@@ -1324,6 +1337,9 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
          self.verbose)
      ############################################################## TIMING
    except MoleculeLoadException:
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None

    ############################################################## TIMING
    time1 = time.time()
+11 −2
Original line number Diff line number Diff line
@@ -206,6 +206,7 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
      # The base-10 logarithm, -log kd/pk
      log_label = line[3]
      labels.append(log_label)
  labels = np.array(labels)
  # Featurize Data
  if featurizer == "grid":
    # TODO: This is not the correct setting. Set hyperparameters correctly
@@ -220,7 +221,7 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  elif featurizer == "atomic":
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.
    frag1_num_atoms = 60  # for ligand atoms
    frag1_num_atoms = 70  # for ligand atoms
    frag2_num_atoms = 24000  # for protein atoms
    complex_num_atoms = 24060  # in total
    max_num_neighbors = 4
@@ -233,8 +234,16 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  else:
    raise ValueError("Featurizer not supported")
  print("Featurizing Complexes")
  features = featurizer.featurize_complexes(
  features, failures = featurizer.featurize_complexes(
      ligand_files, protein_files, log_every_n=1)
  # Delete labels for failing elements
  print("np.shape(features)")
  print(np.shape(features))
  print("failures")
  print(failures)
  print("np.shape(labels)")
  print(np.shape(labels))
  labels = np.delete(labels, failures)
  dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
  # No transformations of data
  transformers = []