Unverified Commit 40604945 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2027 from nd-02110114/add-flake8

add flake8 and improve type annotation in dc.hyper and dc.dock
parents 58c4ecb2 7f9b3cd5
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -33,10 +33,11 @@ install:
  - bash scripts/install_deepchem_conda.sh deepchem
  - conda activate deepchem
  - python setup.py install
  - pip install coveralls mypy yapf==0.22.0
  - pip install coveralls mypy flake8 yapf==0.22.0

script:
  - bash devtools/run_yapf.sh
  - bash devtools/run_flake8.sh
  - mypy -p deepchem
  - pytest -m "not slow" --cov=deepchem deepchem
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
@@ -47,6 +48,7 @@ script:
      find ./deepchem -name "*.py" ! -name '*load_dataset_template.py' | xargs python -m doctest -v;
    fi


after_success:
  - echo $TRAVIS_SECURE_ENV_VARS
  - coveralls
+1 −3
Original line number Diff line number Diff line
"""
Imports all submodules 
"""
# flake8: noqa
from deepchem.dock.pose_generation import PoseGenerator
from deepchem.dock.pose_generation import VinaPoseGenerator
from deepchem.dock.docking import Docker
+36 −28
Original line number Diff line number Diff line
"""
Computes putative binding pockets on protein.
"""
import os
import logging
import tempfile
import numpy as np
from subprocess import call
from deepchem.feat.fingerprints import CircularFingerprint
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils import rdkit_util
from deepchem.utils import coordinate_box_utils as box_utils
from typing import Any, List, Optional, Tuple

from deepchem.models import Model
from deepchem.utils.rdkit_util import load_molecule
from deepchem.utils.coordinate_box_utils \
  import CoordinateBox, get_face_boxes, merge_overlapping_boxes
from deepchem.utils.fragment_util import get_contact_atom_indices

logger = logging.getLogger(__name__)


def extract_active_site(protein_file, ligand_file, cutoff=4):
def extract_active_site(protein_file: str,
                        ligand_file: str,
                        cutoff: float = 4.0
                       ) -> Tuple[CoordinateBox, np.ndarray]:
  """Extracts a box for the active site.

  Parameters
@@ -24,18 +26,18 @@ def extract_active_site(protein_file, ligand_file, cutoff=4):
    Location of protein PDB
  ligand_file: str
    Location of ligand input file
  cutoff: int, optional
  cutoff: float, optional (default 4.0)
    The distance in angstroms from the protein pocket to
    consider for featurization.

  Returns
  -------
  Tuple[CoordinateBox, np.ndarray]
    A tuple of `(CoordinateBox, np.ndarray)` where the second entry is
    of shape `(N, 3)` with `N` the number of atoms in the active site.
  """
  protein = rdkit_util.load_molecule(protein_file, add_hydrogens=False)
  ligand = rdkit_util.load_molecule(
      ligand_file, add_hydrogens=True, calc_charges=True)
  protein = load_molecule(protein_file, add_hydrogens=False)
  ligand = load_molecule(ligand_file, add_hydrogens=True, calc_charges=True)
  protein_contacts, ligand_contacts = get_contact_atom_indices(
      [protein, ligand], cutoff=cutoff)
  protein_coords = protein[0]
@@ -47,7 +49,7 @@ def extract_active_site(protein_file, ligand_file, cutoff=4):
  y_max = int(np.ceil(np.amax(pocket_coords[:, 1])))
  z_min = int(np.floor(np.amin(pocket_coords[:, 2])))
  z_max = int(np.ceil(np.amax(pocket_coords[:, 2])))
  box = box_utils.CoordinateBox((x_min, x_max), (y_min, y_max), (z_min, z_max))
  box = CoordinateBox((x_min, x_max), (y_min, y_max), (z_min, z_max))
  return (box, pocket_coords)


@@ -66,7 +68,7 @@ class BindingPocketFinder(object):
  technique to be used.
  """

  def find_pockets(self, molecule):
  def find_pockets(self, molecule: Any):
    """Finds potential binding pockets in proteins.

    Parameters
@@ -83,32 +85,37 @@ class ConvexHullPocketFinder(BindingPocketFinder):
  Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4112621/pdf/1472-6807-14-18.pdf
  """

  def __init__(self, scoring_model=None, pad=5):
  def __init__(self, scoring_model: Optional[Model] = None, pad: float = 5.0):
    """Initialize the pocket finder.

    Parameters
    ----------
    scoring_model: `dc.models.Model`, optional
      If specified, use this model to prune pockets.
    pad: float, optional
    pad: float, optional (default 5.0)
      The number of angstroms to pad around a binding pocket's atoms
      to get a binding pocket box.
    """
    self.scoring_model = scoring_model
    self.pad = pad

  def find_all_pockets(self, protein_file):
  def find_all_pockets(self, protein_file: str) -> List[CoordinateBox]:
    """Find list of binding pockets on protein.

    Parameters
    ----------
    protein_file: str
      Protein to load in.

    Returns
    -------
    List[CoordinateBox]
      List of binding pockets on protein. Each pocket is a `CoordinateBox`
    """
    coords, _ = rdkit_util.load_molecule(protein_file)
    return box_utils.get_face_boxes(coords, self.pad)
    coords, _ = load_molecule(protein_file)
    return get_face_boxes(coords, self.pad)

  def find_pockets(self, macromolecule_file):
  def find_pockets(self, macromolecule_file: str) -> List[CoordinateBox]:
    """Find list of suitable binding pockets on protein.

    This function computes putative binding pockets on this protein.
@@ -123,10 +130,11 @@ class ConvexHullPocketFinder(BindingPocketFinder):

    Returns
    -------
    List[CoordinateBox]
      List of pockets. Each pocket is a `CoordinateBox`
    """
    coords = rdkit_util.load_molecule(
        macromolecule_file, add_hydrogens=False, calc_charges=False)[0]
    boxes = box_utils.get_face_boxes(coords, self.pad)
    boxes = box_utils.merge_overlapping_boxes(boxes)
    coords, _ = load_molecule(
        macromolecule_file, add_hydrogens=False, calc_charges=False)
    boxes = get_face_boxes(coords, self.pad)
    boxes = merge_overlapping_boxes(boxes)
    return boxes
+39 −21
Original line number Diff line number Diff line
@@ -2,11 +2,14 @@
Docks Molecular Complexes
"""
import logging
import numpy as np
import os
import tempfile
from subprocess import call
from typing import cast, Optional, Tuple
import numpy as np

from deepchem.models import Model
from deepchem.feat import ComplexFeaturizer
from deepchem.data import NumpyDataset
from deepchem.dock import PoseGenerator

logger = logging.getLogger(__name__)

@@ -25,16 +28,19 @@ class Docker(object):
  generation and scoring classes that are provided to this class.
  """

  def __init__(self, pose_generator, featurizer=None, scoring_model=None):
  def __init__(self,
               pose_generator: PoseGenerator,
               featurizer: Optional[ComplexFeaturizer] = None,
               scoring_model: Optional[Model] = None):
    """Builds model.

    Parameters
    ----------
    pose_generator: `PoseGenerator`
      The pose generator to use for this model
    featurizer: `ComplexFeaturizer`
    featurizer: `ComplexFeaturizer`, optional (default None)
      Featurizer associated with `scoring_model`
    scoring_model: `Model`
    scoring_model: `Model`, optional (default None)
      Should make predictions on molecular complex.
    """
    if ((featurizer is not None and scoring_model is None) or
@@ -47,14 +53,14 @@ class Docker(object):
    self.scoring_model = scoring_model

  def dock(self,
           molecular_complex,
           centroid=None,
           box_dims=None,
           exhaustiveness=10,
           num_modes=9,
           num_pockets=None,
           out_dir=None,
           use_pose_generator_scores=False):
           molecular_complex: Tuple[str, str],
           centroid: Optional[np.ndarray] = None,
           box_dims: Optional[np.ndarray] = None,
           exhaustiveness: int = 10,
           num_modes: int = 9,
           num_pockets: Optional[int] = None,
           out_dir: Optional[str] = None,
           use_pose_generator_scores: bool = False):
    """Generic docking function.

    This docking function uses this object's featurizer, pose
@@ -63,8 +69,14 @@ class Docker(object):

    Parameters
    ----------
    molecular_complex: Object
      Some representation of a molecular complex.
    molecular_complex: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file).
    centroid: np.ndarray, optional (default None)
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional (default None)
      Of shape `(3,)` holding the size of the box to dock. If not
      specified is set to size of molecular complex plus 5 angstroms.
    exhaustiveness: int, optional (default 10)
      Tells pose generator how exhaustive it should be with pose
      generation.
@@ -90,8 +102,10 @@ class Docker(object):
    """
    if self.scoring_model is not None and use_pose_generator_scores:
      raise ValueError(
          "Cannot set use_pose_generator_scores=True when self.scoring_model is set (since both generator scores for complexes)."
          "Cannot set use_pose_generator_scores=True "
          "when self.scoring_model is set (since both generator scores for complexes)."
      )

    outputs = self.pose_generator.generate_poses(
        molecular_complex,
        centroid=centroid,
@@ -105,11 +119,15 @@ class Docker(object):
      complexes, scores = outputs
    else:
      complexes = outputs

    # We know use_pose_generator_scores == False in this case
    if self.scoring_model is not None:
      for posed_complex in complexes:
        # NOTE: this cast is a workaround for the type checker (mypy).
        # It has no effect at runtime.
        self.featurizer = cast(ComplexFeaturizer, self.featurizer)
        # TODO: How to handle the failure here?
        features, _ = self.featurizer.featurize([molecular_complex])
        (protein_file, ligand_file) = molecular_complex
        features, _ = self.featurizer.featurize([protein_file], [ligand_file])
        dataset = NumpyDataset(X=features)
        score = self.scoring_model.predict(dataset)
        yield (posed_complex, score)
+51 −43
Original line number Diff line number Diff line
@@ -2,19 +2,21 @@
Generates protein-ligand docked poses.
"""
import platform
import deepchem
import logging
import numpy as np
import os
import tempfile
import tarfile
import numpy as np
from subprocess import call
from subprocess import check_output
from deepchem.utils import rdkit_util
from deepchem.utils import mol_xyz_util
from deepchem.utils import geometry_utils
from deepchem.utils import vina_utils
from deepchem.utils import download_url
from typing import Optional, Tuple

from deepchem.dock.binding_pocket import BindingPocketFinder
from deepchem.utils import download_url, get_data_dir
from deepchem.utils.mol_xyz_util import get_molecule_range
from deepchem.utils.geometry_utils import compute_centroid
from deepchem.utils.rdkit_util import load_molecule, write_molecule
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf

logger = logging.getLogger(__name__)

@@ -32,23 +34,24 @@ class PoseGenerator(object):
  """

  def generate_poses(self,
                     molecular_complex,
                     centroid=None,
                     box_dims=None,
                     exhaustiveness=10,
                     num_modes=9,
                     num_pockets=None,
                     out_dir=None,
                     generate_scores=False):
                     molecular_complex: Tuple[str, str],
                     centroid: Optional[np.ndarray] = None,
                     box_dims: Optional[np.ndarray] = None,
                     exhaustiveness: int = 10,
                     num_modes: int = 9,
                     num_pockets: Optional[int] = None,
                     out_dir: Optional[str] = None,
                     generate_scores: bool = False):
    """Generates a list of low energy poses for molecular complex

    Parameters
    ----------
    molecular_complexes: list
      A representation of a molecular complex.
    centroid: np.ndarray, optional
    molecular_complex: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file).
    centroid: np.ndarray, optional (default None)
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional
    box_dims: np.ndarray, optional (default None)
      Of shape `(3,)` holding the size of the box to dock. If not
      specified is set to size of molecular complex plus 5 angstroms.
    exhaustiveness: int, optional (default 10)
@@ -61,7 +64,7 @@ class PoseGenerator(object):
      If specified, `self.pocket_finder` must be set. Will only
      generate poses for the first `num_pockets` returned by
      `self.pocket_finder`.
    out_dir: str, optional
    out_dir: str, optional (default None)
      If specified, write generated poses to this directory.
    generate_scores: bool, optional (default False)
      If `True`, the pose generator will return scores for complexes.
@@ -89,7 +92,9 @@ class VinaPoseGenerator(PoseGenerator):
  This class requires RDKit to be installed.
  """

  def __init__(self, sixty_four_bits=True, pocket_finder=None):
  def __init__(self,
               sixty_four_bits: bool = True,
               pocket_finder: Optional[BindingPocketFinder] = None):
    """Initializes Vina Pose Generator

    Parameters
@@ -101,7 +106,7 @@ class VinaPoseGenerator(PoseGenerator):
      If specified should be an instance of
      `dc.dock.BindingPocketFinder`.
    """
    data_dir = deepchem.utils.get_data_dir()
    data_dir = get_data_dir()
    if platform.system() == 'Linux':
      url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_linux_x86.tgz"
      filename = "autodock_vina_1_1_2_linux_x86.tgz"
@@ -144,22 +149,23 @@ class VinaPoseGenerator(PoseGenerator):
      os.remove(downloaded_file)

  def generate_poses(self,
                     molecular_complex,
                     centroid=None,
                     box_dims=None,
                     exhaustiveness=10,
                     num_modes=9,
                     num_pockets=None,
                     out_dir=None,
                     generate_scores=False):
                     molecular_complex: Tuple[str, str],
                     centroid: Optional[np.ndarray] = None,
                     box_dims: Optional[np.ndarray] = None,
                     exhaustiveness: int = 10,
                     num_modes: int = 9,
                     num_pockets: Optional[int] = None,
                     out_dir: Optional[str] = None,
                     generate_scores: bool = False):
    """Generates the docked complex and outputs files for docked complex.

    TODO: How can this work on Windows? We need to install a .msi file and invoke it correctly from Python for this to work.

    Parameters
    ----------
    molecular_complexes: list
      A representation of a molecular complex.
    molecular_complex: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file).
    centroid: np.ndarray, optional
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional
@@ -213,10 +219,10 @@ class VinaPoseGenerator(PoseGenerator):
    protein_name = os.path.basename(protein_file).split(".")[0]
    protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % protein_name)
    protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % protein_name)
    protein_mol = rdkit_util.load_molecule(
    protein_mol = load_molecule(
        protein_file, calc_charges=True, add_hydrogens=True)
    rdkit_util.write_molecule(protein_mol[1], protein_hyd, is_protein=True)
    rdkit_util.write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)
    write_molecule(protein_mol[1], protein_hyd, is_protein=True)
    write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)

    # Get protein centroid and range
    if centroid is not None and box_dims is not None:
@@ -225,8 +231,8 @@ class VinaPoseGenerator(PoseGenerator):
    else:
      if self.pocket_finder is None:
        logger.info("Pockets not specified. Will use whole protein to dock")
        protein_centroid = geometry_utils.compute_centroid(protein_mol[0])
        protein_range = mol_xyz_util.get_molecule_range(protein_mol[0])
        protein_centroid = compute_centroid(protein_mol[0])
        protein_range = get_molecule_range(protein_mol[0])
        box_dims = protein_range + 5.0
        centroids, dimensions = [protein_centroid], [box_dims]
      else:
@@ -257,9 +263,9 @@ class VinaPoseGenerator(PoseGenerator):
    ligand_name = os.path.basename(ligand_file).split(".")[0]
    ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

    ligand_mol = rdkit_util.load_molecule(
    ligand_mol = load_molecule(
        ligand_file, calc_charges=True, add_hydrogens=True)
    rdkit_util.write_molecule(ligand_mol[1], ligand_pdbqt)
    write_molecule(ligand_mol[1], ligand_pdbqt)

    docked_complexes = []
    all_scores = []
@@ -270,7 +276,7 @@ class VinaPoseGenerator(PoseGenerator):
      logger.info("Box dimensions: %s" % str(box_dims))
      # Write Vina conf file
      conf_file = os.path.join(out_dir, "conf.txt")
      vina_utils.write_vina_conf(
      write_vina_conf(
          protein_pdbqt,
          ligand_pdbqt,
          protein_centroid,
@@ -291,10 +297,12 @@ class VinaPoseGenerator(PoseGenerator):
      else:
        # I'm not sure why specifying the args as a list fails on other platforms,
        # but for some reason it only works if I pass it as a string.
        args = "%s --config %s --log %s --out %s" % (self.vina_cmd, conf_file,
                                                     log_file, out_pdbqt)
        # FIXME: Incompatible types in assignment
        args = "%s --config %s --log %s --out %s" % (  # type: ignore
            self.vina_cmd, conf_file, log_file, out_pdbqt)
      # FIXME: We should use `subprocess.run` instead of `call`
      call(args, shell=True)
      ligands, scores = vina_utils.load_docked_ligands(out_pdbqt)
      ligands, scores = load_docked_ligands(out_pdbqt)
      docked_complexes += [(protein_mol[1], ligand) for ligand in ligands]
      all_scores += scores

Loading