Commit ee7baea0 authored by nd-02110114
Browse files

♻️ refactor utils

parent 7f9b3cd5
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ from typing import List, Optional, Dict, Tuple, Any, Sequence, Union
from deepchem.utils.typing import OneOrMany
from deepchem.utils.save import load_csv_files, load_json_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
import zipfile
@@ -739,7 +739,7 @@ class FASTALoader(DataLoader):

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        X = encode_bio_sequence(input_file)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids
+1 −1
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ from deepchem.models import Model
from deepchem.utils.rdkit_util import load_molecule
from deepchem.utils.coordinate_box_utils \
  import CoordinateBox, get_face_boxes, merge_overlapping_boxes
from deepchem.utils.fragment_util import get_contact_atom_indices
from deepchem.utils.fragment_utils import get_contact_atom_indices

logger = logging.getLogger(__name__)

+10 −5
Original line number Diff line number Diff line
@@ -3,15 +3,17 @@ Docks Molecular Complexes
"""
import logging
import tempfile
from typing import cast, Optional, Tuple
from typing import cast, Generator, Optional, Tuple, Union
import numpy as np

from deepchem.utils.typing import RDKitMol
from deepchem.models import Model
from deepchem.feat import ComplexFeaturizer
from deepchem.data import NumpyDataset
from deepchem.dock import PoseGenerator

logger = logging.getLogger(__name__)
POSED_COMPLEX = Tuple[RDKitMol, RDKitMol]


class Docker(object):
@@ -60,7 +62,9 @@ class Docker(object):
           num_modes: int = 9,
           num_pockets: Optional[int] = None,
           out_dir: Optional[str] = None,
           use_pose_generator_scores: bool = False):
           use_pose_generator_scores: bool = False
          ) -> Union[Generator[POSED_COMPLEX, None, None], Generator[Tuple[
              POSED_COMPLEX, float], None, None]]:
    """Generic docking function.

    This docking function uses this object's featurizer, pose
@@ -96,6 +100,7 @@ class Docker(object):

    Returns
    -------
    Generator[`(posed_complex, score)`] or Generator[`posed_complex`]
      A generator. If `use_pose_generator_scores==True` or
      `self.scoring_model` is set, then it will yield tuples
      `(posed_complex, score)`. Otherwise it will yield `posed_complex`.
+14 −10
Original line number Diff line number Diff line
@@ -9,16 +9,17 @@ import tarfile
import numpy as np
from subprocess import call
from subprocess import check_output
from typing import Optional, Tuple
from typing import List, Optional, Tuple, Union

from deepchem.dock.binding_pocket import BindingPocketFinder
from deepchem.utils import download_url, get_data_dir
from deepchem.utils.mol_xyz_util import get_molecule_range
from deepchem.utils.geometry_utils import compute_centroid
from deepchem.utils.typing import RDKitMol
from deepchem.utils.geometry_utils import compute_centroid, compute_protein_range
from deepchem.utils.rdkit_util import load_molecule, write_molecule
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf

logger = logging.getLogger(__name__)
DOCKED_POSES = List[Tuple[RDKitMol, RDKitMol]]


class PoseGenerator(object):
@@ -156,10 +157,12 @@ class VinaPoseGenerator(PoseGenerator):
                     num_modes: int = 9,
                     num_pockets: Optional[int] = None,
                     out_dir: Optional[str] = None,
                     generate_scores: bool = False):
                     generate_scores: bool = False
                    ) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

    TODO: How can this work on Windows? We need to install a .msi file and invoke it correctly from Python for this to work.
    TODO: How can this work on Windows? We need to install a .msi file and 
    invoke it correctly from Python for this to work.

    Parameters
    ----------
@@ -190,10 +193,11 @@ class VinaPoseGenerator(PoseGenerator):

    Returns
    -------
    Tuple of `(docked_poses, scores)`. `docked_poses` is a list of
    docked molecular complexes. Each entry in this list contains a
    `(protein_mol, ligand_mol)` pair of RDKit molecules. `scores` is a
    list of binding free energies predicted by Vina.
    `(docked_poses, scores)` or `docked_poses`
      Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
      is a list of docked molecular complexes. Each entry in this list
      contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
      `scores` is a list of binding free energies predicted by Vina.

    Raises
    ------
@@ -232,7 +236,7 @@ class VinaPoseGenerator(PoseGenerator):
      if self.pocket_finder is None:
        logger.info("Pockets not specified. Will use whole protein to dock")
        protein_centroid = compute_centroid(protein_mol[0])
        protein_range = get_molecule_range(protein_mol[0])
        protein_range = compute_protein_range(protein_mol[0])
        box_dims = protein_range + 5.0
        centroids, dimensions = [protein_centroid], [box_dims]
      else:
+4 −4
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ class TestGenomicMetrics(unittest.TestCase):
    # Encode motif
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics_utils.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))

@@ -36,7 +36,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test get_pssm_scores returns correct shape."""
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics_utils.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
    pssm = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
@@ -58,7 +58,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test in-silico mutagenesis returns correct shape."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics_utils.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
@@ -78,7 +78,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test in-silico mutagenesis returns nonzero output."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics_utils.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
Loading