Merge pull request #2 from deepchem/master (c1da56a0) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+1 −4

Original line number	Diff line number	Diff line
		@@ -1160,7 +1160,4 @@ class InMemoryLoader(DataLoader):
		labels.append(label)
		ids.append(entry_id)
		X = np.concatenate(features, axis=0)
		y = np.array(labels)
		w = np.array(weights)
		ids = np.array(ids)
		return X, y, w, ids
		return X, np.array(labels), np.array(weights), np.array(ids)

deepchem/data/datasets.py

+42 −53

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@ import numpy as np
		import pandas as pd

		import deepchem as dc
		from deepchem.utils.typing import OneOrMany, Shape
		from deepchem.utils.typing import ArrayLike, OneOrMany, Shape
		from deepchem.utils.data_utils import save_to_disk, load_from_disk, load_image_files

		Batch = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
		@@ -46,8 +46,7 @@ def sparsify_features(X: np.ndarray) -> np.ndarray:
		nonzero_inds = np.nonzero(X[i])[0]
		nonzero_vals = X[i][nonzero_inds]
		X_sparse.append((nonzero_inds, nonzero_vals))
		X_sparse = np.array(X_sparse, dtype=object)
		return X_sparse
		return np.array(X_sparse, dtype=object)


		def densify_features(X_sparse: np.ndarray, num_features: int) -> np.ndarray:
		@@ -703,10 +702,10 @@ class NumpyDataset(Dataset):
		"""

		def __init__(self,
		X: np.ndarray,
		y: Optional[np.ndarray] = None,
		w: Optional[np.ndarray] = None,
		ids: Optional[np.ndarray] = None,
		X: ArrayLike,
		y: Optional[ArrayLike] = None,
		w: Optional[ArrayLike] = None,
		ids: Optional[ArrayLike] = None,
		n_tasks: int = 1) -> None:
		"""Initialize this object.

		@@ -824,7 +823,7 @@ class NumpyDataset(Dataset):
		if not deterministic:
		sample_perm = np.random.permutation(n_samples)
		batch_idx = 0
		num_batches = np.math.ceil(n_samples / batch_size)
		num_batches = math.ceil(n_samples / batch_size)
		while batch_idx < num_batches:
		start = batch_idx * batch_size
		end = min(n_samples, (batch_idx + 1) * batch_size)
		@@ -1150,7 +1149,8 @@ class DiskDataset(Dataset):
		self.data_dir = data_dir

		logger.info("Loading dataset from disk.")
		self.tasks, self.metadata_df = self.load_metadata()
		tasks, self.metadata_df = self.load_metadata()
		self.tasks = np.array(tasks)
		if len(self.metadata_df.columns) == 4 and list(
		self.metadata_df.columns) == ['ids', 'X', 'y', 'w']:
		logger.info(
		@@ -1175,7 +1175,7 @@ class DiskDataset(Dataset):
		@staticmethod
		def create_dataset(shard_generator: Iterable[Batch],
		data_dir: Optional[str] = None,
		tasks: Optional[Sequence] = []) -> "DiskDataset":
		tasks: Optional[ArrayLike] = []) -> "DiskDataset":
		"""Creates a new DiskDataset

		Parameters
		@@ -1203,8 +1203,7 @@ class DiskDataset(Dataset):
		for shard_num, (X, y, w, ids) in enumerate(shard_generator):
		basename = "shard-%d" % shard_num
		metadata_rows.append(
		DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
		ids))
		DiskDataset.write_data_to_disk(data_dir, basename, X, y, w, ids))
		metadata_df = DiskDataset._construct_metadata(metadata_rows)
		DiskDataset._save_metadata(metadata_df, data_dir, tasks)
		time2 = time.time()
		@@ -1235,7 +1234,7 @@ class DiskDataset(Dataset):

		@staticmethod
		def _save_metadata(metadata_df: pd.DataFrame, data_dir: str,
		tasks: Optional[Sequence]) -> None:
		tasks: Optional[ArrayLike]) -> None:
		"""Saves the metadata for a DiskDataset

		Parameters
		@@ -1279,14 +1278,12 @@ class DiskDataset(Dataset):
		return metadata_df

		@staticmethod
		def write_data_to_disk(
		data_dir: str,
		def write_data_to_disk(data_dir: str,
		basename: str,
		tasks: np.ndarray,
		X: Optional[np.ndarray] = None,
		y: Optional[np.ndarray] = None,
		w: Optional[np.ndarray] = None,
		ids: Optional[np.ndarray] = None) -> List[Optional[str]]:
		ids: Optional[np.ndarray] = None) -> List[Any]:
		"""Static helper method to write data to disk.

		This helper method is used to write a shard of data to disk.
		@@ -1297,8 +1294,6 @@ class DiskDataset(Dataset):
		Data directory to write shard to.
		basename: str
		Basename for the shard in question.
		tasks: np.ndarray
		The names of the tasks in question.
		X: np.ndarray, optional (default None)
		The features array.
		y: np.ndarray, optional (default None)
		@@ -1318,7 +1313,7 @@ class DiskDataset(Dataset):
		if X is not None:
		out_X: Optional[str] = "%s-X.npy" % basename
		save_to_disk(X, os.path.join(data_dir, out_X)) # type: ignore
		out_X_shape = X.shape
		out_X_shape: Optional[Tuple[int, ...]] = X.shape
		else:
		out_X = None
		out_X_shape = None
		@@ -1326,7 +1321,7 @@ class DiskDataset(Dataset):
		if y is not None:
		out_y: Optional[str] = "%s-y.npy" % basename
		save_to_disk(y, os.path.join(data_dir, out_y)) # type: ignore
		out_y_shape = y.shape
		out_y_shape: Optional[Tuple[int, ...]] = y.shape
		else:
		out_y = None
		out_y_shape = None
		@@ -1334,7 +1329,7 @@ class DiskDataset(Dataset):
		if w is not None:
		out_w: Optional[str] = "%s-w.npy" % basename
		save_to_disk(w, os.path.join(data_dir, out_w)) # type: ignore
		out_w_shape = w.shape
		out_w_shape: Optional[Tuple[int, ...]] = w.shape
		else:
		out_w = None
		out_w_shape = None
		@@ -1342,7 +1337,7 @@ class DiskDataset(Dataset):
		if ids is not None:
		out_ids: Optional[str] = "%s-ids.npy" % basename
		save_to_disk(ids, os.path.join(data_dir, out_ids)) # type: ignore
		out_ids_shape = ids.shape
		out_ids_shape: Optional[Tuple[int, ...]] = ids.shape
		else:
		out_ids = None
		out_ids_shape = None
		@@ -1410,7 +1405,7 @@ class DiskDataset(Dataset):
		shutil.copytree(self.data_dir, new_data_dir)
		return DiskDataset(new_data_dir)

		def get_task_names(self) -> List[str]:
		def get_task_names(self) -> np.ndarray:
		"""Gets learning tasks associated with this dataset."""
		return self.tasks

		@@ -1793,8 +1788,7 @@ class DiskDataset(Dataset):
		ids = np.array(load_from_disk(ids_file))
		X, y, w, ids = transformer.transform_array(X, y, w, ids)
		basename = "shard-%d" % shard_num
		return DiskDataset.write_data_to_disk(out_dir, basename, tasks, X, y, w,
		ids)
		return DiskDataset.write_data_to_disk(out_dir, basename, X, y, w, ids)

		def make_pytorch_dataset(self,
		epochs: int = 1,
		@@ -1839,11 +1833,11 @@ class DiskDataset(Dataset):
		return pytorch_ds

		@staticmethod
		def from_numpy(X: np.ndarray,
		y: Optional[np.ndarray] = None,
		w: Optional[np.ndarray] = None,
		ids: Optional[np.ndarray] = None,
		tasks: Optional[Sequence] = None,
		def from_numpy(X: ArrayLike,
		y: Optional[ArrayLike] = None,
		w: Optional[ArrayLike] = None,
		ids: Optional[ArrayLike] = None,
		tasks: Optional[ArrayLike] = None,
		data_dir: Optional[str] = None) -> "DiskDataset":
		"""Creates a DiskDataset object from specified Numpy arrays.

		@@ -2054,7 +2048,6 @@ class DiskDataset(Dataset):
		The basenames for each shard. If this isn't specified, will assume the
		basenames of form "shard-i" used by `create_dataset` and `reshard`.
		"""
		tasks = self.get_task_names()
		# Shuffle the arrays corresponding to each row in metadata_df
		n_rows = len(self.metadata_df.index)
		if shard_basenames is not None:
		@@ -2071,8 +2064,7 @@ class DiskDataset(Dataset):
		permutation = np.random.permutation(n)
		X, y, w, ids = (X[permutation], y[permutation], w[permutation],
		ids[permutation])
		DiskDataset.write_data_to_disk(self.data_dir, basename, tasks, X, y, w,
		ids)
		DiskDataset.write_data_to_disk(self.data_dir, basename, X, y, w, ids)
		# Reset cache
		self._cached_shards = None

		@@ -2110,7 +2102,8 @@ class DiskDataset(Dataset):
		X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))

		if row['y'] is not None:
		y = np.array(load_from_disk(os.path.join(self.data_dir, row['y'])))
		y: Optional[np.ndarray] = np.array(
		load_from_disk(os.path.join(self.data_dir, row['y'])))
		else:
		y = None

		@@ -2118,14 +2111,16 @@ class DiskDataset(Dataset):
		# TODO (ytz): Under what condition does this exist but the file itself doesn't?
		w_filename = os.path.join(self.data_dir, row['w'])
		if os.path.exists(w_filename):
		w = np.array(load_from_disk(w_filename))
		else:
		w: Optional[np.ndarray] = np.array(load_from_disk(w_filename))
		elif y is not None:
		if len(y.shape) == 1:
		w = np.ones(y.shape[0], np.float32)
		else:
		w = np.ones((y.shape[0], 1), np.float32)
		else:
		w = None
		else:
		w = None

		ids = np.array(
		load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)
		@@ -2185,8 +2180,7 @@ class DiskDataset(Dataset):
		if self._cached_shards is not None and self._cached_shards[i] is not None:
		return self._cached_shards[i].y
		row = self.metadata_df.iloc[i]
		return np.array(
		load_from_disk(os.path.join(self.data_dir, row['y'])), dtype=object)
		return np.array(load_from_disk(os.path.join(self.data_dir, row['y'])))

		def get_shard_w(self, i: int) -> np.ndarray:
		"""Retrieves the weights for the i-th shard from disk.
		@@ -2205,8 +2199,7 @@ class DiskDataset(Dataset):
		if self._cached_shards is not None and self._cached_shards[i] is not None:
		return self._cached_shards[i].w
		row = self.metadata_df.iloc[i]
		return np.array(
		load_from_disk(os.path.join(self.data_dir, row['w'])), dtype=object)
		return np.array(load_from_disk(os.path.join(self.data_dir, row['w'])))

		def add_shard(self,
		X: np.ndarray,
		@@ -2229,10 +2222,8 @@ class DiskDataset(Dataset):
		metadata_rows = self.metadata_df.values.tolist()
		shard_num = len(metadata_rows)
		basename = "shard-%d" % shard_num
		tasks = self.get_task_names()
		metadata_rows.append(
		DiskDataset.write_data_to_disk(self.data_dir, basename, tasks, X, y, w,
		ids))
		DiskDataset.write_data_to_disk(self.data_dir, basename, X, y, w, ids))
		self.metadata_df = DiskDataset._construct_metadata(metadata_rows)
		self.save_to_disk()

		@@ -2258,8 +2249,7 @@ class DiskDataset(Dataset):
		Identifiers array.
		"""
		basename = "shard-%d" % shard_num
		tasks = self.get_task_names()
		DiskDataset.write_data_to_disk(self.data_dir, basename, tasks, X, y, w, ids)
		DiskDataset.write_data_to_disk(self.data_dir, basename, X, y, w, ids)
		self._cached_shards = None

		def select(self,
		@@ -2324,7 +2314,6 @@ class DiskDataset(Dataset):
		np.array([]), np.array([]), np.array([]), np.array([]))

		N = len(indices)
		indices = np.array(indices).astype(int)
		tasks = self.get_task_names()
		n_shards = self.get_number_shards()

		@@ -2591,8 +2580,8 @@ class ImageDataset(Dataset):
		def __init__(self,
		X: Union[np.ndarray, List[str]],
		y: Optional[Union[np.ndarray, List[str]]],
		w: Optional[np.ndarray] = None,
		ids: Optional[np.ndarray] = None) -> None:
		w: Optional[ArrayLike] = None,
		ids: Optional[ArrayLike] = None) -> None:
		"""Create a dataset whose X and/or y array is defined by image files on disk.

		Parameters

deepchem/dock/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		# flake8: noqa
		from deepchem.dock.pose_generation import PoseGenerator
		from deepchem.dock.pose_generation import VinaPoseGenerator
		from deepchem.dock.pose_generation import GninaPoseGenerator
		from deepchem.dock.docking import Docker
		from deepchem.dock.binding_pocket import ConvexHullPocketFinder

deepchem/dock/pose_generation.py

+169 −7

Original line number	Diff line number	Diff line
		@@ -7,7 +7,7 @@ import os
		import tempfile
		import tarfile
		import numpy as np
		from subprocess import call
		from subprocess import call, Popen, PIPE
		from subprocess import check_output
		from typing import List, Optional, Tuple, Union

		@@ -16,7 +16,7 @@ from deepchem.utils.data_utils import download_url, get_data_dir
		from deepchem.utils.typing import RDKitMol
		from deepchem.utils.geometry_utils import compute_centroid, compute_protein_range
		from deepchem.utils.rdkit_utils import load_molecule, write_molecule
		from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf
		from deepchem.utils.docking_utils import load_docked_ligands, write_vina_conf, write_gnina_conf, read_gnina_log

		logger = logging.getLogger(__name__)
		DOCKED_POSES = List[Tuple[RDKitMol, RDKitMol]]
		@@ -53,8 +53,8 @@ class PoseGenerator(object):
		centroid: np.ndarray, optional (default None)
		The centroid to dock against. Is computed if not specified.
		box_dims: np.ndarray, optional (default None)
		A numpy array of shape `(3,)` holding the size of the box to dock. If not
		specified is set to size of molecular complex plus 5 angstroms.
		A numpy array of shape `(3,)` holding the size of the box to dock.
		If not specified is set to size of molecular complex plus 5 angstroms.
		exhaustiveness: int, optional (default 10)
		Tells pose generator how exhaustive it should be with pose
		generation.
		@@ -79,6 +79,167 @@ class PoseGenerator(object):
		raise NotImplementedError


		class GninaPoseGenerator(PoseGenerator):
		    """Use GNINA to generate binding poses.

		    This class uses GNINA (a deep learning framework for molecular
		    docking) to generate binding poses. It downloads the GNINA
		    executable to DEEPCHEM_DATA_DIR (an environment variable you set)
		    and invokes the executable to perform pose generation.

		    GNINA uses pre-trained convolutional neural network (CNN) scoring
		    functions to rank binding poses based on learned representations of
		    3D protein-ligand interactions. It has been shown to outperform
		    AutoDock Vina in virtual screening applications [1]_.

		    If you use the GNINA molecular docking engine, please cite the relevant
		    papers: https://github.com/gnina/gnina#citation
		    The primary citation for GNINA is [1]_.

		    References
		    ----------
		    .. [1] M Ragoza, J Hochuli, E Idrobo, J Sunseri, DR Koes.
		       "Protein–Ligand Scoring with Convolutional Neural Networks."
		       Journal of chemical information and modeling (2017).

		    Note
		    ----
		    * GNINA currently only works on Linux operating systems.
		    * GNINA requires CUDA >= 10.1 for fast CNN scoring.
		    * Almost all dependencies are included in the most compatible way
		      possible, which reduces performance. Build GNINA from source
		      for production use.

		    """

		    def __init__(self):
		        """Initialize GNINA pose generator.

		        Locates the GNINA executable in the DeepChem data directory and
		        downloads it (marking it executable) if it is not already there.

		        Raises
		        ------
		        ValueError
		            If the current platform is not Linux.
		        """
		        data_dir = get_data_dir()
		        if platform.system() != 'Linux':
		            raise ValueError(
		                "GNINA currently only runs on Linux. Try using a cloud platform to run this code instead."
		            )

		        url = "https://github.com/gnina/gnina/releases/download/v1.0/gnina"
		        filename = 'gnina'
		        self.gnina_dir = data_dir
		        self.gnina_cmd = os.path.join(self.gnina_dir, filename)

		        if not os.path.exists(self.gnina_cmd):
		            logger.info("GNINA not available. Downloading...")
		            download_url(url, data_dir)
		            downloaded_file = os.path.join(data_dir, filename)
		            # The mode must be an octal literal: decimal 755 equals 0o1363,
		            # which would not produce the intended rwxr-xr-x permissions.
		            os.chmod(downloaded_file, 0o755)
		            logger.info("Downloaded GNINA.")

		    def generate_poses(
		            self,
		            molecular_complex: Tuple[str, str],
		            centroid: Optional[np.ndarray] = None,
		            box_dims: Optional[np.ndarray] = None,
		            exhaustiveness: int = 10,
		            num_modes: int = 9,
		            num_pockets: Optional[int] = None,
		            out_dir: Optional[str] = None,
		            generate_scores: bool = True,
		            **kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
		        """Generates the docked complex and outputs files for docked complex.

		        Parameters
		        ----------
		        molecular_complex: Tuple[str, str]
		            A representation of a molecular complex. This tuple is
		            (protein_file, ligand_file). The protein must be a `.pdb` file
		            and the ligand must be an `.sdf` file.
		        centroid: np.ndarray, optional (default None)
		            Accepted for interface compatibility with `PoseGenerator`.
		            This method does not forward it to GNINA.
		        box_dims: np.ndarray, optional (default None)
		            Accepted for interface compatibility with `PoseGenerator`.
		            This method does not forward it to GNINA.
		        exhaustiveness: int (default 10)
		            Tells GNINA how exhaustive it should be with pose
		            generation.
		        num_modes: int (default 9)
		            Tells GNINA how many binding modes it should generate at
		            each invocation.
		        num_pockets: int, optional (default None)
		            Accepted for interface compatibility with `PoseGenerator`.
		            This method does not use it.
		        out_dir: str, optional
		            If specified, write generated poses to this directory.
		            Otherwise a fresh temporary directory is created.
		        generate_scores: bool, optional (default True)
		            If `True`, the pose generator will return scores for complexes.
		            This is used typically when invoking external docking programs
		            that compute scores.
		        kwargs:
		            Any args supported by GNINA as documented
		            https://github.com/gnina/gnina#usage

		        Returns
		        -------
		        Tuple[`docked_poses`, `scores`] or `docked_poses`
		            Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
		            is a list of docked molecular complexes. Each entry in this list
		            contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
		            `scores` is an array of binding affinities (kcal/mol),
		            CNN pose scores, and CNN affinities predicted by GNINA.

		        Raises
		        ------
		        ValueError
		            If `molecular_complex` has more than two entries, if the protein
		            file is not a `.pdb` file, or if the ligand file is not an
		            `.sdf` file.
		        """
		        if out_dir is None:
		            out_dir = tempfile.mkdtemp()
		        if not os.path.exists(out_dir):
		            os.makedirs(out_dir)

		        # Parse complex
		        if len(molecular_complex) > 2:
		            raise ValueError(
		                "GNINA can only dock protein-ligand complexes and not more general molecular complexes."
		            )

		        (protein_file, ligand_file) = molecular_complex

		        # Check filetypes
		        if not protein_file.endswith('.pdb'):
		            raise ValueError('Protein file must be in .pdb format.')
		        if not ligand_file.endswith('.sdf'):
		            raise ValueError('Ligand file must be in .sdf format.')

		        protein_mol = load_molecule(
		            protein_file, calc_charges=True, add_hydrogens=True)
		        ligand_name = os.path.basename(ligand_file).split(".")[0]

		        # Define locations of log and output files
		        log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
		        out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
		        logger.info("About to call GNINA.")

		        # Write GNINA conf file
		        conf_file = os.path.join(out_dir, "conf.txt")
		        write_gnina_conf(
		            protein_filename=protein_file,
		            ligand_filename=ligand_file,
		            conf_filename=conf_file,
		            num_modes=num_modes,
		            exhaustiveness=exhaustiveness,
		            **kwargs)

		        # Run GNINA. Arguments are passed as a list (no shell involved).
		        args = [
		            self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
		            out_file
		        ]
		        process = Popen(args, stdout=PIPE, stderr=PIPE)
		        _, stderr = process.communicate()
		        if process.returncode != 0:
		            # Surface the failure instead of discarding it; control flow is
		            # unchanged, so reading the outputs below proceeds as before.
		            logger.warning("GNINA exited with return code %d: %s",
		                           process.returncode,
		                           stderr.decode(errors="replace"))

		        # Read output and log
		        ligands, _ = load_docked_ligands(out_file)
		        # load_molecule returns a pair; index 1 is used as the protein mol.
		        docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
		        scores = read_gnina_log(log_file)

		        if generate_scores:
		            return docked_complexes, scores
		        return docked_complexes


		class VinaPoseGenerator(PoseGenerator):
		"""Uses Autodock Vina to generate binding poses.

		@@ -157,7 +318,7 @@ class VinaPoseGenerator(PoseGenerator):
		num_modes: int = 9,
		num_pockets: Optional[int] = None,
		out_dir: Optional[str] = None,
		generate_scores: bool = False
		generate_scores: Optional[bool] = False
		) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
		"""Generates the docked complex and outputs files for docked complex.

		@@ -168,7 +329,8 @@ class VinaPoseGenerator(PoseGenerator):
		----------
		molecular_complexes: Tuple[str, str]
		A representation of a molecular complex. This tuple is
		(protein_file, ligand_file).
		(protein_file, ligand_file). The protein should be a pdb file
		and the ligand should be an sdf file.
		centroid: np.ndarray, optional
		The centroid to dock against. Is computed if not specified.
		box_dims: np.ndarray, optional
		@@ -263,7 +425,7 @@ class VinaPoseGenerator(PoseGenerator):
		centroids = centroids[:num_pockets]
		dimensions = dimensions[:num_pockets]

		# Prepare protein
		# Prepare ligand
		ligand_name = os.path.basename(ligand_file).split(".")[0]
		ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

deepchem/dock/tests/test_pose_generation.py

+35 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ import deepchem as dc
		import pytest

		IS_WINDOWS = platform.system() == 'Windows'
		IS_LINUX = platform.system() == 'Linux'


		class TestPoseGeneration(unittest.TestCase):
		@@ -23,6 +24,11 @@ class TestPoseGeneration(unittest.TestCase):
		"""Test that VinaPoseGenerator can be initialized."""
		dc.dock.VinaPoseGenerator()

		@unittest.skipIf(not IS_LINUX, 'Skip the test on Windows and Mac.')
		def test_gnina_initialization(self):
		    """Constructing a GninaPoseGenerator should not raise on Linux."""
		    # Only construction is exercised here; no docking is run.
		    dc.dock.GninaPoseGenerator()

		@unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
		def test_pocket_vina_initialization(self):
		"""Test that VinaPoseGenerator can be initialized."""
		@@ -58,6 +64,35 @@ class TestPoseGeneration(unittest.TestCase):
		assert isinstance(protein, Chem.Mol)
		assert isinstance(ligand, Chem.Mol)

		@pytest.mark.slow
		@unittest.skipIf(not IS_LINUX, 'Skip the test on Windows and Mac.')
		def test_gnina_poses_and_scores(self):
		    """Dock the 1jld complex with GNINA and check poses and scores.

		    Requests a single binding mode at exhaustiveness 1, then verifies
		    that exactly one pose and one score come back and that the pose is
		    a pair of RDKit molecules. This is a slow test (the original note
		    reports about 3 minutes on a development laptop).
		    """
		    # Enable INFO logging because the run takes a while.
		    logging.basicConfig(level=logging.INFO)

		    # Input structures live next to this test module.
		    test_dir = os.path.dirname(os.path.realpath(__file__))
		    molecular_complex = (
		        os.path.join(test_dir, "1jld_protein.pdb"),
		        os.path.join(test_dir, "1jld_ligand.sdf"),
		    )

		    pose_generator = dc.dock.GninaPoseGenerator()
		    with tempfile.TemporaryDirectory() as work_dir:
		        poses, scores = pose_generator.generate_poses(
		            molecular_complex,
		            exhaustiveness=1,
		            num_modes=1,
		            out_dir=work_dir)

		    assert len(poses) == 1
		    assert len(scores) == 1
		    protein, ligand = poses[0]
		    from rdkit import Chem
		    assert isinstance(protein, Chem.Mol)
		    assert isinstance(ligand, Chem.Mol)

		@pytest.mark.slow
		def test_vina_poses_no_scores(self):
		"""Test that VinaPoseGenerator generates poses.

Admin message