Loading functions (8b6bcd56) · Commits · 钟慕尧 / deepchem

deepchem/feat/atomic_coordinates.py

+26 −12

Original line number	Diff line number	Diff line
		@@ -8,10 +8,13 @@ __author__ = "Joseph Gomes and Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		from deepchem.utils.save import log
		from deepchem.feat import Featurizer
		from deepchem.feat import ComplexFeaturizer
		from deepchem.utils import rdkit_util, pad_array
		from deepchem.utils.rdkit_util import MoleculeLoadException


		class AtomicCoordinates(Featurizer):
		@@ -223,8 +226,14 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		self.max_num_neighbors, self.neighbor_cutoff)

		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		try:
		frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
		frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
		except MoleculeLoadException:
		# Currently handles loading failures by returning None
		# TODO: Is there a better handling procedure?
		logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
		return None
		system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
		system_coords = rdkit_util.get_xyz_from_mol(system_mol)

		@@ -232,6 +241,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
		system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

		try:
		frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
		frag1_coords, frag1_mol, self.frag1_num_atoms)

		@@ -240,19 +250,23 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):

		system_coords, system_neighbor_list, system_z = self.featurize_mol(
		system_coords, system_mol, self.complex_num_atoms)
		except ValueError as e:
		logging.warning(
		"max_atoms was set too low. Some complexes too large and skipped")
		return None

		return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
		system_coords, system_neighbor_list, system_z

		def get_Z_matrix(self, mol, max_atoms):
		######################################### DEBUG
		print("len(mol.GetAtoms())")
		print(len(mol.GetAtoms()))
		######################################### DEBUG
		if len(mol.GetAtoms()) > max_atoms:
		raise ValueError("A molecule is larger than permitted by max_atoms. "
		"Increase max_atoms and try again.")
		return pad_array(
		np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

		def featurize_mol(self, coords, mol, max_num_atoms):
		logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
		neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
		self.max_num_neighbors, None)
		z = self.get_Z_matrix(mol, max_num_atoms)

deepchem/feat/base_classes.py

+29 −58

Original line number	Diff line number	Diff line
		@@ -5,25 +5,12 @@ import types
		import numpy as np
		from rdkit import Chem
		from rdkit.Chem import rdGeometry, rdMolTransforms
		from deepchem.utils.save import log
		import multiprocessing

		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"

		def get_ligand_filetype(ligand_filename):
		"""Returns the filetype of ligand."""
		if ".mol2" in ligand_filename:
		return "mol2"
		elif ".sdf" in ligand_filename:
		return "sdf"
		elif ".pdbqt" in ligand_filename:
		return "pdbqt"
		elif ".pdb" in ligand_filename:
		return "pdb"
		else:
		raise ValueError("Unrecognized_filename")

		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)
		@@ -34,60 +21,43 @@ class ComplexFeaturizer(object):
		Abstract class for calculating features for mol/protein complexes.
		"""

		def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True,
		log_every_n=1000):
		def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		"""
		Calculate features for mol/protein complexes.

		Parameters
		----------
		mol_pdbs: list
		List of PDBs for molecules. Each PDB should be a list of lines of the
		PDB file.
		mols: list
		List of PDB filenames for molecules.
		protein_pdbs: list
		List of PDBs for proteins. Each PDB should be a list of lines of the
		PDB file.
		List of PDB filenames for proteins.

		Returns
		-------
		features: np.array
		Array of features
		failures: list
		Indices of complexes that failed to featurize.
		"""
		pool = multiprocessing.Pool()
		results = []
		for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		log_message = "Featurizing %d / %d" % (
		i, len(mol_files)) if i % log_every_n == 0 else None
		results.append(
		pool.apply_async(_featurize_complex, (self, mol_file, protein_pdb)))
		pool.close()
		features = []
		for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
		if verbose and i % log_every_n == 0:
		log("Featurizing %d / %d" % (i, len(mol_pdbs)))
		features.append(self._featurize_complex(mol_pdb, protein_pdb))
		failures = []
		for ind, result in enumerate(results):
		new_features = result.get()
		# Handle loading failures which return None
		if new_features is not None:
		features.append(new_features)
		else:
		failures.append(ind)
		features = np.asarray(features)
		return features

		#def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		# """
		# Calculate features for mol/protein complexes.

		# Parameters
		# ----------
		# mols: list
		# List of PDB filenames for molecules.
		# protein_pdbs: list
		# List of PDB filenames for proteins.
		# """
		# pool = multiprocessing.Pool()
		# results = []
		# for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		# log_message = "Featurizing %d / %d" % (
		# i, len(mol_files)) if i % log_every_n == 0 else None
		# #ligand_ext = get_ligand_filetype(mol_file)
		# #with open(mol_file) as mol_f:
		# # mol_lines = mol_f.readlines()
		# #with open(protein_pdb) as protein_file:
		# # protein_pdb_lines = protein_file.readlines()
		# results.append(
		# pool.apply_async(
		# _featurize_complex,
		# (self, mol_file, protein_pdb)))
		# #(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
		# pool.close()
		# features = []
		# for result in results:
		# features += result.get()
		# features = np.asarray(features)
		# return features
		return features, failures

		def _featurize_complex(self, mol_pdb, complex_pdb):
		"""
		@@ -102,6 +72,7 @@ class ComplexFeaturizer(object):
		"""
		raise NotImplementedError('Featurizer is not defined.')


		class Featurizer(object):
		"""
		Abstract class for calculating a set of features for a molecule.

deepchem/feat/rdkit_grid_featurizer.py

+41 −25

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Evan Feinberg, and Karl Leswing"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import os
		import shutil
		from warnings import warn
		@@ -16,6 +17,7 @@ from collections import Counter
		from rdkit import Chem
		from rdkit.Chem import AllChem
		from deepchem.utils.rdkit_util import load_molecule
		from deepchem.utils.rdkit_util import MoleculeLoadException

		import numpy as np
		from scipy.spatial.distance import cdist
		@@ -1256,7 +1258,11 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

		features_dict = self._transform(protein_pdb_file, ligand_file)
		shutil.rmtree(tempdir)
		# Handle case the transform failed
		if features_dict is not None:
		return list(features_dict.values())
		else:
		return None

		def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		"""
		@@ -1285,10 +1291,16 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
		pool.close()
		features = []
		for result in results:
		features += result.get()
		failures = []
		for ind, result in enumerate(results):
		new_features = result.get()
		# Handle loading failures which return None
		if new_features is not None:
		features.append(new_features)
		else:
		failures.append(ind)
		features = np.asarray(features)
		return features
		return features, failures

		def _transform(self, protein_pdb, ligand_file):
		"""Computes featurization of protein/ligand complex.
		@@ -1303,6 +1315,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

		This function then computes a featurization with scheme specified by the user.
		"""
		try:
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		@@ -1324,6 +1337,9 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
		self.verbose)
		############################################################## TIMING
		except MoleculeLoadException:
		logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
		return None

		############################################################## TIMING
		time1 = time.time()

deepchem/molnet/load_function/pdbbind_datasets.py

+11 −2

Original line number	Diff line number	Diff line
		@@ -206,6 +206,7 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		# The base-10 logarithm, -log kd/pk
		log_label = line[3]
		labels.append(log_label)
		labels = np.array(labels)
		# Featurize Data
		if featurizer == "grid":
		# TODO: This is not the correct setting. Set hyperparameters correctly
		@@ -220,7 +221,7 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		elif featurizer == "atomic":
		# Pulled from PDB files. For larger datasets with more PDBs, would use
		# max num atoms instead of exact.
		frag1_num_atoms = 60 # for ligand atoms
		frag1_num_atoms = 70 # for ligand atoms
		frag2_num_atoms = 24000 # for protein atoms
		complex_num_atoms = 24060 # in total
		max_num_neighbors = 4
		@@ -233,8 +234,16 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		else:
		raise ValueError("Featurizer not supported")
		print("Featurizing Complexes")
		features = featurizer.featurize_complexes(
		features, failures = featurizer.featurize_complexes(
		ligand_files, protein_files, log_every_n=1)
		# Delete labels for failing elements
		print("np.shape(features)")
		print(np.shape(features))
		print("failures")
		print(failures)
		print("np.shape(labels)")
		print(np.shape(labels))
		labels = np.delete(labels, failures)
		dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
		# No transformations of data
		transformers = []

Admin message