Featurizing (b89b0137) · Commits · 钟慕尧 / deepchem

deepchem/feat/atomic_coordinates.py

+4 −0

Original line number	Diff line number	Diff line
		@@ -245,6 +245,10 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		system_coords, system_neighbor_list, system_z

		def get_Z_matrix(self, mol, max_atoms):
		  """Return the atomic numbers of `mol` as an array handed to `pad_array`.

		  Parameters
		  ----------
		  mol: object
		    Molecule exposing `GetAtoms()`, whose items expose `GetAtomicNum()`.
		  max_atoms: int
		    Forwarded to `pad_array` as its second argument; presumably the length
		    the result is padded to -- confirm against `pad_array`.

		  Returns
		  -------
		  Whatever `pad_array` returns for the 1-D array of atomic numbers.
		  """
		  # The per-call debug prints of the atom count were dropped: they wrote two
		  # lines to stdout for every molecule that was featurized.
		  atomic_numbers = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
		  return pad_array(atomic_numbers, max_atoms)

deepchem/feat/base_classes.py

+51 −0

Original line number	Diff line number	Diff line
		@@ -6,11 +6,29 @@ import numpy as np
		from rdkit import Chem
		from rdkit.Chem import rdGeometry, rdMolTransforms
		from deepchem.utils.save import log
		import multiprocessing

		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"

		def get_ligand_filetype(ligand_filename):
		  """Returns the filetype of ligand.

		  The type is detected from the file's base name only, so a directory such
		  as "poses.mol2/" earlier in the path cannot decide the result.

		  Parameters
		  ----------
		  ligand_filename: str
		    Path or name of the ligand file.

		  Returns
		  -------
		  str
		    One of "mol2", "sdf", "pdbqt" or "pdb".

		  Raises
		  ------
		  ValueError
		    If none of the known extensions occurs in the base name.
		  """
		  import os

		  basename = os.path.basename(ligand_filename)
		  # Order matters: ".pdbqt" has to be tried before ".pdb", which is its prefix.
		  for filetype in ("mol2", "sdf", "pdbqt", "pdb"):
		    if "." + filetype in basename:
		      return filetype
		  raise ValueError("Unrecognized_filename: %s" % (ligand_filename,))

		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


		class ComplexFeaturizer(object):
		""""
		Abstract class for calculating features for mol/protein complexes.
		@@ -38,6 +56,39 @@ class ComplexFeaturizer(object):
		features = np.asarray(features)
		return features

		#def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		# """
		# Calculate features for mol/protein complexes.

		# Parameters
		# ----------
		# mols: list
		# List of PDB filenames for molecules.
		# protein_pdbs: list
		# List of PDB filenames for proteins.
		# """
		# pool = multiprocessing.Pool()
		# results = []
		# for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		# log_message = "Featurizing %d / %d" % (
		# i, len(mol_files)) if i % log_every_n == 0 else None
		# #ligand_ext = get_ligand_filetype(mol_file)
		# #with open(mol_file) as mol_f:
		# # mol_lines = mol_f.readlines()
		# #with open(protein_pdb) as protein_file:
		# # protein_pdb_lines = protein_file.readlines()
		# results.append(
		# pool.apply_async(
		# _featurize_complex,
		# (self, mol_file, protein_pdb)))
		# #(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
		# pool.close()
		# features = []
		# for result in results:
		# features += result.get()
		# features = np.asarray(features)
		# return features

		def _featurize_complex(self, mol_pdb, complex_pdb):
		"""
		Calculate features for single mol/protein complex.

deepchem/molnet/load_function/pdbbind_datasets.py

+27 −5

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ import pandas as pd
		import logging
		import tarfile
		from deepchem.feat import rdkit_grid_featurizer as rgf
		from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates

		logger = logging.getLogger(__name__)

		@@ -50,10 +51,6 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		return deepchem.data.DiskDataset(dataset_dir), tasks


		def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		"""Loads and featurizes PDBBind dataset."""


		def load_pdbbind_grid(split="random",
		featurizer="grid",
		subset="core",
		@@ -138,7 +135,19 @@ def load_pdbbind_grid(split="random",


		def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		"""Load and featurize raw PDBBind dataset."""
		"""Load and featurize raw PDBBind dataset.

		Parameters
		----------
		featurizer: Str
		Either "grid" or "atomic" for grid and atomic featurizations.
		split: Str
		Either "random" or "index"
		subset: Str
		Only "core" or "refined" for now.
		"""
		pdbbind_tasks = ["-logKd/Ki"]
		data_dir = deepchem.utils.get_data_dir()
		if reload:
		@@ -208,6 +217,19 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		ecfp_power=ecfp_power,
		splif_power=splif_power,
		flatten=True)
		elif featurizer == "atomic":
		# Pulled from PDB files. For larger datasets with more PDBs, would use
		# max num atoms instead of exact.
		frag1_num_atoms = 60 # for ligand atoms
		frag2_num_atoms = 24000 # for protein atoms
		complex_num_atoms = 24060 # in total
		max_num_neighbors = 4
		# Cutoff in angstroms
		neighbor_cutoff = 4
		featurizer = ComplexNeighborListFragmentAtomicCoordinates(
		frag1_num_atoms, frag2_num_atoms, complex_num_atoms, max_num_neighbors,
		neighbor_cutoff)

		else:
		raise ValueError("Featurizer not supported")
		print("Featurizing Complexes")

Admin message