Fixes to nblist featurization (3cd38948) · Commits · 钟慕尧 / deepchem

deepchem/datasets/pdbbind_datasets.py

+61 −4

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.transformers import BalancingTransformer
		from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
		from deepchem.featurizers.grid_featurizer import GridFeaturizer
		from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

		def load_pdbbind_labels(labels_file):
		"""Loads pdbbind labels as dataframe"""
		@@ -44,9 +45,7 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
		for complex_featurizer in complex_featurizers:
		features = complex_featurizer.featurize_complexes(
		[ligand_file], [protein_file])
		################################################ DEBUG
		all_features.append(np.squeeze(features))
		################################################ DEBUG

		for compound_featurizer in compound_featurizers:
		features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
		@@ -55,6 +54,66 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
		features = np.concatenate(all_features)
		return features

		def compute_pdbbind_coordinate_features(
		complex_featurizer, pdb_subdir, pdb_code):
		"""Compute features for a given complex"""
		protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

		feature = complex_featurizer.featurize_complexes(
		[ligand_file], [protein_file])
		return feature

		def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
		"""Load PDBBind datasets. Does not do train/test split"""
		# Set some global variables up top
		reload = True
		verbosity = "high"
		model = "logistic"
		regen = False
		neighbor_cutoff = 4
		max_num_neighbors = 10

		# Create some directories for analysis
		# The base_dir holds the results of all analysis
		if not reload:
		if os.path.exists(base_dir):
		shutil.rmtree(base_dir)
		if not os.path.exists(base_dir):
		os.makedirs(base_dir)
		current_dir = os.path.dirname(os.path.realpath(__file__))
		#Make directories to store the raw and featurized datasets.
		data_dir = os.path.join(base_dir, "dataset")

		# Load PDBBind dataset
		labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
		pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
		tasks = ["-logKd/Ki"]
		print("About to load contents.")
		contents_df = load_pdbbind_labels(labels_file)
		ids = contents_df["PDB code"].values
		y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

		# Define featurizers
		featurizer = NeighborListComplexAtomicCoordinates(
		max_num_neighbors, neighbor_cutoff)

		# Featurize Dataset
		features = []
		for ind, pdb_code in enumerate(ids):
		print("Processing %s" % str(pdb_code))
		pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
		computed_feature = compute_pdbbind_coordinate_features(
		featurizer, pdb_subdir, pdb_code)
		features.append(computed_feature)
		X = np.array(features, dtype-object)
		w = np.ones_like(y)

		dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
		transformers = []

		return tasks, dataset, transformers

		def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
		"""Load PDBBind datasets. Does not do train/test split"""
		# Set some global variables up top
		@@ -113,9 +172,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
		continue
		y_inds.append(ind)
		features.append(computed_feature)
		############################################################# DEBUG
		y = y[y_inds]
		############################################################# DEBUG
		X = np.vstack(features)
		w = np.ones_like(y)

deepchem/featurizers/atomic_coordinates.py

+1 −11

Original line number	Diff line number	Diff line
		@@ -281,16 +281,6 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
		"""
		mol_coords, ob_mol = load_molecule(mol_pdb_file)
		protein_coords, protein_mol = load_molecule(protein_pdb_file)
		########################################################## DEBUG
		print("mol_pdb_file, protein_pdb_file")
		print(mol_pdb_file, protein_pdb_file)
		print("type(mol_coords), type(ob_mol)")
		print(type(mol_coords), type(ob_mol))
		print("type(protein_coords), type(protein_mol)")
		print(type(protein_coords), type(protein_mol))
		print("mol_coords.shape, protein_coords.shape")
		print(mol_coords.shape, protein_coords.shape)
		########################################################## DEBUG
		system_coords, system_mol = merge_molecules(
		mol_coords, ob_mol, protein_coords, protein_mol)

deepchem/featurizers/grid_featurizer.py

+16 −6

Original line number	Diff line number	Diff line
		@@ -57,15 +57,25 @@ def load_molecule(molecule_file, remove_hydrogens=True,
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, molecule_file)
		else:
		obConversion.ReadFile(ob_mol, str(molecule_file))
		elif ".sdf" in molecule_file:
		###################################################### DEBUG
		print("sdf file")
		###################################################### DEBUG
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("sdf"), str("sdf"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))
		elif ".pdb" in molecule_file:
		###################################################### DEBUG
		print("non mol2 file")
		print("pdb file")
		###################################################### DEBUG
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))
		else:
		raise ValueError("Unrecognized file type")
		###################################################### DEBUG
		print("ob_mol.NumAtoms()")
		print(ob_mol.NumAtoms())

deepchem/featurizers/tests/test_atomic_coordinates.py

+4 −0

Original line number	Diff line number	Diff line
		@@ -242,3 +242,7 @@ class TestAtomicCoordinates(unittest.TestCase):
		system_coords, system_neighbor_list = complex_featurizer._featurize_complex(
		ligand_file, protein_file)

		N = system_coords.shape[0]
		assert len(system_neighbor_list.keys()) == N
		for atom in range(N):
		assert len(system_neighbor_list[atom]) <= max_num_neighbors

Admin message