Some bugfixes (b9b21c60) · Commits · 钟慕尧 / deepchem

deepchem/datasets/init.py

+2 −1

Original line number	Diff line number	Diff line
		@@ -229,6 +229,7 @@ class Dataset(object):
		# The -1 indicates that y will be reshaped to have length -1
		if n_samples > 0:
		y = np.reshape(y, (n_samples, -1))
		if w is not None:
		w = np.reshape(w, (n_samples, -1))
		n_tasks = y.shape[1]
		if ids is None:

deepchem/datasets/pdbbind_datasets.py

0 → 100644

+126 −0

Original line number	Diff line number	Diff line
		"""
		PDBBind dataset loader.
		"""

		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import numpy as np
		import pandas as pd
		import shutil
		from rdkit import Chem
		from deepchem.utils.save import load_from_disk
		from deepchem.datasets import Dataset
		from deepchem.featurizers.featurize import DataFeaturizer
		from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.transformers import BalancingTransformer
		from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
		from deepchem.featurizers.grid_featurizer import GridFeaturizer
		#from deepchem.featurizers.nnscore_utils import hydrogenate_and_compute_partial_charges

		def load_pdbbind_labels(labels_file):
		    """Load a PDBBind index file into a dataframe of string fields.

		    Lines that are blank or start with "#" are skipped. Every remaining
		    line is split on whitespace into at most eight fields, so any
		    whitespace inside the trailing "ligand name" field is kept in that
		    field instead of spilling into extra, unnamed columns.

		    Parameters
		    ----------
		    labels_file: str
		        Path to the index file to read.

		    Returns
		    -------
		    pandas.DataFrame
		        One row per data line with the columns "PDB code", "resolution",
		        "release year", "-logKd/Ki", "Kd/Ki", "ignore-this-field",
		        "reference" and "ligand name". Values are the raw strings read
		        from the file; no numeric conversion is done here.
		    """
		    columns = ["PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
		               "ignore-this-field", "reference", "ligand name"]
		    contents = []
		    with open(labels_file) as f:
		        for line in f:
		            # Strip first: with a bounded split the last field would otherwise
		            # keep the trailing newline.
		            line = line.strip()
		            if not line or line.startswith("#"):
		                continue
		            contents.append(line.split(None, len(columns) - 1))
		    return pd.DataFrame(contents, columns=columns)

		def compute_pdbbind_feature(compound_featurizers, complex_featurizers,
		                            pdb_subdir, pdb_code):
		    """Compute the concatenated feature vector for one PDBBind complex.

		    The protein is expected at "<pdb_subdir>/<pdb_code>_protein.pdb" and the
		    ligand at "<pdb_subdir>/<pdb_code>_ligand.sdf".

		    Parameters
		    ----------
		    compound_featurizers: list
		        Objects with a `featurize(mols)` method; each is called with a
		        one-element list holding the first molecule of the ligand SDF file.
		    complex_featurizers: list
		        Objects with a `featurize_complexes(ligand_files, protein_files)`
		        method; each is called with one-element lists of file paths.
		    pdb_subdir: str
		        Directory holding the files of this complex.
		    pdb_code: str
		        PDB identifier used as the file-name prefix.

		    Returns
		    -------
		    numpy.ndarray
		        Complex-featurizer outputs followed by the squeezed
		        compound-featurizer outputs, joined with `np.concatenate`. An empty
		        array is returned when no featurizer is supplied.
		    """
		    protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		    ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

		    all_features = []
		    for complex_featurizer in complex_featurizers:
		        features = complex_featurizer.featurize_complexes(
		            [ligand_file], [protein_file])
		        all_features.append(features)

		    # Only parse the ligand when a compound featurizer actually needs it.
		    if compound_featurizers:
		        # Builtin next() works on both Python 2 and 3; the .next() method
		        # spelling is Python 2 only.
		        rdkit_mol = next(Chem.SDMolSupplier(str(ligand_file)))
		        for compound_featurizer in compound_featurizers:
		            features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
		            all_features.append(features)

		    # np.concatenate raises on an empty sequence; hand back an empty vector
		    # so callers can test len(result) == 0.
		    if not all_features:
		        return np.array([])
		    return np.concatenate(all_features)

		def load_pdbbind(pdbbind_dir, base_dir, reload=True):
		    """Load the PDBBind core set as a single Dataset (no train/test split).

		    Labels are read from "INDEX_core_data.2013" and structures from the
		    "website-core-set" subdirectory, both located under `pdbbind_dir`. Each
		    complex is featurized with a grid featurizer followed by a circular
		    fingerprint of the ligand.

		    Parameters
		    ----------
		    pdbbind_dir: str
		        Directory containing the index file and the per-complex folders.
		    base_dir: str
		        Working directory; the dataset is written to "<base_dir>/dataset".
		        It is created when missing.
		    reload: bool
		        When False, an existing `base_dir` is deleted before use. The
		        default (True) keeps whatever is already there.

		    Returns
		    -------
		    tuple
		        `(tasks, dataset, transformers)` where `tasks` is `["-logKd/Ki"]`,
		        `dataset` is the result of `Dataset.from_numpy` and `transformers`
		        is an empty list.
		    """
		    # Length of the circular fingerprint; also used for the zero-vector
		    # fallback when featurization yields nothing.
		    fingerprint_size = 1024

		    # The base_dir holds the results of all analysis. Honor the caller's
		    # `reload` flag instead of overriding it.
		    if not reload and os.path.exists(base_dir):
		        shutil.rmtree(base_dir)
		    if not os.path.exists(base_dir):
		        os.makedirs(base_dir)
		    data_dir = os.path.join(base_dir, "dataset")

		    # Load PDBBind labels.
		    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
		    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
		    tasks = ["-logKd/Ki"]
		    print("About to load contents.")
		    contents_df = load_pdbbind_labels(labels_file)
		    ids = contents_df["PDB code"].values
		    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

		    # Define featurizers.
		    grid_featurizer = GridFeaturizer(
		        voxel_width=16.0, feature_types="voxel_combined",
		        voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
		                             "salt_bridge"], ecfp_power=9, splif_power=9,
		        parallel=True, flatten=True)
		    compound_featurizers = [CircularFingerprint(size=fingerprint_size)]
		    complex_featurizers = [grid_featurizer]

		    # Featurize every complex listed in the index file.
		    features = []
		    for pdb_code in ids:
		        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
		        computed_feature = compute_pdbbind_feature(
		            compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
		        if len(computed_feature) == 0:
		            computed_feature = np.zeros(fingerprint_size)
		        features.append(computed_feature)
		    X = np.vstack(features)
		    # Every sample gets unit weight.
		    w = np.ones_like(y)

		    print("About to call Dataset.from_numpy()")
		    print("X.shape, y.shape, w.shape, ids.shape")
		    print(X.shape, y.shape, w.shape, ids.shape)

		    dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
		    transformers = []

		    return tasks, dataset, transformers

deepchem/featurizers/featurize.py

+29 −29

Original line number	Diff line number	Diff line
		@@ -16,7 +16,7 @@ from functools import partial
		from rdkit import Chem
		from deepchem.utils.save import log
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import load_pickle_from_disk
		from deepchem.featurizers import Featurizer, ComplexFeaturizer
		from deepchem.featurizers import UserDefinedFeaturizer
		from deepchem.datasets import Dataset
		@@ -47,13 +47,15 @@ def load_data(input_file, shard_size=None):
		if shard_size is not None:
		raise ValueError("shard_size must be None for sdf input.")
		return _load_sdf_file(input_file)
		else:
		elif input_type == "csv":
		return _load_csv_file(input_file, shard_size)
		elif input_type == "pandas-pickle":
		return [load_pickle_from_disk(input_file)]

		def _load_sdf_file(input_file):
		"""Load SDF file into dataframe."""
		# Tasks are stored in .sdf.csv file
		raw_df = load_pandas_from_disk(input_file+".csv")
		raw_df = _load_csv_file(input_file+".csv")
		# Structures are stored in .sdf file
		print("Reading structures from %s." % input_file)
		suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
		@@ -88,27 +90,27 @@ def _get_input_type(input_file):
		else:
		raise ValueError("Unrecognized extension %s" % file_extension)

		def _get_fields(input_file):
		"""Get the names of fields and field_types for input data."""
		# If CSV input, assume that first row contains labels
		input_type = _get_input_type(input_file)
		if input_type == "csv":
		with open(input_file, "rb") as inp_file_obj:
		return csv.reader(inp_file_obj).next()
		elif input_type == "pandas-joblib":
		df = load_from_disk(input_file)
		return df.keys()
		elif input_type == "pandas-pickle":
		df = load_pickle_from_disk(input_file)
		return df.keys()
		# If SDF input, assume that .sdf.csv file contains labels
		elif input_type == "sdf":
		label_file = input_file + ".csv"
		print("Reading labels from %s" % label_file)
		with open(label_file, "rb") as inp_file_obj:
		return inp_file_obj.readline()
		else:
		raise ValueError("Unrecognized extension for %s" % input_file)
		#def _get_fields(input_file):
		# """Get the names of fields and field_types for input data."""
		# # If CSV input, assume that first row contains labels
		# input_type = _get_input_type(input_file)
		# if input_type == "csv":
		# with open(input_file, "rb") as inp_file_obj:
		# return csv.reader(inp_file_obj).next()
		# elif input_type == "pandas-joblib":
		# df = load_from_disk(input_file)
		# return df.keys()
		# elif input_type == "pandas-pickle":
		# df = load_pickle_from_disk(input_file)
		# return df.keys()
		# # If SDF input, assume that .sdf.csv file contains labels
		# elif input_type == "sdf":
		# label_file = input_file + ".csv"
		# print("Reading labels from %s" % label_file)
		# with open(label_file, "rb") as inp_file_obj:
		# return inp_file_obj.readline()
		# else:
		# raise ValueError("Unrecognized extension for %s" % input_file)

		class DataFeaturizer(object):
		"""
		@@ -162,7 +164,7 @@ class DataFeaturizer(object):

		metadata_rows = []
		for shard_num, raw_df_shard in enumerate(load_data(input_file, shard_size)):
		log("Loaded shard %d of size %d from file." % (shard_num+1, shard_size),
		log("Loaded shard %d of size %s from file." % (shard_num+1, str(shard_size)),
		self.verbosity)
		log("About to featurize shard.", self.verbosity)

		@@ -197,11 +199,9 @@ class DataFeaturizer(object):
		if isinstance(featurizer, UserDefinedFeaturizer):
		self._add_user_specified_features(df_shard, featurizer)
		elif isinstance(featurizer, Featurizer):
		self._featurize_mol(df_shard, featurizer, field=field,
		worker_pool=worker_pool)
		self._featurize_mol(df_shard, featurizer, field=field)
		elif isinstance(featurizer, ComplexFeaturizer):
		self._featurize_complexes(df_shard, featurizer,
		worker_pool=worker_pool)
		self._featurize_complexes(df_shard, featurizer)
		basename = "shard-%d" % shard_num
		return write_fn((basename, df_shard))

deepchem/featurizers/tests/test_sdf_reader.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -44,7 +44,7 @@ class TestFeaturizedSamples(TestAPI):
		featurizers=featurizers,
		verbosity="low")

		dataset = featurizer.featurize(input_file, self.data_dir)
		dataset = featurizer.featurize(input_file, self.data_dir, shard_size=None)

		# Splits featurized samples into train/test
		splitter = RandomSplitter()

Admin message