Merge pull request #196 from rbharath/grabbag (6ab9fefc) · Commits · 钟慕尧 / deepchem

deepchem/datasets/__init__.py

+17 −5

Original line number	Diff line number	Diff line
		@@ -221,11 +221,10 @@ class Dataset(object):
		def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
		n_samples = len(X)
		# The -1 lets numpy infer the second dimension of y (the number of tasks).
		######################################################### DEBUG
		if n_samples > 0:
		y = np.reshape(y, (n_samples, -1))
		######################################################### DEBUG
		#y = np.reshape(y, (n_samples, -1))
		if w is not None:
		w = np.reshape(w, (n_samples, -1))
		n_tasks = y.shape[1]
		if ids is None:
		ids = np.arange(n_samples)
		@@ -236,13 +235,26 @@ class Dataset(object):
		raw_data = (ids, X, y, w)
		return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)

		@staticmethod
		def merge(merge_dir, datasets):
		  """Merges provided datasets into a single dataset written to merge_dir.

		  Args:
		    merge_dir: Directory passed to Dataset.from_numpy for the merged data.
		    datasets: Iterable of datasets. Each must provide to_numpy(), returning
		      an (X, y, w, ids) tuple, and get_task_names().

		  Returns:
		    The result of Dataset.from_numpy on the row-stacked X, y, w arrays and
		    the concatenated ids.

		  Raises:
		    ValueError: If no datasets are provided.
		  """
		  datasets = list(datasets)
		  if not datasets:
		    # Fail with an explicit message instead of numpy's generic complaint
		    # about stacking an empty sequence.
		    raise ValueError("Cannot merge an empty collection of datasets.")
		  Xs, ys, ws, all_ids = zip(*(dataset.to_numpy() for dataset in datasets))
		  # Task names come from the last dataset, matching the previous behaviour.
		  # NOTE(review): nothing here checks that all datasets share the same tasks.
		  tasks = datasets[-1].get_task_names()
		  X, y, w, ids = (
		      np.vstack(Xs), np.vstack(ys), np.vstack(ws), np.concatenate(all_ids))
		  return Dataset.from_numpy(merge_dir, X, y, w, ids, tasks)

		def select(self, select_dir, indices):
		  """Builds a new dataset from the rows of self picked out by indices.

		  The indices are converted to an integer numpy array and used to index
		  each of the X, y, w and ids arrays returned by self.to_numpy(). The
		  selected arrays, together with this dataset's task names, are handed to
		  Dataset.from_numpy with select_dir as the target directory.
		  """
		  row_idx = np.array(indices).astype(int)
		  arrays = self.to_numpy()
		  task_names = self.get_task_names()
		  X_sel, y_sel, w_sel, ids_sel = [column[row_idx] for column in arrays]
		  return Dataset.from_numpy(
		      select_dir, X_sel, y_sel, w_sel, ids_sel, task_names)

deepchem/datasets/pdbbind_datasets.py

0 → 100644

+126 −0

Original line number	Diff line number	Diff line
		"""
		PDBBind dataset loader.
		"""

		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import numpy as np
		import pandas as pd
		import shutil
		from rdkit import Chem
		from deepchem.utils.save import load_from_disk
		from deepchem.datasets import Dataset
		from deepchem.featurizers.featurize import DataFeaturizer
		from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.transformers import BalancingTransformer
		from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
		from deepchem.featurizers.grid_featurizer import GridFeaturizer
		#from deepchem.featurizers.nnscore_utils import hydrogenate_and_compute_partial_charges

		def load_pdbbind_labels(labels_file):
		  """Loads a PDBBind index file into a dataframe of string fields.

		  Lines starting with "#" and blank lines are skipped. Every other line is
		  split on whitespace into at most eight fields, so any extra trailing
		  tokens stay inside the final "ligand name" column instead of producing a
		  row that is too long for the dataframe.

		  Args:
		    labels_file: Path of the index file to read.

		  Returns:
		    A pandas DataFrame with one row per data line and the columns
		    "PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
		    "ignore-this-field", "reference" and "ligand name". All values are
		    left as strings.
		  """
		  columns = ["PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
		             "ignore-this-field", "reference", "ligand name"]
		  # One split fewer than there are columns: the remainder of the line
		  # (including any free-text remarks) lands in the last field.
		  max_splits = len(columns) - 1
		  contents = []
		  with open(labels_file) as f:
		    for line in f:
		      line = line.strip()
		      if not line or line.startswith("#"):
		        continue
		      contents.append(line.split(None, max_splits))
		  return pd.DataFrame(contents, columns=columns)

		def compute_pdbbind_feature(compound_featurizers, complex_featurizers,
		                            pdb_subdir, pdb_code):
		  """Computes the concatenated feature vector for one protein-ligand complex.

		  The protein is expected at "<pdb_code>_protein.pdb" and the ligand at
		  "<pdb_code>_ligand.sdf" inside pdb_subdir.

		  Args:
		    compound_featurizers: Featurizers exposing featurize(mols); each is
		      called with a one-element list holding the ligand molecule and its
		      output is squeezed with np.squeeze.
		    complex_featurizers: Featurizers exposing
		      featurize_complexes(ligand_files, protein_files); each is called with
		      one-element lists of the ligand and protein paths.
		    pdb_subdir: Directory holding the files of this complex.
		    pdb_code: Code used to build the two file names.

		  Returns:
		    np.concatenate of all complex features followed by all compound
		    features.
		    NOTE(review): np.concatenate raises ValueError when both featurizer
		    lists are empty -- confirm whether callers rely on an empty result.
		  """
		  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
		  # Only the first molecule of the SDF file is used. The builtin next()
		  # works on Python 2 and 3, unlike calling the supplier's .next() method.
		  rdkit_mol = next(Chem.SDMolSupplier(str(ligand_file)))

		  all_features = [
		      complex_featurizer.featurize_complexes([ligand_file], [protein_file])
		      for complex_featurizer in complex_featurizers]
		  all_features.extend(
		      np.squeeze(compound_featurizer.featurize([rdkit_mol]))
		      for compound_featurizer in compound_featurizers)

		  return np.concatenate(all_features)

		def load_pdbbind(pdbbind_dir, base_dir, reload=True):
		  """Loads and featurizes the PDBBind core set. Does not do train/test split.

		  Args:
		    pdbbind_dir: Directory containing the "INDEX_core_data.2013" labels file
		      and the "website-core-set" folder with one subdirectory per PDB code.
		    base_dir: Output directory, created if missing. The dataset itself is
		      written to the "dataset" subdirectory.
		    reload: When False, an existing base_dir is deleted before anything is
		      written. The argument used to be overwritten with True inside the
		      function, so passing False had no effect; it is now honoured.

		  Returns:
		    A (tasks, dataset, transformers) tuple: the task name list
		    ["-logKd/Ki"], the Dataset built by Dataset.from_numpy, and an empty
		    list of transformers.
		  """
		  # Start from a clean output directory unless reuse was requested.
		  if not reload and os.path.exists(base_dir):
		    shutil.rmtree(base_dir)
		  if not os.path.exists(base_dir):
		    os.makedirs(base_dir)
		  # Directory that stores the featurized dataset.
		  data_dir = os.path.join(base_dir, "dataset")

		  # Load PDBBind labels
		  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
		  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
		  tasks = ["-logKd/Ki"]
		  print("About to load contents.")
		  contents_df = load_pdbbind_labels(labels_file)
		  ids = contents_df["PDB code"].values
		  # Labels are read as strings, so convert them to floats explicitly.
		  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

		  # Define featurizers
		  grid_featurizer = GridFeaturizer(
		      voxel_width=16.0, feature_types="voxel_combined",
		      voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
		                           "salt_bridge"], ecfp_power=9, splif_power=9,
		      parallel=True, flatten=True)
		  compound_featurizers = [CircularFingerprint(size=1024)]
		  complex_featurizers = [grid_featurizer]

		  # Featurize Dataset
		  features = []
		  for pdb_code in ids:
		    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
		    computed_feature = compute_pdbbind_feature(
		        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
		    if len(computed_feature) == 0:
		      # NOTE(review): 1024 equals the fingerprint size only; once grid
		      # features are concatenated the other rows are presumably longer and
		      # np.vstack below would reject this placeholder -- confirm.
		      computed_feature = np.zeros(1024)
		    features.append(computed_feature)
		  X = np.vstack(features)
		  # Every sample gets unit weight.
		  w = np.ones_like(y)

		  print("About to call Dataset.from_numpy()")
		  print("X.shape, y.shape, w.shape, ids.shape")
		  print(X.shape, y.shape, w.shape, ids.shape)

		  # NOTE(review): tasks is returned to the caller but not forwarded to
		  # Dataset.from_numpy, so the dataset keeps from_numpy's default task
		  # names -- confirm whether that is intended.
		  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
		  transformers = []

		  return tasks, dataset, transformers

deepchem/datasets/tests/test_merge.py

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		"""
		Testing singletask/multitask dataset merging
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "GPL"

		import os
		import shutil
		import tempfile
		import numpy as np
		from deepchem.models.tests import TestAPI
		from deepchem.utils.save import load_from_disk
		from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.featurizers.featurize import DataFeaturizer
		from deepchem.datasets import Dataset

		class TestMerge(TestAPI):
		  """
		  Checks that featurized datasets can be combined with Dataset.merge.
		  """
		  def test_move_load(self):
		    """Merging two featurized copies of example.csv sums their lengths."""
		    # self.base_dir is presumably provided by the TestAPI fixture -- it is
		    # not defined in this file.
		    here = os.path.dirname(os.path.realpath(__file__))
		    csv_path = os.path.join(here, "../../models/tests/example.csv")
		    first_dir, second_dir, merged_dir = [
		        os.path.join(self.base_dir, name)
		        for name in ("first_dataset", "second_dataset", "merged_data")]

		    loader = DataFeaturizer(
		        tasks=["log-solubility"],
		        smiles_field="smiles",
		        featurizers=[CircularFingerprint(size=1024)],
		        verbosity="high")
		    # Featurize the same input twice, into two separate directories.
		    first_dataset = loader.featurize(csv_path, first_dir)
		    second_dataset = loader.featurize(csv_path, second_dir)

		    merged_dataset = Dataset.merge(
		        merged_dir, [first_dataset, second_dataset])

		    expected_len = len(first_dataset) + len(second_dataset)
		    assert len(merged_dataset) == expected_len

deepchem/featurizers/featurize.py

+66 −81

Original line number	Diff line number	Diff line
		@@ -16,8 +16,7 @@ from functools import partial
		from rdkit import Chem
		from deepchem.utils.save import log
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import load_pandas_from_disk
		from deepchem.utils.save import load_pickle_from_disk
		from deepchem.featurizers import Featurizer, ComplexFeaturizer
		from deepchem.featurizers import UserDefinedFeaturizer
		from deepchem.datasets import Dataset
		@@ -38,19 +37,25 @@ def _process_field(val):
		else:
		raise ValueError("Field of unrecognized type: %s" % str(val))

		def load_data(input_file):
		"""Loads data from disk."""
		def load_data(input_file, shard_size=None):
		"""Loads data from disk.

		For CSV files, supports sharded loading for large files.
		"""
		input_type = _get_input_type(input_file)
		if input_type == "sdf":
		raw_df = _load_sdf_file(input_file)
		else:
		raw_df = _load_csv_file(input_file)
		return raw_df
		if shard_size is not None:
		raise ValueError("shard_size must be None for sdf input.")
		return _load_sdf_file(input_file)
		elif input_type == "csv":
		return _load_csv_file(input_file, shard_size)
		elif input_type == "pandas-pickle":
		return [load_pickle_from_disk(input_file)]

		def _load_sdf_file(input_file):
		"""Load SDF file into dataframe."""
		# Tasks are stored in .sdf.csv file
		raw_df = load_pandas_from_disk(input_file+".csv")
		raw_df = _load_csv_file(input_file+".csv", shard_size=None).next()
		# Structures are stored in .sdf file
		print("Reading structures from %s." % input_file)
		suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
		@@ -61,12 +66,17 @@ def _load_sdf_file(input_file):
		df_rows.append([ind,smiles,mol])
		mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'))
		raw_df = pd.concat([mol_df, raw_df], axis=1, join='inner')
		return raw_df
		return [raw_df]

		def _load_csv_file(input_file):
		"""Loads CSV file into dataframe."""
		raw_df = load_pandas_from_disk(input_file)
		return raw_df
		def _load_csv_file(filename, shard_size=None):
		"""Load data as pandas dataframe."""
		# First line of user-specified CSV must be header.
		if shard_size is None:
		yield pd.read_csv(filename)
		else:
		for df in pd.read_csv(filename, chunksize=shard_size):
		df = df.replace(np.nan, str(""), regex=True)
		yield df

		def _get_input_type(input_file):
		"""Get type of input file. Must be csv/pkl.gz/sdf file."""
		@@ -85,28 +95,6 @@ def _get_input_type(input_file):
		else:
		raise ValueError("Unrecognized extension %s" % file_extension)

		def _get_fields(input_file):
		"""Get the names of fields and field_types for input data."""
		# If CSV input, assume that first row contains labels
		input_type = _get_input_type(input_file)
		if input_type == "csv":
		with open(input_file, "rb") as inp_file_obj:
		return csv.reader(inp_file_obj).next()
		elif input_type == "pandas-joblib":
		df = load_from_disk(input_file)
		return df.keys()
		elif input_type == "pandas-pickle":
		df = load_pickle_from_disk(input_file)
		return df.keys()
		# If SDF input, assume that .sdf.csv file contains labels
		elif input_type == "sdf":
		label_file = input_file + ".csv"
		print("Reading labels from %s" % label_file)
		with open(label_file, "rb") as inp_file_obj:
		return inp_file_obj.readline()
		else:
		raise ValueError("Unrecognized extension for %s" % input_file)

		class DataFeaturizer(object):
		"""
		Handles loading/featurizing of chemical samples (datapoints).
		@@ -115,7 +103,7 @@ class DataFeaturizer(object):
		dataframe object to disk as output.
		"""

		def __init__(self, tasks, smiles_field,
		def __init__(self, tasks, smiles_field=None,
		id_field=None, threshold=None,
		protein_pdb_field=None, ligand_pdb_field=None,
		ligand_mol2_field=None, mol_field=None,
		@@ -148,61 +136,57 @@ class DataFeaturizer(object):
		"""Featurize provided file and write to specified location."""
		log("Loading raw samples now.", self.verbosity)

		raw_df = load_data(input_file)
		fields = raw_df.keys()
		log("Loaded raw data frame from file.", self.verbosity)
		log("About to preprocess samples.", self.verbosity)

		if not os.path.exists(data_dir):
		os.makedirs(data_dir)

		def process_raw_sample_helper(row, fields, input_type):
		return self._process_raw_sample(input_type, row, fields)
		# Construct partial function to write datasets.
		write_fn = partial(
		Dataset.write_dataframe, data_dir=data_dir,
		featurizers=self.featurizers, tasks=self.tasks)
		input_type = _get_input_type(input_file)
		process_raw_sample_helper_partial = partial(process_raw_sample_helper,
		fields=fields,
		input_type=input_type)

		metadata_rows = []
		for shard_num, raw_df_shard in enumerate(load_data(input_file, shard_size)):
		log("Loaded shard %d of size %s from file." % (shard_num+1, str(shard_size)),
		self.verbosity)
		log("About to featurize shard.", self.verbosity)

		nb_sample = raw_df.shape[0]
		interval_points = np.linspace(
		0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
		def process_helper(row, fields, input_type):
		return self._process_raw_sample(input_type, row, fields)
		process_fn = partial(process_helper, fields=raw_df_shard.keys(),
		input_type=input_type)

		metadata_rows = []
		# Construct partial function to write datasets.
		write_dataframe_partial = partial(
		Dataset.write_dataframe, data_dir=data_dir,
		featurizers=self.featurizers, tasks=self.tasks)
		metadata_rows.append(self._featurize_shard(
		raw_df_shard, process_fn, write_fn, shard_num, input_type))

		for j in range(len(interval_points)-1):
		log("Sharding and standardizing into shard-%s / %s shards"
		% (str(j+1), len(interval_points)-1), self.verbosity)
		raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
		raw_df_shard = raw_df_shard.apply(
		process_raw_sample_helper_partial, axis=1, reduce=False)
		# TODO(rbharath): This whole bit with metadata_rows is an awkward way of
		# creating a Dataset. Is there a more elegant solutions?
		dataset = Dataset(data_dir=data_dir,
		metadata_rows=metadata_rows,
		reload=reload, verbosity=self.verbosity)
		return dataset

		df = self._standardize_df(raw_df_shard)
		def _featurize_shard(self, raw_df_shard, process_fn, write_fn, shard_num, input_type):
		"""Featurizes a shard of an input dataframe."""
		log("Applying processing transformation to shard.",
		self.verbosity)
		raw_df_shard = raw_df_shard.apply(
		process_fn, axis=1, reduce=False)
		log("About to standardize dataframe.")
		df_shard = self._standardize_df(raw_df_shard)

		field = "mol" if input_type == "sdf" else "smiles"
		for featurizer in self.featurizers:
		log("Currently featurizing feature_type: %s"
		% featurizer.__class__.__name__, self.verbosity)
		if isinstance(featurizer, UserDefinedFeaturizer):
		self._add_user_specified_features(df, featurizer)
		self._add_user_specified_features(df_shard, featurizer)
		elif isinstance(featurizer, Featurizer):
		self._featurize_mol(df, featurizer, field=field,
		worker_pool=worker_pool)
		self._featurize_mol(df_shard, featurizer, field=field)
		elif isinstance(featurizer, ComplexFeaturizer):
		self._featurize_complexes(df, featurizer,
		worker_pool=worker_pool)
		basename = "shard-%d" % j
		metadata_rows.append(write_dataframe_partial((basename, df)))

		dataset = Dataset(data_dir=data_dir,
		metadata_rows=metadata_rows,
		reload=reload, verbosity=self.verbosity)

		return dataset
		self._featurize_complexes(df_shard, featurizer)
		basename = "shard-%d" % shard_num
		return write_fn((basename, df_shard))

		def _shard_files_exist(self, feature_dir):
		"""Checks if data shard files already exist."""
		@@ -239,6 +223,7 @@ class DataFeaturizer(object):
		"""
		df = pd.DataFrame(ori_df[[self.id_field]])
		df.columns = ["mol_id"]
		if self.smiles_field is not None:
		df["smiles"] = ori_df[[self.smiles_field]]
		for task in self.tasks:
		df[task] = ori_df[[task]]

deepchem/featurizers/grid_featurizer.py

+92 −79

File changed.

Preview size limit exceeded, changes collapsed.

Admin message