Merge pull request #219 from rbharath/object_featurizers (c1302d05) · Commits · 钟慕尧 / deepchem

deepchem/datasets/init.py

+115 −75

Original line number	Diff line number	Diff line
		@@ -73,11 +73,17 @@ class Dataset(object):
		# TODO(rbharath): This is a hack. clean up.
		if not len(df):
		return None
		if hasattr(featurizer, "dtype"):
		dtype = featurizer.dtype
		compute_feature_statistics = False
		else:
		dtype = float
		compute_feature_statistics = True
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
		verbosity)
		dtype, verbosity)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
		@@ -88,7 +94,9 @@ class Dataset(object):
		assert X.shape[0] == y.shape[0]
		assert y.shape == w.shape
		assert len(ids) == X.shape[0]
		return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)
		return Dataset.write_data_to_disk(
		data_dir, basename, tasks, X, y, w, ids,
		compute_feature_statistics=compute_feature_statistics)

		@staticmethod
		def construct_metadata(metadata_entries):
		@@ -107,7 +115,8 @@ class Dataset(object):
		return metadata_df

		@staticmethod
		def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
		def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
		compute_feature_statistics=True):
		out_X = "%s-X.joblib" % basename
		out_X_transformed = "%s-X-transformed.joblib" % basename
		out_X_sums = "%s-X_sums.joblib" % basename
		@@ -125,6 +134,11 @@ class Dataset(object):
		if X is not None:
		save_to_disk(X, os.path.join(data_dir, out_X))
		save_to_disk(X, os.path.join(data_dir, out_X_transformed))
		if compute_feature_statistics:
		########################################################## DEBUG
		print("compute_feature_statistics")
		print(compute_feature_statistics)
		########################################################## DEBUG
		X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
		save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
		save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
		@@ -449,7 +463,9 @@ class Dataset(object):
		"""Sets verbosity."""
		self.verbosity = new_verbosity

		def select(self, select_dir, indices):
		# TODO(rbharath): This change for general object types seems a little
		# kludgey. Is there a more principled approach to support general objects?
		def select(self, select_dir, indices, compute_feature_statistics=False):
		"""Creates a new dataset from a selection of indices from self."""
		if not os.path.exists(select_dir):
		os.makedirs(select_dir)
		@@ -478,8 +494,10 @@ class Dataset(object):
		ids_sel = ids[shard_indices]
		basename = "dataset-%d" % shard_num
		metadata_rows.append(
		Dataset.write_data_to_disk(select_dir, basename, tasks,
		X_sel, y_sel, w_sel, ids_sel))
		Dataset.write_data_to_disk(
		select_dir, basename, tasks,
		X_sel, y_sel, w_sel, ids_sel,
		compute_feature_statistics=compute_feature_statistics))
		# Updating counts
		indices_count += num_shard_elts
		count += shard_len
		@@ -603,20 +621,33 @@ class Dataset(object):
		"""Return pandas series of label stds."""
		return self.metadata_df["y_stds"]

		def get_statistics(self):
		def get_statistics(self, X_stats=True, y_stats=True):
		"""Computes and returns statistics of this dataset"""
		if len(self) == 0:
		return None, None, None, None
		self.update_moments()
		self.update_moments(X_stats, y_stats)
		df = self.metadata_df
		X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
		if X_stats and not y_stats:
		X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
		return X_means, X_stds
		elif y_stats and not X_stats:
		y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
		return y_means, y_stds
		elif X_stats and y_stats:
		X_means, X_stds = self._compute_mean_and_std(
		df, X_stats=True, y_stats=False)
		y_means, y_stds = self._compute_mean_and_std(
		df, X_stats=False, y_stats=True)
		return X_means, X_stds, y_means, y_stds
		else:
		return None

		def _compute_mean_and_std(self, df):
		def _compute_mean_and_std(self, df, X_stats, y_stats):
		"""
		Compute means/stds of X/y from sums/sum_squares of tensors.
		"""

		if X_stats:
		X_sums = []
		X_sum_squares = []
		X_n = []
		@@ -637,7 +668,9 @@ class Dataset(object):
		overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

		X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
		return overall_X_means, np.sqrt(X_vars)

		if y_stats:
		y_sums = []
		y_sum_squares = []
		y_n = []
		@@ -655,17 +688,18 @@ class Dataset(object):
		y_sum_squares = np.vstack(y_sum_squares)
		y_means = np.sum(y_sums, axis=0)/y_n
		y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
		return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
		return y_means, np.sqrt(y_vars)

		def update_moments(self):
		def update_moments(self, X_stats, y_stats):
		"""Re-compute statistics of this dataset during transformation"""
		df = self.metadata_df
		self._update_mean_and_std(df)
		self._update_mean_and_std(df, X_stats, y_stats)

		def _update_mean_and_std(self, df):
		def _update_mean_and_std(self, df, X_stats, y_stats):
		"""
		Compute means/stds of X/y from sums/sum_squares of tensors.
		"""
		if X_stats:
		X_transform = []
		for _, row in df.iterrows():
		Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
		@@ -674,6 +708,7 @@ class Dataset(object):
		save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
		save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))

		if y_stats:
		y_transform = []
		for _, row in df.iterrows():
		yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
		@@ -717,7 +752,6 @@ class Dataset(object):

		return grad, ydely_means


		def compute_sums_and_nb_sample(tensor, W=None):
		"""
		Computes sums, squared sums of tensor along axis 0.
		@@ -749,7 +783,8 @@ def compute_sums_and_nb_sample(tensor, W=None):

		# The following are all associated with Dataset, but are separate functions to
		# make it easy to use multiprocessing.
		def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
		def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
		verbosity=None):
		"""Transforms a dataframe containing deepchem input into numpy arrays"""
		if feature_type not in df.keys():
		raise ValueError(
		@@ -808,4 +843,9 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
		w = w[valid_inds]
		# Adding this assertion in to avoid ill-formed outputs.
		assert len(sorted_ids) == len(x) == len(y) == len(w)
		if dtype == float:
		return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
		elif dtype == object:
		return sorted_ids, x, y.astype(float), w.astype(float)
		else:
		raise ValueError("Unrecognized dtype for featurizer.")

deepchem/featurizers/atomic_coordinates.py

+7 −2

Original line number	Diff line number	Diff line
		@@ -173,6 +173,9 @@ class NeighborListAtomicCoordinates(Featurizer):
		if neighbor_cutoff <= 0:
		raise ValueError("neighbor_cutoff must be positive value.")
		self.neighbor_cutoff = neighbor_cutoff
		# Type of data created by this featurizer
		self.dtype = object
		self.coordinates_featurizer = AtomicCoordinates()

		def _featurize(self, mol):
		"""
		@@ -182,6 +185,8 @@ class NeighborListAtomicCoordinates(Featurizer):
		----------
		"""
		N = mol.GetNumAtoms()
		# TODO(rbharath): Should this return a list?
		bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
		coords = get_coords(mol)

		x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
		@@ -217,6 +222,6 @@ class NeighborListAtomicCoordinates(Featurizer):
		if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
		neighbor_list[atom].add(neighbor_atom)

		neighbor_list[atom] = list(neighbor_list[atom])
		neighbor_list[atom] = sorted(list(neighbor_list[atom]))

		return neighbor_list
		return (bohr_coords, neighbor_list)

deepchem/featurizers/tests/test_atomic_coordinates.py

+3 −4

Original line number	Diff line number	Diff line
		@@ -159,7 +159,7 @@ class TestAtomicCoordinates(unittest.TestCase):
		x_bins, y_bins, z_bins = get_cells(coords, nblist_featurizer.neighbor_cutoff)

		nblist_featurizer = NeighborListAtomicCoordinates()
		nblist = nblist_featurizer._featurize(self.mol)
		nblist = nblist_featurizer._featurize(self.mol)[1]
		assert isinstance(nblist, dict)
		assert len(nblist.keys()) == N
		for (atom, neighbors) in nblist.items():
		@@ -185,13 +185,12 @@ class TestAtomicCoordinates(unittest.TestCase):

		# Test with a near-zero cutoff of 0.1 angstroms (the featurizer rejects cutoffs <= 0). There should be no neighbors in this case.
		nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=.1)
		nblist = nblist_featurizer._featurize(self.mol)
		nblist = nblist_featurizer._featurize(self.mol)[1]
		for atom in range(N):
		assert len(nblist[atom]) == 0

		# Test with cutoff 100 angstroms. Everything should be neighbors now.
		nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=100)
		nblist = nblist_featurizer._featurize(self.mol)
		nblist = nblist_featurizer._featurize(self.mol)[1]
		for atom in range(N):
		assert len(nblist[atom]) == N-1

deepchem/models/tensorflow_models/init.py

+7 −5

Original line number	Diff line number	Diff line
		"""Helper operations and classes for general model building.

		"""
		from __future__ import print_function
		from __future__ import division
		@@ -209,7 +208,10 @@ class TensorflowGraph(object):
		if shuffle:
		log("About to shuffle dataset before epoch start.", self.verbosity)
		dataset.shuffle()
		for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(batch_size):
		####################################################################### DEBUG
		for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
		log("On batch %d" % ind, self.verbosity)
		####################################################################### DEBUG
		# Run training op.
		feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
		fetches = self.output + [
		@@ -370,9 +372,9 @@ class TensorflowGraph(object):
		# allow_soft_placement=True allows ops without a GPU implementation
		# to run on the CPU instead.
		config = tf.ConfigProto(allow_soft_placement=True)
		################################################################# DEBUG
		config.gpu_options.allow_growth = True
		################################################################# DEBUG
		################################################################## DEBUG
		#config.gpu_options.allow_growth = True
		################################################################## DEBUG
		self._shared_session = tf.Session(config=config)
		return self._shared_session

deepchem/models/tests/test_api.py

+2 −0

Original line number	Diff line number	Diff line
		@@ -193,6 +193,7 @@ class TestModelAPI(TestAPI):
		smiles_field=self.smiles_field,
		featurizer=featurizer,
		verbosity="low")

		dataset = loader.featurize(input_file, self.data_dir)

		splitter = ScaffoldSplitter()
		@@ -218,6 +219,7 @@ class TestModelAPI(TestAPI):
		mode="regression",
		model_instance=RandomForestRegressor())


		# Fit trained model
		model.fit(train_dataset)
		model.save()

Admin message