Commit 1e57a4f9 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Some test fixes and cleanup

parent 08fdf94c
Loading
Loading
Loading
Loading
+16 −52
Original line number Diff line number Diff line
@@ -73,14 +73,12 @@ class Dataset(object):
      # TODO(rbharath): This is a hack. clean up.
      if not len(df):
        return None
      ##################################################### DEBUG
      if hasattr(featurizer, "dtype"):
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True
      ##################################################### DEBUG
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
@@ -117,11 +115,8 @@ class Dataset(object):
    return metadata_df

  @staticmethod
  ############################################################## DEBUG
  #def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
                         compute_feature_statistics=True):
  ############################################################## DEBUG
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
@@ -140,6 +135,10 @@ class Dataset(object):
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        ########################################################## DEBUG
        print("compute_feature_statistics")
        print(compute_feature_statistics)
        ########################################################## DEBUG
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -464,12 +463,9 @@ class Dataset(object):
    """Sets verbosity."""
    self.verbosity = new_verbosity

  ####################################################### DEBUG
  # TODO(rbharath): This ad-hoc change for general object types seems kludgey.
  # Is there a more principled approach to support general objects?
  #def select(self, select_dir, indices):
  # TODO(rbharath): This change for general object types seems a little
  # kludgey.  Is there a more principled approach to support general objects?
  def select(self, select_dir, indices, compute_feature_statistics=False):
  ####################################################### DEBUG
    """Creates a new dataset from a selection of indices from self."""
    if not os.path.exists(select_dir):
      os.makedirs(select_dir)
@@ -497,13 +493,11 @@ class Dataset(object):
      w_sel = w[shard_indices]
      ids_sel = ids[shard_indices]
      basename = "dataset-%d" % shard_num
      ############################################################ DEBUG
      metadata_rows.append(
          Dataset.write_data_to_disk(
              select_dir, basename, tasks,
              X_sel, y_sel, w_sel, ids_sel,
              compute_feature_statistics=compute_feature_statistics))
      ############################################################ DEBUG
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
@@ -627,38 +621,32 @@ class Dataset(object):
    """Return pandas series of label stds."""
    return self.metadata_df["y_stds"]

  def get_statistics(self, X_stats=True, y_stats=True):
    """Compute and return mean/std statistics of this dataset.

    Updates the on-disk moment shards first via update_moments, then
    aggregates them from the metadata dataframe.

    Parameters
    ----------
    X_stats: bool
      If True, compute and return feature (X) means and stds.
    y_stats: bool
      If True, compute and return label (y) means and stds.

    Returns
    -------
    (X_means, X_stds) if only X_stats; (y_means, y_stds) if only y_stats;
    (X_means, X_stds, y_means, y_stds) if both; None if neither.
    Returns (None, None, None, None) for an empty dataset.
    """
    # Empty dataset: nothing to aggregate.
    if len(self) == 0:
      return None, None, None, None
    self.update_moments(X_stats, y_stats)
    df = self.metadata_df
    if X_stats and not y_stats:
      X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return X_means, X_stds
    elif y_stats and not X_stats:
      y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return y_means, y_stds
    elif X_stats and y_stats:
      # _compute_mean_and_std returns a single (means, stds) pair per call,
      # so query X and y moments separately.
      X_means, X_stds = self._compute_mean_and_std(
          df, X_stats=True, y_stats=False)
      y_means, y_stds = self._compute_mean_and_std(
          df, X_stats=False, y_stats=True)
      return X_means, X_stds, y_means, y_stds
    else:
      return None

  ################################################## DEBUG
  #def _compute_mean_and_std(self, df):
  def _compute_mean_and_std(self, df, X_stats, y_stats):
  ################################################## DEBUG
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """

  ################################################## DEBUG
    if X_stats:
      X_sums = []
      X_sum_squares = []
@@ -681,9 +669,7 @@ class Dataset(object):

      X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
      return overall_X_means, np.sqrt(X_vars)
  ################################################## DEBUG

  ################################################## DEBUG
    if y_stats:
      y_sums = []
      y_sum_squares = []
@@ -703,27 +689,16 @@ class Dataset(object):
      y_means = np.sum(y_sums, axis=0)/y_n
      y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
      return y_means, np.sqrt(y_vars)
  ################################################## DEBUG
  
  def update_moments(self, X_stats, y_stats):
    """Re-compute sum/sum-of-squares shards for this dataset.

    Delegates to _update_mean_and_std, which rewrites the per-shard
    X/y sums and sum-of-squares files on disk (used after a transform
    invalidates the cached statistics).

    Parameters
    ----------
    X_stats: bool
      If True, recompute feature (X) moment shards.
    y_stats: bool
      If True, recompute label (y) moment shards.
    """
    df = self.metadata_df
    self._update_mean_and_std(df, X_stats, y_stats)

  ########################################################## DEBUG
  #def _update_mean_and_std(self, df):
  def _update_mean_and_std(self, df, X_stats, y_stats):
  ########################################################## DEBUG
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """
    ########################################################## DEBUG
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
@@ -732,9 +707,7 @@ class Dataset(object):
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))
    ########################################################## DEBUG

    ########################################################## DEBUG
    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
@@ -743,7 +716,6 @@ class Dataset(object):
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
    ########################################################## DEBUG

  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset
@@ -780,7 +752,6 @@ class Dataset(object):

    return grad, ydely_means


def compute_sums_and_nb_sample(tensor, W=None):
  """
  Computes sums, squared sums of tensor along axis 0.
@@ -872,16 +843,9 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
  w = w[valid_inds]
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  ############################################################## DEBUG
  #return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  print("x[0]")
  print(x[0])
  print("type(x)")
  print(type(x))
  if dtype == float:
    return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  elif dtype == object:
    return sorted_ids, x, y.astype(float), w.astype(float)
  else:
    raise ValueError("Unrecognized dtype for featurizer.")
  ############################################################## DEBUG
+3 −3
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ class TestAtomicCoordinates(unittest.TestCase):
    x_bins, y_bins, z_bins = get_cells(coords, nblist_featurizer.neighbor_cutoff)

    nblist_featurizer = NeighborListAtomicCoordinates()
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    assert isinstance(nblist, dict)
    assert len(nblist.keys()) == N
    for (atom, neighbors) in nblist.items():
@@ -185,12 +185,12 @@ class TestAtomicCoordinates(unittest.TestCase):

    # Test with cutoff 0 angstroms. There should be no neighbors in this case.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=.1)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == 0

    # Test with cutoff 100 angstroms. Everything should be neighbors now.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=100)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == N-1
+1 −2
Original line number Diff line number Diff line
"""Helper operations and classes for general model building.

"""
from __future__ import print_function
from __future__ import division
@@ -211,7 +210,7 @@ class TensorflowGraph(object):
            dataset.shuffle()
          ####################################################################### DEBUG
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
            print("On batch %d" % ind)
            log("On batch %d" % ind, self.verbosity)
          ####################################################################### DEBUG
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
+2 −0
Original line number Diff line number Diff line
@@ -193,6 +193,7 @@ class TestModelAPI(TestAPI):
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
@@ -218,6 +219,7 @@ class TestModelAPI(TestAPI):
                         mode="regression",
                         model_instance=RandomForestRegressor())
  

    # Fit trained model
    model.fit(train_dataset)
    model.save()
+11 −4
Original line number Diff line number Diff line
@@ -37,7 +37,8 @@ class Splitter(object):
  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000):
                             log_every_n=1000,
                             compute_feature_statistics=True):
    """
    Splits self into train/validation/test sets.

@@ -48,12 +49,18 @@ class Splitter(object):
      dataset,
      frac_train=frac_train, frac_test=frac_test,
      frac_valid=frac_valid, log_every_n=log_every_n)
    train_dataset = dataset.select(train_dir, train_inds)
    train_dataset = dataset.select( 
        train_dir, train_inds,
        compute_feature_statistics=compute_feature_statistics)
    if valid_dir is not None:
      valid_dataset = dataset.select(valid_dir, valid_inds)
      valid_dataset = dataset.select(
          valid_dir, valid_inds,
          compute_feature_statistics=compute_feature_statistics)
    else:
      valid_dataset = None
    test_dataset = dataset.select(test_dir, test_inds)
    test_dataset = dataset.select(
        test_dir, test_inds,
        compute_feature_statistics=compute_feature_statistics)

    return train_dataset, valid_dataset, test_dataset

Loading