Preliminary commit for arbitrary object featurizers (08fdf94c) · Commits · 钟慕尧 / deepchem

deepchem/datasets/__init__.py

+152 −76

Original line number	Diff line number	Diff line
		@@ -73,11 +73,19 @@ class Dataset(object):
		# TODO(rbharath): This is a hack. clean up.
		if not len(df):
		return None
		##################################################### DEBUG
		if hasattr(featurizer, "dtype"):
		dtype = featurizer.dtype
		compute_feature_statistics = False
		else:
		dtype = float
		compute_feature_statistics = True
		##################################################### DEBUG
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
		verbosity)
		dtype, verbosity)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
		@@ -88,7 +96,9 @@ class Dataset(object):
		assert X.shape[0] == y.shape[0]
		assert y.shape == w.shape
		assert len(ids) == X.shape[0]
		return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)
		return Dataset.write_data_to_disk(
		data_dir, basename, tasks, X, y, w, ids,
		compute_feature_statistics=compute_feature_statistics)

		@staticmethod
		def construct_metadata(metadata_entries):
		@@ -107,7 +117,11 @@ class Dataset(object):
		return metadata_df

		@staticmethod
		def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
		############################################################## DEBUG
		#def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
		def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
		compute_feature_statistics=True):
		############################################################## DEBUG
		out_X = "%s-X.joblib" % basename
		out_X_transformed = "%s-X-transformed.joblib" % basename
		out_X_sums = "%s-X_sums.joblib" % basename
		@@ -125,6 +139,7 @@ class Dataset(object):
		if X is not None:
		save_to_disk(X, os.path.join(data_dir, out_X))
		save_to_disk(X, os.path.join(data_dir, out_X_transformed))
		if compute_feature_statistics:
		X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
		save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
		save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
		@@ -449,7 +464,12 @@ class Dataset(object):
		"""Sets verbosity."""
		self.verbosity = new_verbosity

		def select(self, select_dir, indices):
		####################################################### DEBUG
		# TODO(rbharath): This ad-hoc change for general object types seems kludgey.
		# Is there a more principled approach to support general objects?
		#def select(self, select_dir, indices):
		def select(self, select_dir, indices, compute_feature_statistics=False):
		####################################################### DEBUG
		"""Creates a new dataset from a selection of indices from self."""
		if not os.path.exists(select_dir):
		os.makedirs(select_dir)
		@@ -477,9 +497,13 @@ class Dataset(object):
		w_sel = w[shard_indices]
		ids_sel = ids[shard_indices]
		basename = "dataset-%d" % shard_num
		############################################################ DEBUG
		metadata_rows.append(
		Dataset.write_data_to_disk(select_dir, basename, tasks,
		X_sel, y_sel, w_sel, ids_sel))
		Dataset.write_data_to_disk(
		select_dir, basename, tasks,
		X_sel, y_sel, w_sel, ids_sel,
		compute_feature_statistics=compute_feature_statistics))
		############################################################ DEBUG
		# Updating counts
		indices_count += num_shard_elts
		count += shard_len
		@@ -603,20 +627,39 @@ class Dataset(object):
		"""Return pandas series of label stds."""
		return self.metadata_df["y_stds"]

		def get_statistics(self):
		################################################## DEBUG
		#def get_statistics(self):
		def get_statistics(self, X_stats=True, y_stats=True):
		################################################## DEBUG
		"""Computes and returns statistics of this dataset"""
		if len(self) == 0:
		return None, None, None, None
		self.update_moments()
		################################################## DEBUG
		#self.update_moments()
		self.update_moments(X_stats, y_stats)
		################################################## DEBUG
		df = self.metadata_df
		X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
		return X_means, X_stds, y_means, y_stds

		def _compute_mean_and_std(self, df):
		################################################## DEBUG
		#X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
		if X_stats:
		X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
		return X_means, X_stds
		elif y_stats:
		y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
		return y_means, y_stds
		################################################## DEBUG
		#return X_means, X_stds, y_means, y_stds

		################################################## DEBUG
		#def _compute_mean_and_std(self, df):
		def _compute_mean_and_std(self, df, X_stats, y_stats):
		################################################## DEBUG
		"""
		Compute means/stds of X/y from sums/sum_squares of tensors.
		"""

		################################################## DEBUG
		if X_stats:
		X_sums = []
		X_sum_squares = []
		X_n = []
		@@ -637,7 +680,11 @@ class Dataset(object):
		overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

		X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
		return overall_X_means, np.sqrt(X_vars)
		################################################## DEBUG

		################################################## DEBUG
		if y_stats:
		y_sums = []
		y_sum_squares = []
		y_n = []
		@@ -655,17 +702,29 @@ class Dataset(object):
		y_sum_squares = np.vstack(y_sum_squares)
		y_means = np.sum(y_sums, axis=0)/y_n
		y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
		return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
		return y_means, np.sqrt(y_vars)
		################################################## DEBUG

		def update_moments(self):
		########################################################## DEBUG
		#def update_moments(self):
		def update_moments(self, X_stats, y_stats):
		########################################################## DEBUG
		"""Re-compute statistics of this dataset during transformation"""
		df = self.metadata_df
		self._update_mean_and_std(df)
		########################################################## DEBUG
		#self._update_mean_and_std(df)
		self._update_mean_and_std(df, X_stats, y_stats)
		########################################################## DEBUG

		def _update_mean_and_std(self, df):
		########################################################## DEBUG
		#def _update_mean_and_std(self, df):
		def _update_mean_and_std(self, df, X_stats, y_stats):
		########################################################## DEBUG
		"""
		Compute means/stds of X/y from sums/sum_squares of tensors.
		"""
		########################################################## DEBUG
		if X_stats:
		X_transform = []
		for _, row in df.iterrows():
		Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
		@@ -673,7 +732,10 @@ class Dataset(object):
		Xss = np.sum(np.square(Xt),axis=0)
		save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
		save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))
		########################################################## DEBUG

		########################################################## DEBUG
		if y_stats:
		y_transform = []
		for _, row in df.iterrows():
		yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
		@@ -681,6 +743,7 @@ class Dataset(object):
		yss = np.sum(np.square(yt),axis=0)
		save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
		save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
		########################################################## DEBUG

		def get_grad_statistics(self):
		"""Computes and returns statistics of this dataset
		@@ -749,7 +812,8 @@ def compute_sums_and_nb_sample(tensor, W=None):

		# The following are all associated with Dataset, but are separate functions to
		# make it easy to use multiprocessing.
		def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
		def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
		verbosity=None):
		"""Transforms a dataframe containing deepchem input into numpy arrays"""
		if feature_type not in df.keys():
		raise ValueError(
		@@ -808,4 +872,16 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
		w = w[valid_inds]
		# Adding this assertion in to avoid ill-formed outputs.
		assert len(sorted_ids) == len(x) == len(y) == len(w)
		############################################################## DEBUG
		#return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
		print("x[0]")
		print(x[0])
		print("type(x)")
		print(type(x))
		if dtype == float:
		return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
		elif dtype == object:
		return sorted_ids, x, y.astype(float), w.astype(float)
		else:
		raise ValueError("Unrecognized dtype for featurizer.")
		############################################################## DEBUG

deepchem/featurizers/atomic_coordinates.py

+7 −2

Original line number	Diff line number	Diff line
		@@ -173,6 +173,9 @@ class NeighborListAtomicCoordinates(Featurizer):
		if neighbor_cutoff <= 0:
		raise ValueError("neighbor_cutoff must be positive value.")
		self.neighbor_cutoff = neighbor_cutoff
		# Type of data created by this featurizer
		self.dtype = object
		self.coordinates_featurizer = AtomicCoordinates()

		def _featurize(self, mol):
		"""
		@@ -182,6 +185,8 @@ class NeighborListAtomicCoordinates(Featurizer):
		----------
		"""
		N = mol.GetNumAtoms()
		# TODO(rbharath): Should this return a list?
		bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
		coords = get_coords(mol)

		x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
		@@ -217,6 +222,6 @@ class NeighborListAtomicCoordinates(Featurizer):
		if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
		neighbor_list[atom].add(neighbor_atom)

		neighbor_list[atom] = list(neighbor_list[atom])
		neighbor_list[atom] = sorted(list(neighbor_list[atom]))

		return neighbor_list
		return (bohr_coords, neighbor_list)

deepchem/featurizers/tests/test_atomic_coordinates.py

+0 −1

Original line number	Diff line number	Diff line
		@@ -194,4 +194,3 @@ class TestAtomicCoordinates(unittest.TestCase):
		nblist = nblist_featurizer._featurize(self.mol)
		for atom in range(N):
		assert len(nblist[atom]) == N-1

deepchem/models/tensorflow_models/__init__.py

+7 −4

Original line number	Diff line number	Diff line
		@@ -209,7 +209,10 @@ class TensorflowGraph(object):
		if shuffle:
		log("About to shuffle dataset before epoch start.", self.verbosity)
		dataset.shuffle()
		for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(batch_size):
		####################################################################### DEBUG
		for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
		print("On batch %d" % ind)
		####################################################################### DEBUG
		# Run training op.
		feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
		fetches = self.output + [
		@@ -370,9 +373,9 @@ class TensorflowGraph(object):
		# allow_soft_placement=True allows ops without a GPU implementation
		# to run on the CPU instead.
		config = tf.ConfigProto(allow_soft_placement=True)
		################################################################# DEBUG
		config.gpu_options.allow_growth = True
		################################################################# DEBUG
		################################################################## DEBUG
		#config.gpu_options.allow_growth = True
		################################################################## DEBUG
		self._shared_session = tf.Session(config=config)
		return self._shared_session

deepchem/transformers/__init__.py

+13 −7

Original line number	Diff line number	Diff line
		@@ -105,13 +105,19 @@ class NormalizationTransformer(Transformer):
		super(NormalizationTransformer, self).__init__(
		transform_X=transform_X, transform_y=transform_y,
		transform_w=transform_w, dataset=dataset)
		X_means, X_stds, y_means, y_stds = dataset.get_statistics()
		#################################################################### DEBUG
		#X_means, X_stds, y_means, y_stds = dataset.get_statistics()
		if transform_X:
		X_means, X_stds = dataset.get_statistics(X_stats=True, y_stats=False)
		self.X_means = X_means
		self.X_stds = X_stds
		elif transform_y:
		y_means, y_stds = dataset.get_statistics(X_stats=False, y_stats=True)
		self.y_means = y_means
		# Control for pathological case with no variance.
		y_stds[y_stds == 0] = 1.
		self.y_stds = y_stds
		#################################################################### DEBUG
		self.transform_gradients = transform_gradients
		if self.transform_gradients:
		true_grad, ydely_means = dataset.get_grad_statistics()

Admin message