Commit 0f0e82ba authored by peastman's avatar peastman
Browse files

Added functions for converting between Datasets and pandas DataFrames

parent a6d990ca
Loading
Loading
Loading
Loading
+103 −0
Original line number Original line Diff line number Diff line
@@ -305,6 +305,109 @@ class Dataset(object):


    return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)
    return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)


  def to_dataframe(self):
    """Construct a pandas DataFrame containing the data from this Dataset.

    Returns
    -------
    a DataFrame with one column per data column: 'X', 'y', 'w' when the
    corresponding array has a single column, or 'X1', 'X2', ... when it has
    several, plus an 'ids' column.  This naming matches what from_dataframe()
    looks for by default.
    """

    def to_df(data, name):
      # A 1D array, or a 2D array with exactly one column, maps to a single
      # column named `name`; otherwise columns are numbered name1, name2, ...
      if len(data.shape) == 1 or data.shape[1] == 1:
        columns = [name]
      else:
        columns = [f'{name}{i+1}' for i in range(data.shape[1])]
      return pd.DataFrame(data, columns=columns)

    X_df = to_df(self.X, 'X')
    y_df = to_df(self.y, 'y')
    w_df = to_df(self.w, 'w')
    ids_df = pd.DataFrame(self.ids, columns=['ids'])
    return pd.concat([X_df, y_df, w_df, ids_df], axis=1, sort=False)

  @staticmethod
  def from_dataframe(df, X=None, y=None, w=None, ids=None):
    """Construct a Dataset from the contents of a pandas DataFrame.

    Parameters
    ----------
    df: DataFrame
      the pandas DataFrame
    X: string or list of strings
      the name of the column or columns containing the X array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    y: string or list of strings
      the name of the column or columns containing the y array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    w: string or list of strings
      the name of the column or columns containing the w array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    ids: string
      the name of the column containing the ids.  If this is None, it will look
      for default column names that match those produced by to_dataframe().
    """

    def select(columns, name):
      # Pick out one field's values, preferring in order: the explicitly
      # requested column(s), the single default column (e.g. 'X'), or the
      # numbered default columns ('X1', 'X2', ...) written by to_dataframe().
      if columns is not None:
        vals = df[columns]
      elif name in df.columns:
        vals = df[name]
      else:
        defaults = []
        i = 1
        while f'{name}{i}' in df.columns:
          defaults.append(f'{name}{i}')
          i += 1
        vals = df[defaults]
      # Ensure a 2D (samples, columns) array even when one column was selected.
      if len(vals.shape) == 1:
        vals = np.expand_dims(vals, 1)
      return vals

    X_val = select(X, 'X')
    y_val = select(y, 'y')
    w_val = select(w, 'w')

    # Find the ids.

    if ids is not None:
      ids_val = df[ids]
    elif 'ids' in df.columns:
      ids_val = df['ids']
    else:
      ids_val = None
    return NumpyDataset(X_val, y_val, w_val, ids_val)



class NumpyDataset(Dataset):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""
  """A Dataset defined by in-memory numpy arrays."""
+23 −0
Original line number Original line Diff line number Diff line
@@ -14,6 +14,7 @@ import shutil
import numpy as np
import numpy as np
import deepchem as dc
import deepchem as dc
import tensorflow as tf
import tensorflow as tf
import pandas as pd
from tensorflow.python.framework import test_util
from tensorflow.python.framework import test_util




@@ -696,6 +697,28 @@ class TestDatasets(test_util.TensorFlowTestCase):
      np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
      np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
    assert i == 19
    assert i == 19


  def test_dataframe(self):
    """Test converting between Datasets and DataFrames."""
    dataset = dc.data.tests.load_solubility_data()

    # Round-tripping through a DataFrame should reproduce every array exactly.

    frame = dataset.to_dataframe()
    restored = dc.data.Dataset.from_dataframe(frame)
    for original, recovered in [(dataset.X, restored.X), (dataset.y,
                                                          restored.y),
                                (dataset.w, restored.w), (dataset.ids,
                                                          restored.ids)]:
      np.testing.assert_array_equal(original, recovered)

    # Explicit column selections should pull out exactly the named columns.

    remapped = dc.data.Dataset.from_dataframe(
        frame, X=['X2', 'X4'], y='w', w=['y', 'X1'])
    np.testing.assert_array_equal(dataset.X[:, (1, 3)], remapped.X)
    np.testing.assert_array_equal(dataset.w, remapped.y)
    expected_w = np.stack([dataset.y[:, 0], dataset.X[:, 0]], axis=1)
    np.testing.assert_array_equal(expected_w, remapped.w)



if __name__ == "__main__":
if __name__ == "__main__":
  unittest.main()
  unittest.main()