Merge pull request #1797 from peastman/dataframe (df35696f) · Commits · 钟慕尧 / deepchem

deepchem/data/datasets.py

+103 −0

Original line number	Diff line number	Diff line
		@@ -305,6 +305,109 @@ class Dataset(object):

		return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)

		def to_dataframe(self):
		    """Build a pandas DataFrame holding this Dataset's X, y, w and ids.

		    Each of X, y and w contributes either a single column named after the
		    array ('X', 'y', 'w') when it is 1-D or has exactly one column, or
		    numbered columns ('X1', 'X2', ...) otherwise. The ids go in a column
		    named 'ids'. The pieces are concatenated side by side in the order
		    X, y, w, ids.
		    """

		    def _as_frame(values, prefix):
		        # A lone column keeps the bare prefix; several columns are
		        # numbered starting from 1.
		        is_single = len(values.shape) == 1 or values.shape[1] == 1
		        if is_single:
		            names = [prefix]
		        else:
		            names = [f'{prefix}{k}' for k in range(1, values.shape[1] + 1)]
		        return pd.DataFrame(values, columns=names)

		    # Read every array up front, in the same order as before.
		    features, labels, weights, identifiers = self.X, self.y, self.w, self.ids
		    frames = [
		        _as_frame(features, 'X'),
		        _as_frame(labels, 'y'),
		        _as_frame(weights, 'w'),
		        pd.DataFrame(identifiers, columns=['ids']),
		    ]
		    return pd.concat(frames, axis=1, sort=False)

		@staticmethod
		def from_dataframe(df, X=None, y=None, w=None, ids=None):
		    """Create a Dataset out of the columns of a pandas DataFrame.

		    Parameters
		    ----------
		    df: DataFrame
		      the pandas DataFrame to read from
		    X: string or list of strings
		      name(s) of the column(s) holding the X array. When None, the default
		      names written by to_dataframe() are searched for: 'X', or else
		      'X1', 'X2', ... for as long as consecutive names exist.
		    y: string or list of strings
		      name(s) of the column(s) holding the y array. When None, the default
		      names written by to_dataframe() are searched for ('y', or 'y1', 'y2', ...).
		    w: string or list of strings
		      name(s) of the column(s) holding the w array. When None, the default
		      names written by to_dataframe() are searched for ('w', or 'w1', 'w2', ...).
		    ids: string
		      name of the column holding the ids. When None, a column named 'ids'
		      is used if present; otherwise no ids are passed on.

		    Returns
		    -------
		    A NumpyDataset built from the selected X, y, w and ids values.
		    """

		    def _pick(explicit, prefix):
		        # Resolution order: caller-supplied name(s), then the bare prefix,
		        # then consecutively numbered columns starting at 1.
		        if explicit is not None:
		            chosen = df[explicit]
		        elif prefix in df.columns:
		            chosen = df[prefix]
		        else:
		            numbered = []
		            n = 1
		            while True:
		                name = f'{prefix}{n}'
		                if name not in df.columns:
		                    break
		                numbered.append(name)
		                n += 1
		            # May be an empty selection if no numbered column exists.
		            chosen = df[numbered]
		        # A single column comes back 1-D; promote it to a column vector.
		        if len(chosen.shape) == 1:
		            chosen = np.expand_dims(chosen, 1)
		        return chosen

		    X_val = _pick(X, 'X')
		    y_val = _pick(y, 'y')
		    w_val = _pick(w, 'w')

		    # The ids are used as-is (no reshaping) and are optional.
		    ids_val = None
		    if ids is not None:
		        ids_val = df[ids]
		    elif 'ids' in df.columns:
		        ids_val = df['ids']
		    return NumpyDataset(X_val, y_val, w_val, ids_val)


		class NumpyDataset(Dataset):
		"""A Dataset defined by in-memory numpy arrays."""

deepchem/data/tests/test_datasets.py

+23 −0

Original line number	Diff line number	Diff line
		@@ -14,6 +14,7 @@ import shutil
		import numpy as np
		import deepchem as dc
		import tensorflow as tf
		import pandas as pd
		from tensorflow.python.framework import test_util


		@@ -696,6 +697,28 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
		assert i == 19

		def test_dataframe(self):
		    """Check conversion between Datasets and DataFrames in both directions."""
		    original = dc.data.tests.load_solubility_data()

		    # Dataset -> DataFrame -> Dataset must reproduce every array exactly.
		    frame = original.to_dataframe()
		    restored = dc.data.Dataset.from_dataframe(frame)
		    for attr in ('X', 'y', 'w', 'ids'):
		        np.testing.assert_array_equal(
		            getattr(original, attr), getattr(restored, attr))

		    # Explicit column names override the defaults, including cross-wiring
		    # columns between the X, y and w roles.
		    remapped = dc.data.Dataset.from_dataframe(
		        frame, X=['X2', 'X4'], y='w', w=['y', 'X1'])
		    np.testing.assert_array_equal(original.X[:, (1, 3)], remapped.X)
		    np.testing.assert_array_equal(original.w, remapped.y)
		    expected_w = np.stack([original.y[:, 0], original.X[:, 0]], axis=1)
		    np.testing.assert_array_equal(expected_w, remapped.w)


		# Run this module's tests through unittest when the file is executed as a script.
		if __name__ == "__main__":
		unittest.main()

Admin message