Unverified commit df35696f, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #1797 from peastman/dataframe

Added functions for converting between Datasets and pandas DataFrames
parents a6d990ca 0824b36b
Loading
Loading
Loading
Loading
+103 −0
Original line number Diff line number Diff line
@@ -305,6 +305,109 @@ class Dataset(object):

    return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)

  def to_dataframe(self):
    """Return the contents of this Dataset as a single pandas DataFrame.

    The X, y and w arrays each become one or more columns.  An array that is
    one-dimensional, or whose second dimension has length 1, yields a single
    column named after the array (`X`, `y` or `w`).  Otherwise it yields one
    column per index along the second dimension, numbered from 1 (`X1`,
    `X2`, ...).  The ids go into one final column named `ids`.
    """

    def _as_frame(values, stem):
      # A lone column keeps the bare name; several columns get 1-based suffixes.
      single = len(values.shape) == 1 or values.shape[1] == 1
      if single:
        names = [stem]
      else:
        names = [f'{stem}{k+1}' for k in range(values.shape[1])]
      return pd.DataFrame(values, columns=names)

    # Read every array up front, in the same order as the output columns.
    features, labels, weights, identifiers = self.X, self.y, self.w, self.ids
    pieces = [
        _as_frame(features, 'X'),
        _as_frame(labels, 'y'),
        _as_frame(weights, 'w'),
        pd.DataFrame(identifiers, columns=['ids']),
    ]
    return pd.concat(pieces, axis=1, sort=False)

  @staticmethod
  def from_dataframe(df, X=None, y=None, w=None, ids=None):
    """Build a Dataset out of columns taken from a pandas DataFrame.

    X, y and w are each resolved the same way.  An explicitly named column (or
    list of columns) is used if given.  Otherwise a column named with the bare
    letter (`X`, `y`, `w`) is used if present.  Otherwise the numbered columns
    (`X1`, `X2`, ...) are collected for as long as consecutive names exist.
    The default names are the ones written by to_dataframe().

    Parameters
    ----------
    df: DataFrame
      the pandas DataFrame to read from
    X: string or list of strings
      the name of the column or columns holding the X array, or None to fall
      back to the default column names.
    y: string or list of strings
      the name of the column or columns holding the y array, or None to fall
      back to the default column names.
    w: string or list of strings
      the name of the column or columns holding the w array, or None to fall
      back to the default column names.
    ids: string
      the name of the column holding the ids, or None to fall back to a column
      named `ids`.  If that column is missing as well, None is passed on as
      the ids.

    Returns
    -------
    a NumpyDataset constructed from the selected X, y, w and ids values
    """

    def _lookup(requested, stem):
      # An explicit request always takes priority over the default names.
      if requested is not None:
        found = df[requested]
      elif stem in df.columns:
        found = df[stem]
      else:
        # Gather stem1, stem2, ... and stop at the first missing name.
        numbered = []
        n = 1
        while f'{stem}{n}' in df.columns:
          numbered.append(f'{stem}{n}')
          n += 1
        found = df[numbered]
      # A single column comes back one-dimensional; give it a second axis.
      if len(found.shape) == 1:
        found = np.expand_dims(found, 1)
      return found

    X_val = _lookup(X, 'X')
    y_val = _lookup(y, 'y')
    w_val = _lookup(w, 'w')

    # The ids stay one-dimensional and are optional.
    if ids is not None:
      ids_val = df[ids]
    else:
      ids_val = df['ids'] if 'ids' in df.columns else None
    return NumpyDataset(X_val, y_val, w_val, ids_val)


class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""
+23 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ import shutil
import numpy as np
import deepchem as dc
import tensorflow as tf
import pandas as pd
from tensorflow.python.framework import test_util


@@ -696,6 +697,28 @@ class TestDatasets(test_util.TensorFlowTestCase):
      np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
    assert i == 19

  def test_dataframe(self):
    """Check Dataset/DataFrame conversion with default and explicit columns."""
    original = dc.data.tests.load_solubility_data()
    frame = original.to_dataframe()

    # With the default column names, the round trip must return every array unchanged.

    restored = dc.data.Dataset.from_dataframe(frame)
    for field in ('X', 'y', 'w', 'ids'):
      np.testing.assert_array_equal(
          getattr(original, field), getattr(restored, field))

    # Explicit column names may route any column into any role.

    remapped = dc.data.Dataset.from_dataframe(
        frame, X=['X2', 'X4'], y='w', w=['y', 'X1'])
    np.testing.assert_array_equal(original.X[:, (1, 3)], remapped.X)
    np.testing.assert_array_equal(original.w, remapped.y)
    expected_w = np.stack([original.y[:, 0], original.X[:, 0]], axis=1)
    np.testing.assert_array_equal(expected_w, remapped.w)


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
  unittest.main()