Commit 0f0e82ba authored by peastman's avatar peastman
Browse files

Added functions for converting between Datasets and pandas DataFrames

parent a6d990ca
Loading
Loading
Loading
Loading
+103 −0
Original line number Original line Diff line number Diff line
@@ -305,6 +305,109 @@ class Dataset(object):


    return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)
    return tf.data.Dataset.from_generator(gen_data, dtypes, shapes)


  def to_dataframe(self):
    """Construct a pandas DataFrame containing the data from this Dataset.

    Returns
    -------
    a DataFrame with one column per data column: 'X', 'y', 'w' when the
    corresponding array has a single column, or 'X1', 'X2', ... when it has
    several, plus an 'ids' column.  This naming matches what from_dataframe()
    looks for by default.
    """

    def to_df(data, name):
      # A 1D array, or a 2D array with exactly one column, maps to a single
      # column named `name`; otherwise columns are numbered name1, name2, ...
      if len(data.shape) == 1 or data.shape[1] == 1:
        columns = [name]
      else:
        columns = [f'{name}{i+1}' for i in range(data.shape[1])]
      return pd.DataFrame(data, columns=columns)

    X_df = to_df(self.X, 'X')
    y_df = to_df(self.y, 'y')
    w_df = to_df(self.w, 'w')
    ids_df = pd.DataFrame(self.ids, columns=['ids'])
    return pd.concat([X_df, y_df, w_df, ids_df], axis=1, sort=False)

  @staticmethod
  def from_dataframe(df, X=None, y=None, w=None, ids=None):
    """Construct a Dataset from the contents of a pandas DataFrame.

    Parameters
    ----------
    df: DataFrame
      the pandas DataFrame
    X: string or list of strings
      the name of the column or columns containing the X array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    y: string or list of strings
      the name of the column or columns containing the y array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    w: string or list of strings
      the name of the column or columns containing the w array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
    ids: string
      the name of the column containing the ids.  If this is None, it will look
      for default column names that match those produced by to_dataframe().
    """

    def select(columns, name):
      # Pick out one field's values, preferring in order: the explicitly
      # requested column(s), the single default column (e.g. 'X'), or the
      # numbered default columns ('X1', 'X2', ...) written by to_dataframe().
      if columns is not None:
        vals = df[columns]
      elif name in df.columns:
        vals = df[name]
      else:
        defaults = []
        i = 1
        while f'{name}{i}' in df.columns:
          defaults.append(f'{name}{i}')
          i += 1
        vals = df[defaults]
      # Ensure a 2D (samples, columns) array even when one column was selected.
      if len(vals.shape) == 1:
        vals = np.expand_dims(vals, 1)
      return vals

    X_val = select(X, 'X')
    y_val = select(y, 'y')
    w_val = select(w, 'w')

    # Find the ids.

    if ids is not None:
      ids_val = df[ids]
    elif 'ids' in df.columns:
      ids_val = df['ids']
    else:
      ids_val = None
    return NumpyDataset(X_val, y_val, w_val, ids_val)



class NumpyDataset(Dataset):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""
  """A Dataset defined by in-memory numpy arrays."""
+23 −0
Original line number Original line Diff line number Diff line
@@ -14,6 +14,7 @@ import shutil
import numpy as np
import numpy as np
import deepchem as dc
import deepchem as dc
import tensorflow as tf
import tensorflow as tf
import pandas as pd
from tensorflow.python.framework import test_util
from tensorflow.python.framework import test_util




@@ -696,6 +697,28 @@ class TestDatasets(test_util.TensorFlowTestCase):
      np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
      np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
    assert i == 19
    assert i == 19


  def test_dataframe(self):
    """Test converting between Datasets and DataFrames."""
    dataset = dc.data.tests.load_solubility_data()

    # Round-tripping through a DataFrame should reproduce every array exactly.

    frame = dataset.to_dataframe()
    restored = dc.data.Dataset.from_dataframe(frame)
    for original, recovered in [(dataset.X, restored.X), (dataset.y,
                                                          restored.y),
                                (dataset.w, restored.w), (dataset.ids,
                                                          restored.ids)]:
      np.testing.assert_array_equal(original, recovered)

    # Explicit column selections should pull out exactly the named columns.

    remapped = dc.data.Dataset.from_dataframe(
        frame, X=['X2', 'X4'], y='w', w=['y', 'X1'])
    np.testing.assert_array_equal(dataset.X[:, (1, 3)], remapped.X)
    np.testing.assert_array_equal(dataset.w, remapped.y)
    expected_w = np.stack([dataset.y[:, 0], dataset.X[:, 0]], axis=1)
    np.testing.assert_array_equal(expected_w, remapped.w)



if __name__ == "__main__":
if __name__ == "__main__":
  unittest.main()
  unittest.main()