Commit 220647e1 authored by Bharath Ramsundar, committed by GitHub

Merge pull request #245 from peastman/dataset

Refactoring of Dataset
parents ba11ae5f 896fcb73
+291 −223
@@ -107,9 +107,198 @@ def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
      start += increment
    return (X_out, y_out, w_out, ids_out)


class Dataset(object):
  """Abstract base class for datasets defined by X, y, w elements."""

  def __init__(self):
    raise NotImplementedError()

  def __len__(self):
    """
    Get the number of elements in the dataset.
    """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
    raise NotImplementedError()

  def get_shape(self):
    """Get the shape of the dataset.
    
    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    """
    raise NotImplementedError()

  def get_task_names(self):
    """Get the names of the tasks associated with this dataset."""
    raise NotImplementedError()

  @property
  def X(self):
    """Get the X vector for this dataset as a single numpy array."""
    raise NotImplementedError()

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    raise NotImplementedError()

  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""

    raise NotImplementedError()

  @property
  def w(self):
    """Get the weight vector for this dataset as a single numpy array."""
    raise NotImplementedError()

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False, pad_batches=False):
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    """
    raise NotImplementedError()

  def itersamples(self):
    """Get an object that iterates over the samples in the dataset.

    Example:
    >>> for x, y, w, id in dataset.itersamples():
    ...   print(x, y, w, id)
    """
    raise NotImplementedError()

  def get_statistics(self, X_stats=True, y_stats=True):
    """Compute and return statistics of this dataset."""
    X_means = 0.0
    X_m2 = 0.0
    y_means = 0.0
    y_m2 = 0.0
    n = 0
    for X, y, _, _ in self.itersamples():
        n += 1
        dx = X-X_means
        dy = y-y_means
        X_means += dx/n
        y_means += dy/n
        X_m2 += dx*(X-X_means)
        y_m2 += dy*(y-y_means)
    if n < 2:
        X_stds = 0.0
        y_stds = 0.0
    else:
        X_stds = np.sqrt(X_m2/n)
        y_stds = np.sqrt(y_m2/n)
    if X_stats and not y_stats:
      return X_means, X_stds
    elif y_stats and not X_stats:
      return y_means, y_stds
    elif X_stats and y_stats:
      return X_means, X_stds, y_means, y_stds
    else:
      return None
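
# The loop above is Welford's single-pass update: only the running mean and
# the accumulated squared deviations (m2) are kept, so statistics stream over
# itersamples() without loading the dataset. A minimal standalone sketch of
# the same recurrence (illustrative only, not part of this diff; the helper
# streaming_mean_std is hypothetical), checkable against numpy:
import numpy as np

def streaming_mean_std(samples):
  """Welford-style running mean and population std in a single pass."""
  mean, m2, n = 0.0, 0.0, 0
  for x in samples:
    n += 1
    d = x - mean
    mean += d / n          # update the running mean
    m2 += d * (x - mean)   # accumulate squared deviations
  std = np.sqrt(m2 / n) if n >= 2 else 0.0  # population std, like np.std
  return mean, std

data = np.random.RandomState(0).randn(1000)
mean, std = streaming_mean_std(data)
assert np.allclose([mean, std], [np.mean(data), np.std(data)])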


class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

  def __init__(self, X, y, w=None, ids=None, verbosity=None):
    n_samples = len(X)
    # Reshape y and w to 2-D arrays of shape (n_samples, n_tasks); the -1
    # lets numpy infer n_tasks
    if n_samples > 0:
      y = np.reshape(y, (n_samples, -1))
      if w is not None:
        w = np.reshape(w, (n_samples, -1))
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
    if w is None:
      w = np.ones_like(y)
    self._X = X
    self._y = y
    self._w = w
    self._ids = np.array(ids, dtype=object)
    self.verbosity = verbosity

  def __len__(self):
    """
    Get the number of elements in the dataset.
    """
    return len(self._y)

  def get_shape(self):
    """Get the shape of the dataset.
    
    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    """
    return self._X.shape, self._y.shape, self._w.shape, self._ids.shape

  def get_task_names(self):
    """Get the names of the tasks associated with this dataset."""
    return np.arange(self._y.shape[1])

  @property
  def X(self):
    """Get the X vector for this dataset as a single numpy array."""
    return self._X

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    return self._y

  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""
    return self._ids

  @property
  def w(self):
    """Get the weight vector for this dataset as a single numpy array."""
    return self._w

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False, pad_batches=False):
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    """
    def iterate(dataset, batch_size, deterministic, pad_batches):
      n_samples = dataset._X.shape[0]
      if not deterministic:
        sample_perm = np.random.permutation(n_samples)
      else:
        sample_perm = np.arange(n_samples)
      if batch_size is None:
        batch_size = n_samples
      interval_points = np.linspace(
          0, n_samples, int(np.ceil(float(n_samples)/batch_size))+1, dtype=int)
      for j in range(len(interval_points)-1):
        indices = range(interval_points[j], interval_points[j+1])
        perm_indices = sample_perm[indices]
        X_batch = dataset._X[perm_indices, :]
        y_batch = dataset._y[perm_indices]
        w_batch = dataset._w[perm_indices]
        ids_batch = dataset._ids[perm_indices]
        if pad_batches:
          (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
            batch_size, X_batch, y_batch, w_batch, ids_batch)
        yield (X_batch, y_batch, w_batch, ids_batch)
    return iterate(self, batch_size, deterministic, pad_batches)

  def itersamples(self):
    """Get an object that iterates over the samples in the dataset.

    Example:
    >>> for x, y, w, id in dataset.itersamples():
    ...   print(x, y, w, id)
    """
    n_samples = self._X.shape[0]
    return ((self._X[i], self._y[i], self._w[i], self._ids[i]) for i in range(n_samples))
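
# A short usage sketch of the new in-memory class (illustrative only, not
# part of this PR; assumes the NumpyDataset defined above and that pad_batch
# pads a short final batch up to batch_size, as its call site suggests):
import numpy as np

X = np.random.rand(10, 4)          # 10 samples, 4 features
y = np.random.rand(10, 2)          # 2 tasks; w defaults to ones, ids to arange
dataset = NumpyDataset(X, y)
print(dataset.get_shape())         # ((10, 4), (10, 2), (10, 2), (10,))
for x_i, y_i, w_i, id_i in dataset.itersamples():
  pass                             # one (x, y, w, id) tuple per sample
for X_b, y_b, w_b, ids_b in dataset.iterbatches(batch_size=4, pad_batches=True):
  assert X_b.shape[0] == 4         # short final batch is padded up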


class DiskDataset(Dataset):
  """
  A Dataset that is stored as a set of files on disk.
  """
  def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None, 
               raw_data=None, verbosity=None, reload=False,
@@ -125,23 +314,23 @@ class Dataset(object):

    if not reload or not os.path.exists(self._get_metadata_filename()):
      if metadata_rows is not None:
-        self.metadata_df = Dataset.construct_metadata(metadata_rows)
+        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      elif raw_data is not None:
        metadata_rows = []
        ids, X, y, w = raw_data
        metadata_rows.append(
-            Dataset.write_data_to_disk(
+            DiskDataset.write_data_to_disk(
                self.data_dir, "data", tasks, X, y, w, ids,
                compute_feature_statistics=compute_feature_statistics))
-        self.metadata_df = Dataset.construct_metadata(metadata_rows)
+        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      else:
        # Create an empty metadata dataframe to be filled at a later time
        basename = "metadata"
-        metadata_rows = [Dataset.write_data_to_disk(
+        metadata_rows = [DiskDataset.write_data_to_disk(
            self.data_dir, basename, tasks)]
-        self.metadata_df = Dataset.construct_metadata(metadata_rows)
+        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()

    else:
@@ -184,7 +373,7 @@ class Dataset(object):
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
-    return Dataset.write_data_to_disk(
+    return DiskDataset.write_data_to_disk(
        data_dir, basename, tasks, X, y, w, ids,
        compute_feature_statistics=compute_feature_statistics)

@@ -297,33 +486,39 @@ class Dataset(object):

  def itershards(self):
    """
-    Iterates over all shards in dataset.
+    Return an object that iterates over all shards in dataset.

    Datasets are stored in sharded fashion on disk. Each call to next() for the
    generator defined by this function returns the data from a particular shard.
    The order of shards returned is guaranteed to remain fixed.
    """
-    for _, row in self.metadata_df.iterrows():
+    def iterate(dataset):
+      for _, row in dataset.metadata_df.iterrows():
        X = np.array(load_from_disk(
-          os.path.join(self.data_dir, row['X-transformed'])))
+            os.path.join(dataset.data_dir, row['X-transformed'])))
        y = np.array(load_from_disk(
-          os.path.join(self.data_dir, row['y-transformed'])))
+            os.path.join(dataset.data_dir, row['y-transformed'])))
        w = np.array(load_from_disk(
-          os.path.join(self.data_dir, row['w-transformed'])))
+            os.path.join(dataset.data_dir, row['w-transformed'])))
        ids = np.array(load_from_disk(
-          os.path.join(self.data_dir, row['ids'])), dtype=object)
+            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        yield (X, y, w, ids)
    return iterate(self)

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False,
                  pad_batches=False):
    """Returns minibatches from dataset randomly."""
    num_shards = self.get_number_shards()
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    """
    def iterate(dataset):
      num_shards = dataset.get_number_shards()
      if not deterministic:
        shard_perm = np.random.permutation(num_shards)
      else:
        shard_perm = np.arange(num_shards)
      for i in range(num_shards):
      X, y, w, ids = self.get_shard(shard_perm[i])
        X, y, w, ids = dataset.get_shard(shard_perm[i])
        n_samples = X.shape[0]
        if not deterministic:
          sample_perm = np.random.permutation(n_samples)
@@ -346,6 +541,21 @@ class Dataset(object):
            (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
              shard_batch_size, X_batch, y_batch, w_batch, ids_batch)
          yield (X_batch, y_batch, w_batch, ids_batch)
    return iterate(self)

  def itersamples(self):
    """Get an object that iterates over the samples in the dataset.

    Example:
    >>> for x, y, w, id in dataset.itersamples():
    ...   print(x, y, w, id)
    """
    def iterate(dataset):
        for (X_shard, y_shard, w_shard, ids_shard) in dataset.itershards():
            n_samples = X_shard.shape[0]
            for i in range(n_samples):
                yield (X_shard[i], y_shard[i], w_shard[i], ids_shard[i])
    return iterate(self)
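
# The nested iterate(dataset) wrapper used throughout this refactor keeps each
# method an ordinary function that hands back a fresh generator on every call,
# rather than being a generator itself. A minimal sketch of the pattern
# (illustrative only; make_counter is a hypothetical stand-in):
def make_counter(n):
  def iterate():
    for i in range(n):
      yield i
  return iterate()

assert list(make_counter(3)) == [0, 1, 2]
assert list(make_counter(3)) == [0, 1, 2]  # each call yields a fresh iterator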

  def reshard(self, shard_size):
    """Reshards data to have specified shard size."""
@@ -370,16 +580,16 @@ class Dataset(object):
        w_batch, w_next = w_next[:shard_size], w_next[shard_size:]
        ids_batch, ids_next = ids_next[:shard_size], ids_next[shard_size:]
        new_basename = "reshard-%d" % ind
-        new_metadata.append(Dataset.write_data_to_disk(
+        new_metadata.append(DiskDataset.write_data_to_disk(
            reshard_dir, new_basename, tasks, X_batch, y_batch, w_batch, ids_batch))
        ind += 1
    # Handle spillover from last shard
    new_basename = "reshard-%d" % ind
-    new_metadata.append(Dataset.write_data_to_disk(
+    new_metadata.append(DiskDataset.write_data_to_disk(
        reshard_dir, new_basename, tasks, X_next, y_next, w_next, ids_next))
    ind += 1
    # Get new metadata rows
-    resharded_dataset = Dataset(
+    resharded_dataset = DiskDataset(
        data_dir=reshard_dir, tasks=tasks, metadata_rows=new_metadata,
        verbosity=self.verbosity)
    shutil.rmtree(self.data_dir)
@@ -404,7 +614,7 @@ class Dataset(object):
    if tasks is None:
      tasks = np.arange(n_tasks)
    raw_data = (ids, X, y, w)
-    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data,
+    return DiskDataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data,
                   verbosity=verbosity,
                   compute_feature_statistics=compute_feature_statistics)

@@ -416,12 +626,12 @@ class Dataset(object):
    Xs, ys, ws, all_ids = [], [], [], []
    metadata_rows = []
    for ind, dataset in enumerate(datasets):
-      X, y, w, ids = dataset.to_numpy()
+      X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
      basename = "dataset-%d" % ind
      tasks = dataset.get_task_names()
      metadata_rows.append(
-          Dataset.write_data_to_disk(merge_dir, basename, tasks, X, y, w, ids))
+          DiskDataset.write_data_to_disk(merge_dir, basename, tasks, X, y, w, ids))
-    return Dataset(data_dir=merge_dir,
+    return DiskDataset(data_dir=merge_dir,
                   metadata_rows=metadata_rows,
                   verbosity=dataset.verbosity)

@@ -436,9 +646,9 @@ class Dataset(object):
        continue
      X, y, w, ids = self.get_shard(shard_num)
      basename = "dataset-%d" % shard_num
-      metadata_rows.append(Dataset.write_data_to_disk(
+      metadata_rows.append(DiskDataset.write_data_to_disk(
          subset_dir, basename, tasks, X, y, w, ids))
-    return Dataset(data_dir=subset_dir,
+    return DiskDataset(data_dir=subset_dir,
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)

@@ -540,14 +750,14 @@ class Dataset(object):
        X_i, y_i, w_i, ids_i = X[:n_i], y[:n_i], w[:n_i], ids[:n_i]
        X_j, y_j, w_j, ids_j = X[n_i:], y[n_i:], w[n_i:], ids[n_i:]

-        Dataset.write_data_to_disk(
+        DiskDataset.write_data_to_disk(
            self.data_dir, basename_i, tasks, X_i, y_i, w_i, ids_i)
-        Dataset.write_data_to_disk(
+        DiskDataset.write_data_to_disk(
            self.data_dir, basename_j, tasks, X_j, y_j, w_j, ids_j)
        assert len(self) == len_data
      # Now shuffle order of rows in metadata_df
      random.shuffle(metadata_rows)
-      self.metadata_df = Dataset.construct_metadata(metadata_rows)
+      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()

  def shuffle_each_shard(self):
@@ -564,14 +774,14 @@ class Dataset(object):
      permutation = np.random.permutation(n)
      X, y, w, ids = (X[permutation], y[permutation],
                      w[permutation], ids[permutation])
-      Dataset.write_data_to_disk(
+      DiskDataset.write_data_to_disk(
          self.data_dir, basename, tasks, X, y, w, ids)

  def shuffle_shards(self):
    """Shuffles the order of the shards for this dataset."""
    metadata_rows = self.metadata_df.values.tolist()
    random.shuffle(metadata_rows)
-    self.metadata_df = Dataset.construct_metadata(metadata_rows)
+    self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
    self.save_to_disk()

  def get_shard(self, i):
@@ -591,7 +801,7 @@ class Dataset(object):
    """Writes data shard to disk"""
    basename = "shard-%d" % shard_num 
    tasks = self.get_task_names()
-    Dataset.write_data_to_disk(self.data_dir, basename, tasks, X, y, w, ids)
+    DiskDataset.write_data_to_disk(self.data_dir, basename, tasks, X, y, w, ids)

  def set_verbosity(self, new_verbosity):
    """Sets verbosity."""
@@ -616,7 +826,7 @@ class Dataset(object):
      os.makedirs(select_dir)
    # Handle edge case with empty indices
    if not len(indices):
-      return Dataset(
+      return DiskDataset(
          data_dir=select_dir, metadata_rows=[], verbosity=self.verbosity)
    indices = np.array(sorted(indices)).astype(int)
    count, indices_count = 0, 0
@@ -638,72 +848,20 @@ class Dataset(object):
      ids_sel = ids[shard_inds]
      basename = "dataset-%d" % shard_num
      metadata_rows.append(
-          Dataset.write_data_to_disk(
+          DiskDataset.write_data_to_disk(
              select_dir, basename, tasks,
              X_sel, y_sel, w_sel, ids_sel,
              compute_feature_statistics=compute_feature_statistics))
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
-    return Dataset(data_dir=select_dir,
+    return DiskDataset(data_dir=select_dir,
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)

  def to_singletask(self, task_dirs):
    """Transforms multitask dataset in collection of singletask datasets."""
    tasks = self.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", self.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      log("Processing shard %d" % shard_num, self.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, self.verbosity)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        if X_nonzero.size > 0: 
          task_metadata_rows[task].append(
            Dataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))
    
    task_datasets = [
        Dataset(data_dir=task_dirs[task_num],
                metadata_rows=task_metadata_rows[task],
                verbosity=self.verbosity)
        for (task_num, task) in enumerate(tasks)]
    return task_datasets
    
  def to_numpy(self):
    """
    Transforms internal data into arrays X, y, w

    Creates three arrays containing all data in this object. This operation is
    dangerous (!) for large datasets which don't fit into memory.
    """
    Xs, ys, ws, ids = [], [], [], []
    for (X_b, y_b, w_b, ids_b) in self.itershards():
      Xs.append(X_b)
      ys.append(y_b)
      ws.append(w_b)
      ids.append(np.atleast_1d(np.squeeze(ids_b)))
    return (np.vstack(Xs), np.vstack(ys), np.vstack(ws),
            np.concatenate(ids))

-  def get_ids(self):
-    """
-    Returns all molecule-ids for this dataset.
-    """
+  @property
+  def ids(self):
+    """Get the ids vector for this dataset as a single numpy array."""
    if len(self) == 0:
      return np.array([])
    ids = []
@@ -711,19 +869,25 @@ class Dataset(object):
      ids.append(np.atleast_1d(np.squeeze(ids_b)))
    return np.concatenate(ids)

-  def get_labels(self):
-    """
-    Returns all labels for this dataset.
-    """
+  @property
+  def X(self):
+    """Get the X vector for this dataset as a single numpy array."""
    Xs = []
    for (X_b, _, _, _) in self.itershards():
      Xs.append(X_b)
    return np.vstack(Xs)

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    ys = []
    for (_, y_b, _, _) in self.itershards():
      ys.append(y_b)
    return np.vstack(ys)

-  def get_weights(self):
-    """
-    Returns all weights for this dataset.
-    """
+  @property
+  def w(self):
+    """Get the weight vector for this dataset as a single numpy array."""
    ws = []
    for (_, _, w_b, _) in self.itershards():
      ws.append(np.array(w_b))
@@ -767,102 +931,6 @@ class Dataset(object):
    """Return pandas series of label stds."""
    return self.metadata_df["y_stds"]

  def get_statistics(self, X_stats=True, y_stats=True):
    """Computes and returns statistics of this dataset"""
    if len(self) == 0:
      return None, None, None, None
    self.update_moments(X_stats, y_stats)
    df = self.metadata_df
    if X_stats and not y_stats:
      X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return X_means, X_stds
    elif y_stats and not X_stats:
      y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return y_means, y_stds
    elif X_stats and y_stats:
      X_means, X_stds = self._compute_mean_and_std(
          df, X_stats=True, y_stats=False)
      y_means, y_stds = self._compute_mean_and_std(
          df, X_stats=False, y_stats=True)
      return X_means, X_stds, y_means, y_stds
    else:
      return None

  def _compute_mean_and_std(self, df, X_stats, y_stats):
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """

    if X_stats:
      X_sums = []
      X_sum_squares = []
      X_n = []
      for _, row in df.iterrows():
        Xs = load_from_disk(os.path.join(self.data_dir, row['X_sums']))
        Xss = load_from_disk(os.path.join(self.data_dir, row['X_sum_squares']))
        Xn = load_from_disk(os.path.join(self.data_dir, row['X_n']))
        X_sums.append(np.array(Xs))
        X_sum_squares.append(np.array(Xss))
        X_n.append(np.array(Xn))

      # Note that X_n is a list of floats
      n = float(np.sum(X_n))
      X_sums = np.vstack(X_sums)
      X_sum_squares = np.vstack(X_sum_squares)
      overall_X_sums = np.sum(X_sums, axis=0)
      overall_X_means = overall_X_sums / n
      overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

      X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
      return overall_X_means, np.sqrt(X_vars)

    if y_stats:
      y_sums = []
      y_sum_squares = []
      y_n = []
      for _, row in df.iterrows():
        ys = load_from_disk(os.path.join(self.data_dir, row['y_sums']))
        yss = load_from_disk(os.path.join(self.data_dir, row['y_sum_squares']))
        yn = load_from_disk(os.path.join(self.data_dir, row['y_n']))
        y_sums.append(np.array(ys))
        y_sum_squares.append(np.array(yss))
        y_n.append(np.array(yn))

      # Note y_n is a list of arrays of shape (n_tasks,)
      y_n = np.sum(y_n, axis=0)
      y_sums = np.vstack(y_sums)
      y_sum_squares = np.vstack(y_sum_squares)
      y_means = np.sum(y_sums, axis=0)/y_n
      y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
      return y_means, np.sqrt(y_vars)
  
  def update_moments(self, X_stats, y_stats):
    """Re-compute statistics of this dataset during transformation"""
    df = self.metadata_df
    self._update_mean_and_std(df, X_stats, y_stats)

  def _update_mean_and_std(self, df, X_stats, y_stats):
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
        Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
        Xs = np.sum(Xt,axis=0)
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))

    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
        yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
        ys = np.sum(yt,axis=0)
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))

  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset

+5 −5
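
# The loader diffs that follow all apply one pattern: the class rename
# Dataset -> DiskDataset and the replacement of to_numpy() with the new
# X/y/w/ids properties. A hedged before/after sketch (assumes deepchem with
# this PR applied; the data here is made up for illustration):
import tempfile
import numpy as np
from deepchem.datasets import DiskDataset  # was: from deepchem.datasets import Dataset

data_dir = tempfile.mkdtemp()
X, y = np.random.rand(8, 3), np.random.rand(8, 1)
w, ids = np.ones_like(y), np.arange(8)

# was: dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
# was: X, y, w, ids = dataset.to_numpy()
X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids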
@@ -9,7 +9,7 @@ import os
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
@@ -59,7 +59,7 @@ def load_muv(base_dir, reload=True, frac_train=.8):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
-    dataset = Dataset(data_dir, reload=True)
+    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
@@ -69,7 +69,7 @@ def load_muv(base_dir, reload=True, frac_train=.8):
    for transformer in transformers:
        transformer.transform(dataset)

-  X, y, w, ids = dataset.to_numpy()
+  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  num_train = int(frac_train * len(dataset))
  MUV_tasks = MUV_tasks[:num_tasks]
@@ -80,9 +80,9 @@ def load_muv(base_dir, reload=True, frac_train=.8):
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

-  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, MUV_tasks)
-  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, MUV_tasks)
  
  return MUV_tasks, (train_dataset, valid_dataset), transformers
+5 −5

File changed; preview collapsed (size limit exceeded).

+3 −3
@@ -12,7 +12,7 @@ import pandas as pd
import shutil
from rdkit import Chem
from deepchem.utils.save import load_from_disk
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
@@ -109,7 +109,7 @@ def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  X = np.array(features, dtype=object)
  w = np.ones_like(y)
   
-  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
+  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
@@ -176,7 +176,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  X = np.vstack(features)
  w = np.ones_like(y)
   
-  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
+  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
+41 −43

File changed; preview collapsed (size limit exceeded).