Unverified Commit 37e61ee9 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2082 from nd-02110114/fix-windows-ci

Fix windows ci
parents 6c70a681 a244ab73
Loading
Loading
Loading
Loading
+57 −106
Original line number Diff line number Diff line
@@ -9,18 +9,16 @@ import numpy as np
import pandas as pd
import random
import logging
from pandas import read_hdf
import tempfile
import time
import shutil
import json
import warnings
import multiprocessing
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from ast import literal_eval as make_tuple

from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union
from deepchem.utils.typing import OneOrMany, Shape

Batch = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
@@ -516,6 +514,7 @@ class Dataset(object):

    Returns
    -------
    torch.utils.data.IterableDataset
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.
    """
@@ -870,34 +869,21 @@ class NumpyDataset(Dataset):
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    """
    import torch

    def iterate():
      n_samples = self._X.shape[0]
      worker_info = torch.utils.data.get_worker_info()
      if worker_info is None:
        first_sample = 0
        last_sample = n_samples
      else:
        first_sample = worker_info.id * n_samples // worker_info.num_workers
        last_sample = (
            worker_info.id + 1) * n_samples // worker_info.num_workers
      for epoch in range(epochs):
        if deterministic:
          order = first_sample + np.arange(last_sample - first_sample)
        else:
          order = first_sample + np.random.permutation(last_sample -
                                                       first_sample)
        for i in order:
          yield (self._X[i], self._y[i], self._w[i], self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
    Returns
    -------
    torch.utils.data.IterableDataset
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.
    """
    try:
      from deepchem.data.pytorch_datasets import _TorchNumpyDataset
    except:
      raise ValueError("This method requires PyTorch to be installed.")

    return TorchDataset()
    pytorch_ds = _TorchNumpyDataset(
        numpy_dataset=self, epochs=epochs, deterministic=deterministic)
    return pytorch_ds

  @staticmethod
  def from_DiskDataset(ds: "DiskDataset") -> "NumpyDataset":
@@ -958,6 +944,15 @@ class NumpyDataset(Dataset):
    return NumpyDataset(X, y, w, ids, n_tasks=y.shape[1])


class _Shard(object):

  def __init__(self, X, y, w, ids):
    self.X = X
    self.y = y
    self.w = w
    self.ids = ids


class DiskDataset(Dataset):
  """
  A Dataset that is stored as a set of files on disk.
@@ -1135,7 +1130,7 @@ class DiskDataset(Dataset):
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
    except Exception:
      pass

    # Load obsolete format -> save in new format
@@ -1482,8 +1477,8 @@ class DiskDataset(Dataset):
      # than process based pools, since process based pools need to pickle/serialize
      # objects as an extra overhead. Also, as hideously as un-thread safe this looks,
      # we're actually protected by the GIL.
      pool = multiprocessing.dummy.Pool(
          1)  # mp.dummy aliases ThreadPool to Pool
      # mp.dummy aliases ThreadPool to Pool
      pool = multiprocessing.dummy.Pool(1)

      if batch_size is None:
        num_global_batches = num_shards
@@ -1703,33 +1698,21 @@ class DiskDataset(Dataset):
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    """
    import torch

    def iterate():
      worker_info = torch.utils.data.get_worker_info()
      n_shards = self.get_number_shards()
      if worker_info is None:
        first_shard = 0
        last_shard = n_shards
      else:
        first_shard = worker_info.id * n_shards // worker_info.num_workers
        last_shard = (worker_info.id + 1) * n_shards // worker_info.num_workers
      if first_shard == last_shard:
        return
      shard_indices = list(range(first_shard, last_shard))
      for epoch in range(epochs):
        for X, y, w, ids in self._iterbatches_from_shards(
            shard_indices, deterministic=deterministic):
          for i in range(X.shape[0]):
            yield (X[i], y[i], w[i], ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
    Returns
    -------
    torch.utils.data.IterableDataset
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.
    """
    try:
      from deepchem.data.pytorch_datasets import _TorchDiskDataset
    except:
      raise ValueError("This method requires PyTorch to be installed.")

    return TorchDataset()
    pytorch_ds = _TorchDiskDataset(
        disk_dataset=self, epochs=epochs, deterministic=deterministic)
    return pytorch_ds

  @staticmethod
  def from_numpy(X: np.ndarray,
@@ -1962,14 +1945,6 @@ class DiskDataset(Dataset):
  def get_shard(self, i: int) -> Batch:
    """Retrieves data for the i-th shard from disk."""

    class Shard(object):

      def __init__(self, X, y, w, ids):
        self.X = X
        self.y = y
        self.w = w
        self.ids = ids

    # See if we have a cached copy of this shard.
    if self._cached_shards is None:
      self._cached_shards = [None] * self.get_number_shards()
@@ -2010,7 +1985,7 @@ class DiskDataset(Dataset):
    # shard again before the next time we want this one.  So just cache as many
    # as we can and then stop.

    shard = Shard(X, y, w, ids)
    shard = _Shard(X, y, w, ids)
    shard_size = X.nbytes + ids.nbytes
    if y is not None:
      shard_size += y.nbytes
@@ -2526,42 +2501,18 @@ class ImageDataset(Dataset):

    Returns
    -------
    `torch.utils.data.IterableDataset` iterating over the same data as
    torch.utils.data.IterableDataset
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.
    """
    import torch

    def get_image(array, index):
      if isinstance(array, np.ndarray):
        return array[index]
      return dc.data.ImageLoader.load_img([array[index]])[0]

    def iterate():
      n_samples = self._X_shape[0]
      worker_info = torch.utils.data.get_worker_info()
      if worker_info is None:
        first_sample = 0
        last_sample = n_samples
      else:
        first_sample = worker_info.id * n_samples // worker_info.num_workers
        last_sample = (
            worker_info.id + 1) * n_samples // worker_info.num_workers
      for epoch in range(epochs):
        if deterministic:
          order = first_sample + np.arange(last_sample - first_sample)
        else:
          order = first_sample + np.random.permutation(last_sample -
                                                       first_sample)
        for i in order:
          yield (get_image(self._X, i), get_image(self._y, i), self._w[i],
                 self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
    try:
      from deepchem.data.pytorch_datasets import _TorchImageDataset
    except:
      raise ValueError("This method requires PyTorch to be installed.")

    return TorchDataset()
    pytorch_ds = _TorchImageDataset(
        image_dataset=self, epochs=epochs, deterministic=deterministic)
    return pytorch_ds


class Databag(object):
+142 −0
Original line number Diff line number Diff line
from typing import List, Union
import numpy as np
import torch

from deepchem.data.data_loader import ImageLoader
from deepchem.data.datasets import NumpyDataset, DiskDataset, ImageDataset


class _TorchNumpyDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, numpy_dataset: NumpyDataset, epochs: int,
               deterministic: bool):
    """
    Parameters
    ----------
    numpy_dataset: NumpyDataset
      The original NumpyDataset which you want to convert to PyTorch Dataset
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    """
    self.numpy_dataset = numpy_dataset
    self.epochs = epochs
    self.deterministic = deterministic

  def __iter__(self):
    n_samples = self.numpy_dataset._X.shape[0]
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
      first_sample = 0
      last_sample = n_samples
    else:
      first_sample = worker_info.id * n_samples // worker_info.num_workers
      last_sample = (worker_info.id + 1) * n_samples // worker_info.num_workers
    for epoch in range(self.epochs):
      if self.deterministic:
        order = first_sample + np.arange(last_sample - first_sample)
      else:
        order = first_sample + np.random.permutation(last_sample - first_sample)
      for i in order:
        yield (self.numpy_dataset._X[i], self.numpy_dataset._y[i],
               self.numpy_dataset._w[i], self.numpy_dataset._ids[i])


class _TorchDiskDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, disk_dataset: DiskDataset, epochs: int,
               deterministic: bool):
    """
    Parameters
    ----------
    disk_dataset: DiskDataset
      The original DiskDataset which you want to convert to PyTorch Dataset
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    """
    self.disk_dataset = disk_dataset
    self.epochs = epochs
    self.deterministic = deterministic

  def __iter__(self):
    worker_info = torch.utils.data.get_worker_info()
    n_shards = self.disk_dataset.get_number_shards()
    if worker_info is None:
      first_shard = 0
      last_shard = n_shards
    else:
      first_shard = worker_info.id * n_shards // worker_info.num_workers
      last_shard = (worker_info.id + 1) * n_shards // worker_info.num_workers
    if first_shard == last_shard:
      return

    shard_indices = list(range(first_shard, last_shard))
    for epoch in range(self.epochs):
      for X, y, w, ids in self.disk_dataset._iterbatches_from_shards(
          shard_indices, deterministic=self.deterministic):
        for i in range(X.shape[0]):
          yield (X[i], y[i], w[i], ids[i])


class _TorchImageDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, image_dataset: ImageDataset, epochs: int,
               deterministic: bool):
    """
    Parameters
    ----------
    image_dataset: ImageDataset
      The original ImageDataset which you want to convert to PyTorch Dataset
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    """
    self.image_dataset = image_dataset
    self.epochs = epochs
    self.deterministic = deterministic

  def __iter__(self):
    n_samples = self.image_dataset._X_shape[0]
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
      first_sample = 0
      last_sample = n_samples
    else:
      first_sample = worker_info.id * n_samples // worker_info.num_workers
      last_sample = (worker_info.id + 1) * n_samples // worker_info.num_workers
    for epoch in range(self.epochs):
      if self.deterministic:
        order = first_sample + np.arange(last_sample - first_sample)
      else:
        order = first_sample + np.random.permutation(last_sample - first_sample)
      for i in order:
        yield (self._get_image(self.image_dataset._X, i),
               self._get_image(self.image_dataset._y, i),
               self.image_dataset._w[i], self.image_dataset._ids[i])

  def _get_image(self, array: Union[np.ndarray, List[str]],
                 index: int) -> np.ndarray:
    """Method for loading an image

    Parameters
    ----------
    array: Union[np.ndarray, List[str]]
      A numpy array which contains images or List of image filenames
    index: int
      Index you want to get the image

    Returns
    -------
    np.ndarray
      Loaded image
    """
    if isinstance(array, np.ndarray):
      return array[index]
    return ImageLoader.load_img([array[index]])[0]
+2 −10
Original line number Diff line number Diff line
@@ -4,14 +4,9 @@ Tests for dataset creation
import random
import math
import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc
import tensorflow as tf
import pandas as pd
from tensorflow.python.framework import test_util

try:
  import torch
@@ -25,7 +20,6 @@ def load_solubility_data():
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
@@ -107,7 +101,6 @@ def test_pad_features():
  """Test that pad_features pads features correctly."""
  batch_size = 100
  num_features = 10
  num_tasks = 5

  # Test cases where n_samples < 2*n_samples < batch_size
  n_samples = 29
@@ -302,7 +295,6 @@ def test_select():

def test_complete_shuffle():
  shard_sizes = [1, 2, 3, 4, 5]
  batch_size = 10

  all_Xs, all_ys, all_ws, all_ids = [], [], [], []

@@ -526,7 +518,7 @@ def test_disk_iterate_y_w_None():
  shard_sizes = [21, 11, 41, 21, 51]
  batch_size = 10

  all_Xs, all_ys, all_ws, all_ids = [], [], [], []
  all_Xs, all_ids = [], []

  def shard_generator():
    for sz in shard_sizes:
@@ -815,7 +807,7 @@ def test_to_str():
  assert str(dataset) == ref_str


class TestDatasets(test_util.TensorFlowTestCase):
class TestDatasets(unittest.TestCase):
  """
  Test basic top-level API for dataset objects.
  """
+2 −5
Original line number Diff line number Diff line
@@ -5,10 +5,7 @@ Tests to make sure deepchem models can fit models on easy datasets.
import sklearn
import sklearn.datasets
import numpy as np
import unittest
import tempfile
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

@@ -180,7 +177,7 @@ def test_sklearn_multitask_classification():
  model.save()
  # Eval model on test
  scores = model.evaluate(test_dataset, [classification_metric])
  assert scores['roc_auc_score'] > 0.5
  assert scores[classification_metric.name] > .5


def test_xgboost_regression():
@@ -247,7 +244,7 @@ def test_xgboost_multitask_regression():
  # Eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  score = scores[regression_metric.name]
  assert score < 50
  assert score < 55


def test_xgboost_classification():