Commit 00fc4ed9 authored by nd-02110114's avatar nd-02110114
Browse files

Merge branch 'master' into gat-pyg-2

parents 313d3a60 de78012e
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -32,18 +32,18 @@ install:
  - conda update -q conda
  - bash scripts/install_deepchem_conda.sh cpu
  - conda activate deepchem
  - python setup.py install
  - pip install -e .
script:
  - bash devtools/run_yapf.sh
  - bash devtools/run_flake8.sh
  - mypy -p deepchem
  - pytest -m "not slow" --cov=deepchem deepchem
  - pytest -v -m "not slow" --cov=deepchem deepchem
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
      cd docs && pip install -r requirements.txt;
      make clean html && cd ..;
    fi
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
      find ./deepchem -name "*.py" ! -name '*load_dataset_template.py' | xargs python -m doctest -v;
      pytest -v --ignore-glob='deepchem/**/test*.py' --doctest-modules deepchem;
    fi
after_success:
  - echo $TRAVIS_SECURE_ENV_VARS
+732 −429

File changed.

Preview size limit exceeded, changes collapsed.

+63 −40
Original line number Diff line number Diff line
from typing import List, Union
import numpy as np
import torch

from deepchem.utils.save import load_image_files
from deepchem.data.datasets import NumpyDataset, DiskDataset, ImageDataset


class _TorchNumpyDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, numpy_dataset: NumpyDataset, epochs: int,
               deterministic: bool):
  def __init__(self,
               numpy_dataset: NumpyDataset,
               epochs: int,
               deterministic: bool,
               batch_size: int = None):
    """
    Parameters
    ----------
@@ -20,10 +21,14 @@ class _TorchNumpyDataset(torch.utils.data.IterableDataset): # type: ignore
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    batch_size: int
      the number of samples to return in each batch.  If None, each returned
      value is a single sample.
    """
    self.numpy_dataset = numpy_dataset
    self.epochs = epochs
    self.deterministic = deterministic
    self.batch_size = batch_size

  def __iter__(self):
    n_samples = self.numpy_dataset._X.shape[0]
@@ -38,16 +43,28 @@ class _TorchNumpyDataset(torch.utils.data.IterableDataset): # type: ignore
      if self.deterministic:
        order = first_sample + np.arange(last_sample - first_sample)
      else:
        order = first_sample + np.random.permutation(last_sample - first_sample)
        # Ensure that every worker will pick the same random order for each epoch.
        random = np.random.RandomState(epoch)
        order = random.permutation(n_samples)[first_sample:last_sample]
      if self.batch_size is None:
        for i in order:
          yield (self.numpy_dataset._X[i], self.numpy_dataset._y[i],
                 self.numpy_dataset._w[i], self.numpy_dataset._ids[i])
      else:
        for i in range(0, len(order), self.batch_size):
          indices = order[i:i + self.batch_size]
          yield (self.numpy_dataset._X[indices], self.numpy_dataset._y[indices],
                 self.numpy_dataset._w[indices],
                 self.numpy_dataset._ids[indices])


class _TorchDiskDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, disk_dataset: DiskDataset, epochs: int,
               deterministic: bool):
  def __init__(self,
               disk_dataset: DiskDataset,
               epochs: int,
               deterministic: bool,
               batch_size: int = None):
    """
    Parameters
    ----------
@@ -58,10 +75,14 @@ class _TorchDiskDataset(torch.utils.data.IterableDataset): # type: ignore
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    batch_size: int
      the number of samples to return in each batch.  If None, each returned
      value is a single sample.
    """
    self.disk_dataset = disk_dataset
    self.epochs = epochs
    self.deterministic = deterministic
    self.batch_size = batch_size

  def __iter__(self):
    worker_info = torch.utils.data.get_worker_info()
@@ -76,17 +97,25 @@ class _TorchDiskDataset(torch.utils.data.IterableDataset): # type: ignore
      return

    shard_indices = list(range(first_shard, last_shard))
    for epoch in range(self.epochs):
    for X, y, w, ids in self.disk_dataset._iterbatches_from_shards(
          shard_indices, deterministic=self.deterministic):
        shard_indices,
        batch_size=self.batch_size,
        epochs=self.epochs,
        deterministic=self.deterministic):
      if self.batch_size is None:
        for i in range(X.shape[0]):
          yield (X[i], y[i], w[i], ids[i])
      else:
        yield (X, y, w, ids)


class _TorchImageDataset(torch.utils.data.IterableDataset):  # type: ignore

  def __init__(self, image_dataset: ImageDataset, epochs: int,
               deterministic: bool):
  def __init__(self,
               image_dataset: ImageDataset,
               epochs: int,
               deterministic: bool,
               batch_size: int = None):
    """
    Parameters
    ----------
@@ -97,10 +126,14 @@ class _TorchImageDataset(torch.utils.data.IterableDataset): # type: ignore
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
    batch_size: int
      the number of samples to return in each batch.  If None, each returned
      value is a single sample.
    """
    self.image_dataset = image_dataset
    self.epochs = epochs
    self.deterministic = deterministic
    self.batch_size = batch_size

  def __iter__(self):
    n_samples = self.image_dataset._X_shape[0]
@@ -115,28 +148,18 @@ class _TorchImageDataset(torch.utils.data.IterableDataset): # type: ignore
      if self.deterministic:
        order = first_sample + np.arange(last_sample - first_sample)
      else:
        order = first_sample + np.random.permutation(last_sample - first_sample)
        # Ensure that every worker will pick the same random order for each epoch.
        random = np.random.RandomState(epoch)
        order = random.permutation(n_samples)[first_sample:last_sample]
      if self.batch_size is None:
        for i in order:
        yield (self._get_image(self.image_dataset._X, i),
               self._get_image(self.image_dataset._y, i),
          yield (self.image_dataset._get_image(self.image_dataset._X, i),
                 self.image_dataset._get_image(self.image_dataset._y, i),
                 self.image_dataset._w[i], self.image_dataset._ids[i])

  def _get_image(self, array: Union[np.ndarray, List[str]],
                 index: int) -> np.ndarray:
    """Method for loading an image

    Parameters
    ----------
    array: Union[np.ndarray, List[str]]
      A numpy array which contains images or List of image filenames
    index: int
      Index you want to get the image

    Returns
    -------
    np.ndarray
      Loaded image
    """
    if isinstance(array, np.ndarray):
      return array[index]
    return load_image_files([array[index]])[0]
      else:
        for i in range(0, len(order), self.batch_size):
          indices = order[i:i + self.batch_size]
          yield (self.image_dataset._get_image(self.image_dataset._X, indices),
                 self.image_dataset._get_image(self.image_dataset._y, indices),
                 self.image_dataset._w[indices],
                 self.image_dataset._ids[indices])
+19 −23
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ import numpy as np
import deepchem as dc

try:
  import torch
  import torch  # noqa
  PYTORCH_IMPORT_FAILED = False
except ImportError:
  PYTORCH_IMPORT_FAILED = True
@@ -272,27 +272,6 @@ def test_reshard():
  np.testing.assert_array_equal(ids, ids_rr)


def test_select():
  """Check that selecting rows from a DiskDataset returns matching data."""
  n_points, n_feats, n_tasks = 10, 10, 1
  X = np.random.rand(n_points, n_feats)
  y = np.random.randint(2, size=(n_points, n_tasks))
  w = np.ones((n_points, n_tasks))
  ids = np.array(["id"] * n_points)
  full = dc.data.DiskDataset.from_numpy(X, y, w, ids)

  picked = [0, 4, 5, 8]
  subset = full.select(picked)
  # Read every field of the selected dataset up front, then compare each one
  # against the same rows of the arrays the dataset was built from.
  pairs = (
      (X, subset.X),
      (y, subset.y),
      (w, subset.w),
      (ids, subset.ids),
  )
  for source, selected in pairs:
    np.testing.assert_array_equal(source[picked], selected)


def test_complete_shuffle():
  shard_sizes = [1, 2, 3, 4, 5]

@@ -742,9 +721,26 @@ def _validate_pytorch_dataset(dataset):
    id_count[iter_id] += 1
  assert all(id_count[id] == 2 for id in ids)

  # Test iterating in batches.

  ds = dataset.make_pytorch_dataset(epochs=2, deterministic=False, batch_size=7)
  id_to_index = dict((id, i) for i, id in enumerate(ids))
  id_count = dict((id, 0) for id in ids)
  for iter_X, iter_y, iter_w, iter_id in ds:
    size = len(iter_id)
    assert size <= 7
    for i in range(size):
      j = id_to_index[iter_id[i]]
      np.testing.assert_array_equal(X[j, :], iter_X[i])
      np.testing.assert_array_equal(y[j, :], iter_y[i])
      np.testing.assert_array_equal(w[j, :], iter_w[i])
      id_count[iter_id[i]] += 1
  assert all(id_count[id] == 2 for id in ids)

  # Test iterating with multiple workers.

  import torch
  import torch  # noqa
  ds = dataset.make_pytorch_dataset(epochs=2, deterministic=False)
  loader = torch.utils.data.DataLoader(ds, num_workers=3)
  id_count = dict((id, 0) for id in ids)
  for iter_X, iter_y, iter_w, iter_id in loader:
+0 −8
Original line number Diff line number Diff line
import os
import shutil
import logging
import unittest
import tempfile
import deepchem as dc
import numpy as np
from sklearn.ensemble import RandomForestClassifier

logger = logging.getLogger(__name__)

@@ -19,10 +15,6 @@ class TestDrop(unittest.TestCase):

  def test_drop(self):
    """Test on dataset where RDKit fails on some strings."""
    # Set some global variables up top
    reload = True
    len_full = 25

    current_dir = os.path.dirname(os.path.realpath(__file__))
    logger.info("About to load emols dataset.")
    dataset_file = os.path.join(current_dir, "mini_emols.csv")
Loading