Commit fd174158 authored by Boris Dayma's avatar Boris Dayma
Browse files

Merge branch 'master' into feat_wandb

parents 0ce820ff af2db874
Loading
Loading
Loading
Loading
+41 −44
Original line number Diff line number Diff line
jobs:
  include:
    - name: "Python 3.6"
  - name: Python 3.6
    language: python
      python: "3.6"
    python: '3.6'
    sudo: required
    dist: xenial

    - name: "Python 3.7"
  - name: Python 3.7
    language: python
      python: "3.7"
    python: '3.7'
    sudo: required
    dist: xenial

    - name: "Windows"
      language: c # Not really, but travis doesn't support python on Windows
      python: "3.7"
  - name: Windows
    language: c
    python: '3.7'
    os: windows

install:
  - if [[ "$TRAVIS_OS_NAME" != "windows" ]]; then
    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
    export python_version=$TRAVIS_PYTHON_VERSION;
    bash miniconda.sh -b -p $HOME/miniconda;
    source "$HOME/miniconda/etc/profile.d/conda.sh";
    fi
  - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then
    choco install miniconda3 --params="'/JustMe /AddToPath:1'";
    export PATH="/c/tools/miniconda3/:/c/tools/miniconda3/Scripts:/c/tools/miniconda3/Library/bin:$PATH";
    source /c/tools/miniconda3/etc/profile.d/conda.sh;
    fi
- if [[ "$TRAVIS_OS_NAME" != "windows" ]]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
  -O miniconda.sh; export python_version=$TRAVIS_PYTHON_VERSION; bash miniconda.sh
  -b -p $HOME/miniconda; source "$HOME/miniconda/etc/profile.d/conda.sh"; fi
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install miniconda3 --params="'/JustMe
  /AddToPath:1'"; export PATH="/c/tools/miniconda3/:/c/tools/miniconda3/Scripts:/c/tools/miniconda3/Library/bin:$PATH";
  source /c/tools/miniconda3/etc/profile.d/conda.sh; fi
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
@@ -40,10 +32,15 @@ install:
- python setup.py install
script:
- pytest -m "not slow" --cov=deepchem deepchem
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
    find ./deepchem | grep .py$ |xargs python -m doctest -v;
    fi
- if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then find ./deepchem | grep .py$ |xargs
  python -m doctest -v; fi
- bash devtools/travis-ci/test_format_code.sh
after_success:
- echo $TRAVIS_SECURE_ENV_VARS
- coveralls
deploy:
  provider: pypi
  username: __token__
  password:
    secure: b67LO8VZcoKEWo7gDlFdjS1yKUavCt578uAuXPyW6f+e+Tk/sEQRdkx1VYoZlQdfZQo8u4q+E3W184T+/j6ht65/cdy/HYH57LCQySjF/MY2M9+/lcP45aY7Z0F2QHeY9QgpRc8gKthGzgM/bHj2glxlEvT1diItEEoGqE2x/fw1K25cNOni08E4hqz0HPY1SXVwd8/9Z/t1YasrBcOjtJ8kcbyjnmeyhjfkaV/aTaAzuqh2MlqZTSz3dhwsBrZfZp86+8T2TgcoDSuIxCwb777QKW1QlvNyLEKlnfateKMYqrrP65oHrxXEEcHd/N3IH28Bz9wVnENjHLkGJ0vXyXyEWcJFe+V6T0k/8NkZamU4SZE5BM4v6mOdThs4l54vuFajctHDeGgIDjL55MfkDmkKd5lAvlWPwrdw8DERsmqetUfZ/TG7FE6/MT1puu2ffu3A9Ivcch5T46pojIggDWHHn9hUsc6iD3Ov7rVqd024Lzm9V8wXiDYU9EMqAu5lJQRIOO/hnr8Gn6zYRCE1n29MKuNJwauSHfdV/mBTRyOjZyWHSGNaiPw2hqE3tZrrIN4koEYaZiERRVnmVt8wMUTj40YglosTHYpL91SkDH/ResX1rtHKs4Am+R+MmcWULTUQ7UwEtqlsa3nVxTK9gfmJ0nX8Jhjtl2iRhVg5PP8=
  edge: true
+1 −0
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ DeepChem has a number of "soft" requirements. These are packages which are neede
- [simdna](https://github.com/kundajelab/simdna)
- [XGBoost](https://xgboost.readthedocs.io/en/latest/)
- [Weights & Biases](https://docs.wandb.com/)
- [Tensorflow Probability](https://www.tensorflow.org/probability)

## Installation

+1 −1
Original line number Diff line number Diff line
"""
Imports all submodules
"""
__version__ = '2.3.0'
__version__ = '2.4.0-rc.1'

import deepchem.data
import deepchem.feat
+0 −122
Original line number Diff line number Diff line
"""
General API for testing dataset objects
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Load the example solubility dataset.

  Featurizes ../../models/tests/example.csv (relative to this file) with
  1024-bit circular fingerprints for the single "log-solubility" task.

  Returns
  -------
  The dataset produced by ``CSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  # NOTE: the previous unused `task_type` local was removed.
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_butina_data():
  """Load the example Butina dataset.

  Featurizes ../../models/tests/butina_example.csv (relative to this file)
  with 1024-bit circular fingerprints for the single "task" column.

  Returns
  -------
  The dataset produced by ``CSVLoader.featurize``.
  """
  # The old docstring incorrectly said "Loads solubility dataset"; this
  # helper loads the butina example file.
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_multitask_data():
  """Load the example multitask dataset (17 tasks, circular fingerprints)."""
  base_dir = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(base_dir,
                          "../../models/tests/multitask_example.csv")
  # Seventeen tasks named task0 .. task16.
  task_names = ["task%d" % i for i in range(17)]
  fingerprinter = dc.feat.CircularFingerprint(size=1024)
  csv_loader = dc.data.CSVLoader(
      tasks=task_names, smiles_field="smiles", featurizer=fingerprinter)
  return csv_loader.featurize(csv_path)


def load_classification_data():
  """Load classification data from example_classification.csv.

  Featurizes the file (relative to this module) with 1024-bit circular
  fingerprints for the single "outcome" task.

  Returns
  -------
  The dataset produced by ``CSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  # NOTE: the previous unused `task_type` local was removed.
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_sparse_multitask_dataset():
  """Load the sparse tox multitask sample dataset (9 tasks)."""
  here = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(here,
                          "../../models/tests/sparse_multitask_example.csv")
  # Nine tasks named task1 .. task9.
  task_names = ["task%d" % i for i in range(1, 10)]
  circular_fp = dc.feat.CircularFingerprint(size=1024)
  csv_loader = dc.data.CSVLoader(
      tasks=task_names, smiles_field="smiles", featurizer=circular_fp)
  return csv_loader.featurize(csv_path)


def load_feat_multitask_data():
  """Load the example dataset with user-defined numerical features and tasks."""
  here = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(here,
                          "../../models/tests/feat_multitask_example.csv")
  # Six feature columns feat0..feat5 and six task columns task0..task5.
  feature_names = ["feat%d" % i for i in range(6)]
  task_names = ["task%d" % i for i in range(6)]
  user_featurizer = dc.feat.UserDefinedFeaturizer(feature_names)
  csv_loader = dc.data.UserCSVLoader(
      tasks=task_names, featurizer=user_featurizer, id_field="id")
  return csv_loader.featurize(csv_path)


def load_gaussian_cdf_data():
  """Load the Gaussian-sampled example dataset.

  Each feature and task column holds values drawn from a standard normal
  distribution (mean 0, stdev 1).
  """
  here = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(here,
                          "../../models/tests/gaussian_cdf_example.csv")
  feature_names = ["feat0", "feat1"]
  task_names = ["task0", "task1"]
  user_featurizer = dc.feat.UserDefinedFeaturizer(feature_names)
  csv_loader = dc.data.UserCSVLoader(
      tasks=task_names, featurizer=user_featurizer, id_field="id")
  return csv_loader.featurize(csv_path)


def load_unlabelled_data():
  """Load the no-labels example dataset (empty task list)."""
  here = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(here, "../../data/tests/no_labels.csv")
  circular_fp = dc.feat.CircularFingerprint(size=1024)
  # An empty tasks list: the CSV is featurized without any label columns.
  csv_loader = dc.data.CSVLoader(
      tasks=[], smiles_field="smiles", featurizer=circular_fp)
  return csv_loader.featurize(csv_path)
+45 −16
Original line number Diff line number Diff line
@@ -24,6 +24,35 @@ except ImportError:
  PYTORCH_IMPORT_FAILED = True


def load_solubility_data():
  """Load the example solubility dataset (single "log-solubility" task)."""
  base_dir = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(base_dir, "../../models/tests/example.csv")
  fingerprinter = dc.feat.CircularFingerprint(size=1024)
  csv_loader = dc.data.CSVLoader(
      tasks=["log-solubility"],
      smiles_field="smiles",
      featurizer=fingerprinter)
  return csv_loader.create_dataset(csv_path)


def load_multitask_data():
  """Load the example multitask dataset (17 tasks, circular fingerprints).

  Returns
  -------
  The dataset produced by ``CSVLoader.create_dataset``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
      "task15", "task16"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  # Use create_dataset for consistency with load_solubility_data in this
  # file; featurize is the deprecated alias for the same operation.
  return loader.create_dataset(input_file)


class TestDatasets(test_util.TensorFlowTestCase):
  """
  Test basic top-level API for dataset objects.
@@ -172,10 +201,10 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_task_names(self):
    """Test that get_task_names returns correct task_names"""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_task_names() == ["log-solubility"]

    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    assert sorted(multitask_dataset.get_task_names()) == sorted([
        "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
        "task8", "task9", "task10", "task11", "task12", "task13", "task14",
@@ -184,20 +213,20 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_data_shape(self):
    """Test that get_data_shape returns correct data shape"""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_data_shape() == (1024,)

    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    assert multitask_dataset.get_data_shape() == (1024,)

  def test_len(self):
    """Test that len(dataset) works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert len(solubility_dataset) == 10

  def test_reshard(self):
    """Test that resharding the dataset works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    assert solubility_dataset.get_number_shards() == 1
@@ -302,7 +331,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_iterbatches(self):
    """Test that iterating over batches of data works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    batch_size = 2
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
@@ -331,7 +360,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_itersamples_disk(self):
    """Test that iterating over samples in a DiskDataset works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X = solubility_dataset.X
    y = solubility_dataset.y
    w = solubility_dataset.w
@@ -372,7 +401,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_transform_disk(self):
    """Test that the transform() method works for DiskDatasets."""
    dataset = dc.data.tests.load_solubility_data()
    dataset = load_solubility_data()
    X = dataset.X
    y = dataset.y
    w = dataset.w
@@ -394,7 +423,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_to_numpy(self):
    """Test that transformation to numpy arrays is sensible."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
@@ -409,7 +438,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_consistent_ordering(self):
    """Test that ordering of labels is consistent over time."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()

    ids1 = solubility_dataset.ids
    ids2 = solubility_dataset.ids
@@ -418,7 +447,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_statistics(self):
    """Test statistics computation of this dataset."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    X_means, y_means = np.mean(X, axis=0), np.mean(y, axis=0)
@@ -431,7 +460,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_allclose(comp_y_stds, y_stds)

  def test_disk_iterate_batch_size(self):
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    batch_sizes = []
@@ -656,7 +685,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
          np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

  def test_numpy_iterate_batch_size(self):
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
@@ -769,12 +798,12 @@ class TestDatasets(test_util.TensorFlowTestCase):
  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_disk(self):
    """Test creating a PyTorch Dataset from a DiskDataset."""
    dataset = dc.data.tests.load_solubility_data()
    dataset = load_solubility_data()
    self._validate_pytorch_dataset(dataset)

  def test_dataframe(self):
    """Test converting between Datasets and DataFrames."""
    dataset = dc.data.tests.load_solubility_data()
    dataset = load_solubility_data()

    # A round trip from Dataset to DataFrame to Dataset should produce identical arrays.

Loading