Commit 24acca18 authored by VIGNESHinZONE's avatar VIGNESHinZONE
Browse files

Merge branch 'master' into jax

parents 4a94792e 692a2ed7
Loading
Loading
Loading
Loading
+3 −4
Original line number Diff line number Diff line
@@ -29,7 +29,6 @@ jobs:
    - name: Build DeepChem
      run: |
        python -m pip install --upgrade pip
        pip install tensorflow==2.4
        pip install -e '.[jax]'
    - name: Import checking
      run: python -c "import deepchem; import jax;"
@@ -39,7 +38,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        os: [ubuntu-latest]
        python-version: [3.7]
        include:
          - os: ubuntu-latest
@@ -100,9 +99,9 @@ jobs:
        python -m pip install --upgrade pip;
        pip install conda-merge;
        if [ "$(uname)" == 'Darwin' ]; then
          conda-merge env_jax.yml env.test.yml > env.yml
          conda-merge requirements/jax/env_jax.yml env.test.yml > env.yml
        else
          conda-merge env_jax.yml env.test.yml > env.yml
          conda-merge requirements/jax/env_jax.yml env.test.yml > env.yml
        fi;
    - name: Install all dependencies
      uses: conda-incubator/setup-miniconda@v2
+1 −1
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ jobs:
    - name: PyTest
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: pytest -v -m "not slow and not jax" --cov=deepchem --cov-report=xml deepchem
      run: pytest -v -m "not slow and not jax and not torch" --cov=deepchem --cov-report=xml deepchem
    - name: Upload coverage to Codecov
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      uses: codecov/codecov-action@v1
+136 −0
Original line number Diff line number Diff line
# CI workflow: build DeepChem with its PyTorch extras and run the torch test suite.
name: Test for DeepChem Torch
on:
  push: # ci work when pushing master branch
    branches:
      - master
  pull_request: # ci work when creating a PR to master branch
    branches:
      - master
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Versions are quoted so YAML keeps them as strings
        # (unquoted 3.10 would parse as the float 3.1).
        python-version: ["3.7"]
    steps:
    - uses: actions/checkout@v2
    - name: Cache pip modules for Linux
      uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Build DeepChem
      run: |
        python -m pip install --upgrade pip
        pip install -e '.[torch]'
    - name: Import checking
      run: python -c "import deepchem; import torch;"

  test:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        python-version: ["3.7"]
        include:
          - os: ubuntu-latest
            python-version: "3.8"
    env:
      OS: ${{ matrix.os }}
      PYTHON_VERSION: ${{ matrix.python-version }}
    steps:
    - uses: actions/checkout@v2
      with:
        fetch-depth: 0
    # https://github.com/galaxyproject/tools-iuc/blob/master/.github/workflows/pr.yaml
    # The range of commits to check for changes is:
    # - for events on the master branch we compare against the sha before the event
    #   (note that this does not work for feature branch events since we want all
    #   commits on the feature branch and not just the commits of the last event)
    # - for pull requests we compare against the 1st ancestor, given the current
    #   HEAD is the merge between the PR branch and the base branch
    - name: Set commit range (push to the master branch, e.g. merge)
      if: github.ref == 'refs/heads/master' && github.event_name == 'push'
      run: echo "COMMIT_RANGE=${{ github.event.before }}.." >> $GITHUB_ENV
    - name: Set commit range (pull request)
      if: github.event_name == 'pull_request'
      run: |
        git fetch origin master
        echo "COMMIT_RANGE=origin/master..." >> $GITHUB_ENV
    # All three cache steps use runner.os in both key and restore-keys so the
    # restore-keys prefix can actually match the primary key (matrix.os values
    # like "windows-latest" never share a prefix with runner.os "Windows").
    - name: Cache pip packages for Linux
      if: runner.os == 'Linux'
      uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Cache pip packages for MacOS
      if: runner.os == 'macOS'
      uses: actions/cache@v2
      with:
        path: ~/Library/Caches/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Cache pip packages for Windows
      if: runner.os == 'Windows'
      uses: actions/cache@v2
      with:
        path: ~\AppData\Local\pip\Cache
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    # Merge the torch conda requirements with the shared test requirements.
    # The same merge is valid on every OS, so no platform branch is needed.
    - name: Create env.yml
      shell: bash
      run: |
        python -m pip install --upgrade pip;
        pip install conda-merge;
        conda-merge requirements/torch/env_torch.yml requirements/torch/env_torch.cpu.yml env.test.yml > env.yml
    - name: Install all dependencies
      uses: conda-incubator/setup-miniconda@v2
      with:
        miniconda-version: "latest"
        auto-update-conda: true
        activate-environment: deepchem
        channels: omnia,conda-forge,defaults
        python-version: ${{ matrix.python-version }}
        environment-file: env.yml
    - name: Install DeepChem
      id: install
      shell: bash -l {0}
      run: pip install -e '.[torch]'
    - name: Yapf (version 0.22.0)
      id: yapf
      # on Windows, yapf raise the strange error..., so ignore
      if: runner.os == 'Linux' || runner.os == 'macOS'
      shell: bash -l {0}
      run: |
        CHANGED_FILES=`git diff --name-only $COMMIT_RANGE | grep .py$ | grep -v contrib/ || true`
        if [ -n "$CHANGED_FILES" ]; then
          yapf -d $CHANGED_FILES
        fi
    - name: Flake8
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: source scripts/flake8_for_ci.sh
    - name: PyTest
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: pytest -v -m torch deepchem
+3 −0
Original line number Diff line number Diff line
@@ -67,6 +67,9 @@ target/
# Vim swap
*.swp

# Weights & Biases
wandb/

# Dataset files
datasets/2008-2011_USPTO_reactionSmiles_filtered.zip
datasets/2008-2011_USPTO_reactionSmiles_filtered/
+115 −8
Original line number Diff line number Diff line
@@ -14,9 +14,10 @@ import numpy as np

from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.utils.genomics_utils import encode_bio_sequence

logger = logging.getLogger(__name__)

@@ -875,9 +876,70 @@ class FASTALoader(DataLoader):
  learning tasks.
  """

  def __init__(self):
    """Initialize loader."""
    pass
  def __init__(self,
               featurizer: Optional[Featurizer] = None,
               auto_add_annotations: bool = False,
               legacy: bool = True):
    """Initialize a FASTALoader.

    Parameters
    ----------
    featurizer: Featurizer (default None)
      Featurizer applied to the loaded FASTA data.

      When ``featurizer`` is None and ``legacy`` is True, the original
      featurization logic is used, producing a one hot encoding of all
      included FASTA strings of shape
      (number of FASTA sequences, number of channels + 1, sequence length, 1).

      When ``featurizer`` is None and ``legacy`` is False, a default
      OneHotFeaturizer with charset ("A", "C", "T", "G") and
      max_length = None is constructed.
    auto_add_annotations: bool (default False)
      If True, create_dataset automatically adds [CLS] and [SEP]
      annotations to the sequences it reads to assist tokenization.
      Keep False if the FASTA file already carries these annotations.
    legacy: bool (default True)
      If True, use the legacy featurization path described above.
      Legacy mode is only tested for ACTGN charsets and is deprecated.

    Raises
    ------
    ValueError
      If ``legacy`` is True while ``featurizer`` is not None or
      ``auto_add_annotations`` is True (legacy mode supports neither).
    """
    # Legacy mode: emit the deprecation warning and reject options that
    # the legacy featurization path cannot honor.
    if legacy:
      warnings.warn(
          """
                    Legacy mode is deprecated and will be removed in
                    DeepChem 3.0. Disable legacy mode by passing legacy=False
                    during construction of FASTALoader object.
                    """, FutureWarning)
      if featurizer is not None or auto_add_annotations:
        raise ValueError(f"""
                          featurizer option must be None and
                          auto_add_annotations must be false when legacy mode
                          is enabled. You set featurizer to {featurizer} and
                          auto_add_annotations to {auto_add_annotations}.
                          """)

    # Plain attribute assignments; user_specified_features stays None
    # unless a UserDefinedFeaturizer is supplied below.
    self.auto_add_annotations = auto_add_annotations
    self.legacy = legacy
    self.user_specified_features = None

    # Resolve the effective featurizer.
    if featurizer is None:
      # Default featurizer for non-legacy use.
      featurizer = OneHotFeaturizer(
          charset=["A", "C", "T", "G"], max_length=None)
    elif isinstance(featurizer, UserDefinedFeaturizer):
      # User-defined featurizers expose the fields they produce.
      self.user_specified_features = featurizer.feature_fields

    self.featurizer = featurizer

  def create_dataset(self,
                     input_files: OneOrMany[str],
@@ -885,8 +947,7 @@ class FASTALoader(DataLoader):
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and only allows for one-hot
    featurization, and doesn't allow for sharding.
    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
@@ -907,13 +968,59 @@ class FASTALoader(DataLoader):
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():
    def shard_generator():  # TODO Enable sharding with shard size parameter
      for input_file in input_files:
        if self.legacy:
          X = encode_bio_sequence(input_file)
        else:
          sequences = _read_file(input_file)
          X = self.featurizer(sequences)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    def _read_file(input_file: str, auto_add_annotations: bool = False):
      """
      Convert the FASTA file to a numpy array of FASTA-format strings.
      """

      # TODO don't convert all sequences into np array (allow shards)
      def _generate_sequences(fasta_file, header_mark=">") -> np.array:
        """
        Uses a fasta_file to create a numpy array of annotated FASTA-format strings
        """
        sequences = np.array([])
        sequence = np.array([])
        header_read = False
        for line in fasta_file:
          # Check if line is a header
          if line.startswith(header_mark):  # New header line
            header_read = True
            sequences = _add_sequence(sequences, sequence)
            sequence = np.array([])
          elif header_read:  # Line contains sequence in FASTA format
            if line[-1:] == '\n':  # Check last character in string
              line = line[0:-1]  # Remove last character
            sequence = np.append(sequence, line)
        sequences = _add_sequence(sequences, sequence)  # Add last sequence
        return sequences

      def _add_sequence(sequences: np.array, sequence: np.array) -> np.array:
        # Handle empty sequence
        if sequence is None or len(sequence) <= 0:
          # TODO log attempts to add empty sequences every shard
          return np.array([])
        # Annotate start/stop of sequence
        if auto_add_annotations:
          sequence = np.insert(sequence, 0, "[CLS]")
          sequence = np.append(sequence, "[SEP]")
        new_sequence = ''.join(sequence)
        new_sequences = np.append(sequences, new_sequence)
        return new_sequences

      with open(input_file, 'r') as f:  # Read FASTA file
        return _generate_sequences(f)

    return DiskDataset.create_dataset(shard_generator(), data_dir)


Loading