Commit 24acca18 authored by VIGNESHinZONE's avatar VIGNESHinZONE
Browse files

Merge branch 'master' into jax

parents 4a94792e 692a2ed7
Loading
Loading
Loading
Loading
+3 −4
Original line number Diff line number Diff line
@@ -29,7 +29,6 @@ jobs:
    - name: Build DeepChem
      run: |
        python -m pip install --upgrade pip
        pip install tensorflow==2.4
        pip install -e '.[jax]'
    - name: Import checking
      run: python -c "import deepchem; import jax;"
@@ -39,7 +38,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        os: [ubuntu-latest]
        python-version: [3.7]
        include:
          - os: ubuntu-latest
@@ -100,9 +99,9 @@ jobs:
        python -m pip install --upgrade pip;
        pip install conda-merge;
        if [ "$(uname)" == 'Darwin' ]; then
          conda-merge env_jax.yml env.test.yml > env.yml
          conda-merge requirements/jax/env_jax.yml env.test.yml > env.yml
        else
          conda-merge env_jax.yml env.test.yml > env.yml
          conda-merge requirements/jax/env_jax.yml env.test.yml > env.yml
        fi;
    - name: Install all dependencies
      uses: conda-incubator/setup-miniconda@v2
+1 −1
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ jobs:
    - name: PyTest
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: pytest -v -m "not slow and not jax" --cov=deepchem --cov-report=xml deepchem
      run: pytest -v -m "not slow and not jax and not torch" --cov=deepchem --cov-report=xml deepchem
    - name: Upload coverage to Codecov
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      uses: codecov/codecov-action@v1
+136 −0
Original line number Diff line number Diff line
# CI workflow: build DeepChem with its PyTorch extras and run the torch test suite.
name: Test for DeepChem Torch
on:
  push: # ci work when pushing master branch
    branches:
      - master
  pull_request: # ci work when creating a PR to master branch
    branches:
      - master
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Versions are quoted so YAML keeps them as strings
        # (unquoted 3.10 would parse as the float 3.1).
        python-version: ["3.7"]
    steps:
    - uses: actions/checkout@v2
    - name: Cache pip modules for Linux
      uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Build DeepChem
      run: |
        python -m pip install --upgrade pip
        pip install -e '.[torch]'
    - name: Import checking
      run: python -c "import deepchem; import torch;"

  test:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        python-version: ["3.7"]
        include:
          - os: ubuntu-latest
            python-version: "3.8"
    env:
      OS: ${{ matrix.os }}
      PYTHON_VERSION: ${{ matrix.python-version }}
    steps:
    - uses: actions/checkout@v2
      with:
        fetch-depth: 0
    # https://github.com/galaxyproject/tools-iuc/blob/master/.github/workflows/pr.yaml
    # The range of commits to check for changes is:
    # - for events on the master branch we compare against the sha before the event
    #   (note that this does not work for feature branch events since we want all
    #   commits on the feature branch and not just the commits of the last event)
    # - for pull requests we compare against the 1st ancestor, given the current
    #   HEAD is the merge between the PR branch and the base branch
    - name: Set commit range (push to the master branch, e.g. merge)
      if: github.ref == 'refs/heads/master' && github.event_name == 'push'
      run: echo "COMMIT_RANGE=${{ github.event.before }}.." >> $GITHUB_ENV
    - name: Set commit range (pull request)
      if: github.event_name == 'pull_request'
      run: |
        git fetch origin master
        echo "COMMIT_RANGE=origin/master..." >> $GITHUB_ENV
    # All three cache steps use runner.os in both key and restore-keys so the
    # restore-keys prefix can actually match the primary key (matrix.os values
    # like "windows-latest" never share a prefix with runner.os "Windows").
    - name: Cache pip packages for Linux
      if: runner.os == 'Linux'
      uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Cache pip packages for MacOS
      if: runner.os == 'macOS'
      uses: actions/cache@v2
      with:
        path: ~/Library/Caches/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Cache pip packages for Windows
      if: runner.os == 'Windows'
      uses: actions/cache@v2
      with:
        path: ~\AppData\Local\pip\Cache
        key: ${{ runner.os }}-pip-${{ hashFiles('env.*.yml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    # Merge the torch conda requirements with the shared test requirements.
    # The same merge is valid on every OS, so no platform branch is needed.
    - name: Create env.yml
      shell: bash
      run: |
        python -m pip install --upgrade pip;
        pip install conda-merge;
        conda-merge requirements/torch/env_torch.yml requirements/torch/env_torch.cpu.yml env.test.yml > env.yml
    - name: Install all dependencies
      uses: conda-incubator/setup-miniconda@v2
      with:
        miniconda-version: "latest"
        auto-update-conda: true
        activate-environment: deepchem
        channels: omnia,conda-forge,defaults
        python-version: ${{ matrix.python-version }}
        environment-file: env.yml
    - name: Install DeepChem
      id: install
      shell: bash -l {0}
      run: pip install -e '.[torch]'
    - name: Yapf (version 0.22.0)
      id: yapf
      # on Windows, yapf raise the strange error..., so ignore
      if: runner.os == 'Linux' || runner.os == 'macOS'
      shell: bash -l {0}
      run: |
        CHANGED_FILES=`git diff --name-only $COMMIT_RANGE | grep .py$ | grep -v contrib/ || true`
        if [ -n "$CHANGED_FILES" ]; then
          yapf -d $CHANGED_FILES
        fi
    - name: Flake8
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: source scripts/flake8_for_ci.sh
    - name: PyTest
      if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
      shell: bash -l {0}
      run: pytest -v -m torch deepchem
+3 −0
Original line number Diff line number Diff line
@@ -67,6 +67,9 @@ target/
# Vim swap
*.swp

# Weights & Biases
wandb/

# Dataset files
datasets/2008-2011_USPTO_reactionSmiles_filtered.zip
datasets/2008-2011_USPTO_reactionSmiles_filtered/
+115 −8
Original line number Diff line number Diff line
@@ -14,9 +14,10 @@ import numpy as np

from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.utils.genomics_utils import encode_bio_sequence

logger = logging.getLogger(__name__)

@@ -875,9 +876,70 @@ class FASTALoader(DataLoader):
  learning tasks.
  """

  def __init__(self):
    """Initialize loader."""
    pass
  def __init__(self,
               featurizer: Optional[Featurizer] = None,
               auto_add_annotations: bool = False,
               legacy: bool = True):
    """Initialize a FASTALoader.

    Parameters
    ----------
    featurizer: Featurizer (default None)
      Featurizer applied to the loaded FASTA data.

      When ``featurizer`` is None and ``legacy`` is True, the original
      featurization logic is used, producing a one hot encoding of all
      included FASTA strings of shape
      (number of FASTA sequences, number of channels + 1, sequence length, 1).

      When ``featurizer`` is None and ``legacy`` is False, a default
      OneHotFeaturizer with charset ("A", "C", "T", "G") and
      max_length = None is constructed.
    auto_add_annotations: bool (default False)
      If True, create_dataset automatically adds [CLS] and [SEP]
      annotations to the sequences it reads to assist tokenization.
      Keep False if the FASTA file already carries these annotations.
    legacy: bool (default True)
      If True, use the legacy featurization path described above.
      Legacy mode is only tested for ACTGN charsets and is deprecated.

    Raises
    ------
    ValueError
      If ``legacy`` is True while ``featurizer`` is not None or
      ``auto_add_annotations`` is True (legacy mode supports neither).
    """
    # Legacy mode: emit the deprecation warning and reject options that
    # the legacy featurization path cannot honor.
    if legacy:
      warnings.warn(
          """
                    Legacy mode is deprecated and will be removed in
                    DeepChem 3.0. Disable legacy mode by passing legacy=False
                    during construction of FASTALoader object.
                    """, FutureWarning)
      if featurizer is not None or auto_add_annotations:
        raise ValueError(f"""
                          featurizer option must be None and
                          auto_add_annotations must be false when legacy mode
                          is enabled. You set featurizer to {featurizer} and
                          auto_add_annotations to {auto_add_annotations}.
                          """)

    # Plain attribute assignments; user_specified_features stays None
    # unless a UserDefinedFeaturizer is supplied below.
    self.auto_add_annotations = auto_add_annotations
    self.legacy = legacy
    self.user_specified_features = None

    # Resolve the effective featurizer.
    if featurizer is None:
      # Default featurizer for non-legacy use.
      featurizer = OneHotFeaturizer(
          charset=["A", "C", "T", "G"], max_length=None)
    elif isinstance(featurizer, UserDefinedFeaturizer):
      # User-defined featurizers expose the fields they produce.
      self.user_specified_features = featurizer.feature_fields

    self.featurizer = featurizer

  def create_dataset(self,
                     input_files: OneOrMany[str],
@@ -885,8 +947,7 @@ class FASTALoader(DataLoader):
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and only allows for one-hot
    featurization, and doesn't allow for sharding.
    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
@@ -907,13 +968,59 @@ class FASTALoader(DataLoader):
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():
    def shard_generator():  # TODO Enable sharding with shard size parameter
      for input_file in input_files:
        if self.legacy:
          X = encode_bio_sequence(input_file)
        else:
          sequences = _read_file(input_file)
          X = self.featurizer(sequences)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    def _read_file(input_file: str, auto_add_annotations: bool = False):
      """
      Convert the FASTA file to a numpy array of FASTA-format strings.
      """

      # TODO don't convert all sequences into np array (allow shards)
      def _generate_sequences(fasta_file, header_mark=">") -> np.array:
        """
        Uses a fasta_file to create a numpy array of annotated FASTA-format strings
        """
        sequences = np.array([])
        sequence = np.array([])
        header_read = False
        for line in fasta_file:
          # Check if line is a header
          if line.startswith(header_mark):  # New header line
            header_read = True
            sequences = _add_sequence(sequences, sequence)
            sequence = np.array([])
          elif header_read:  # Line contains sequence in FASTA format
            if line[-1:] == '\n':  # Check last character in string
              line = line[0:-1]  # Remove last character
            sequence = np.append(sequence, line)
        sequences = _add_sequence(sequences, sequence)  # Add last sequence
        return sequences

      def _add_sequence(sequences: np.array, sequence: np.array) -> np.array:
        # Handle empty sequence
        if sequence is None or len(sequence) <= 0:
          # TODO log attempts to add empty sequences every shard
          return np.array([])
        # Annotate start/stop of sequence
        if auto_add_annotations:
          sequence = np.insert(sequence, 0, "[CLS]")
          sequence = np.append(sequence, "[SEP]")
        new_sequence = ''.join(sequence)
        new_sequences = np.append(sequences, new_sequence)
        return new_sequences

      with open(input_file, 'r') as f:  # Read FASTA file
        return _generate_sequences(f)

    return DiskDataset.create_dataset(shard_generator(), data_dir)


Loading