Merge branch 'deepchem:master' into usptotok (7ae5640f) · Commits · 钟慕尧 / deepchem

.github/workflows/main.yml

+5 −5

Original line number	Diff line number	Diff line
		@@ -29,7 +29,6 @@ jobs:
		- name: Build DeepChem
		run: |
		python -m pip install --upgrade pip
		pip install tensorflow'>=2.3,<2.4'
		pip install -e .
		- name: Import checking
		run: python -c "import deepchem"
		@@ -142,10 +141,11 @@ jobs:
		if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: DGLBACKEND=pytorch pytest -v --ignore-glob='deepchem/*/test.py' --doctest-modules deepchem
		- name: PyTest
		if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: pytest -v -m "not slow and not jax and not torch and not tensorflow" --cov=deepchem --cov-report=xml deepchem
		# These tests are handled by new CI runs
		#- name: PyTest
		# if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
		# shell: bash -l {0}
		# run: pytest -v -m "not slow and not jax and not torch and not tensorflow" --cov=deepchem --cov-report=xml deepchem
		- name: Upload coverage to Codecov
		if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
		uses: codecov/codecov-action@v1

deepchem/feat/__init__.py

+7 −1

Original line number	Diff line number	Diff line
		@@ -64,14 +64,20 @@ from deepchem.feat.material_featurizers import LCNNFeaturizer
		from deepchem.feat.atomic_conformation import AtomicConformation
		from deepchem.feat.atomic_conformation import AtomicConformationFeaturizer

		# tokenizers
		try:
		import transformers
		from transformers import BertTokenizer

		from deepchem.feat.smiles_tokenizer import SmilesTokenizer
		from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
		except ModuleNotFoundError:
		pass

		try:
		from transformers import RobertaTokenizerFast
		from deepchem.feat.roberta_tokenizer import RobertaFeaturizer
		except ModuleNotFoundError:
		pass

		# support classes
		from deepchem.feat.molecule_featurizers import GraphMatrix

deepchem/feat/atomic_conformation.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -149,7 +149,7 @@ class AtomicConformationFeaturizer(Featurizer):

		"""

		def _featurize(self, datapoint: str) -> AtomicConformation:
		def _featurize(self, datapoint: str, **kwargs) -> AtomicConformation:
		"""Calculate features for a single datapoint.

		Parameters

deepchem/feat/base_classes.py

+78 −38

Original line number	Diff line number	Diff line
		@@ -4,7 +4,7 @@ Feature calculations.
		import inspect
		import logging
		import numpy as np
		from typing import Any, Dict, Iterable, Tuple, Union, cast
		from typing import Any, Dict, Iterable, Optional, Tuple, Union, cast

		from deepchem.utils import get_print_threshold
		from deepchem.utils.typing import PymatgenStructure
		@@ -23,8 +23,10 @@ class Featurizer(object):
		new datatype.
		"""

		def featurize(self, datapoints: Iterable[Any],
		log_every_n: int = 1000) -> np.ndarray:
		def featurize(self,
		datapoints: Iterable[Any],
		log_every_n: int = 1000,
		**kwargs) -> np.ndarray:
		"""Calculate features for datapoints.

		Parameters
		@@ -47,7 +49,7 @@ class Featurizer(object):
		if i % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % i)
		try:
		features.append(self._featurize(point))
		features.append(self._featurize(point, **kwargs))
		except:
		logger.warning(
		"Failed to featurize datapoint %d. Appending empty array")
		@@ -55,17 +57,19 @@ class Featurizer(object):

		return np.asarray(features)

		def __call__(self, datapoints: Iterable[Any]):
		def __call__(self, datapoints: Iterable[Any], **kwargs):
		"""Calculate features for datapoints.

		`**kwargs` will get passed directly to `Featurizer.featurize`

		Parameters
		----------
		datapoints: Iterable[Any]
		Any blob of data you like. Subclasss should instantiate this.
		"""
		return self.featurize(datapoints)
		return self.featurize(datapoints, **kwargs)

		def _featurize(self, datapoint: Any):
		def _featurize(self, datapoint: Any, **kwargs):
		"""Calculate features for a single datapoint.

		Parameters
		@@ -154,14 +158,15 @@ class ComplexFeaturizer(Featurizer):
		"""

		def featurize(self,
		complexes: Iterable[Tuple[str, str]],
		log_every_n: int = 100) -> np.ndarray:
		datapoints: Optional[Iterable[Tuple[str, str]]] = None,
		log_every_n: int = 100,
		**kwargs) -> np.ndarray:
		"""
		Calculate features for mol/protein complexes.

		Parameters
		----------
		complexes: Iterable[Tuple[str, str]]
		datapoints: Iterable[Tuple[str, str]]
		List of filenames (PDB, SDF, etc.) for ligand molecules and proteins.
		Each element should be a tuple of the form (ligand_filename,
		protein_filename).
		@@ -172,14 +177,19 @@ class ComplexFeaturizer(Featurizer):
		Array of features
		"""

		if not isinstance(complexes, Iterable):
		complexes = [cast(Tuple[str, str], complexes)]
		if 'complexes' in kwargs:
		datapoints = kwargs.get("complexes")
		raise DeprecationWarning(
		'Complexes is being phased out as a parameter, please pass "datapoints" instead.'
		)
		if not isinstance(datapoints, Iterable):
		datapoints = [cast(Tuple[str, str], datapoints)]
		features, failures, successes = [], [], []
		for idx, point in enumerate(complexes):
		for idx, point in enumerate(datapoints):
		if idx % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % idx)
		try:
		features.append(self._featurize(point))
		features.append(self._featurize(point, **kwargs))
		successes.append(idx)
		except:
		logger.warning(
		@@ -202,7 +212,7 @@ class ComplexFeaturizer(Featurizer):

		return np.asarray(features)

		def _featurize(self, complex: Tuple[str, str]):
		def _featurize(self, datapoint: Optional[Tuple[str, str]] = None, **kwargs):
		"""
		Calculate features for single mol/protein complex.

		@@ -232,12 +242,12 @@ class MolecularFeaturizer(Featurizer):
		The subclasses of this class require RDKit to be installed.
		"""

		def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
		def featurize(self, datapoints, log_every_n=1000, **kwargs) -> np.ndarray:
		"""Calculate features for molecules.

		Parameters
		----------
		molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
		datapoints: rdkit.Chem.rdchem.Mol / SMILES string / iterable
		RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
		strings.
		log_every_n: int, default 1000
		@@ -256,15 +266,21 @@ class MolecularFeaturizer(Featurizer):
		except ModuleNotFoundError:
		raise ImportError("This class requires RDKit to be installed.")

		if 'molecules' in kwargs:
		datapoints = kwargs.get("molecules")
		raise DeprecationWarning(
		'Molecules is being phased out as a parameter, please pass "datapoints" instead.'
		)

		# Special case handling of single molecule
		if isinstance(molecules, str) or isinstance(molecules, Mol):
		molecules = [molecules]
		if isinstance(datapoints, str) or isinstance(datapoints, Mol):
		datapoints = [datapoints]
		else:
		# Convert iterables to list
		molecules = list(molecules)
		datapoints = list(datapoints)

		features = []
		for i, mol in enumerate(molecules):
		features: list = []
		for i, mol in enumerate(datapoints):
		if i % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % i)

		@@ -276,7 +292,7 @@ class MolecularFeaturizer(Featurizer):
		new_order = rdmolfiles.CanonicalRankAtoms(mol)
		mol = rdmolops.RenumberAtoms(mol, new_order)

		features.append(self._featurize(mol))
		features.append(self._featurize(mol, **kwargs))
		except Exception as e:
		if isinstance(mol, Chem.rdchem.Mol):
		mol = Chem.MolToSmiles(mol)
		@@ -313,13 +329,15 @@ class MaterialStructureFeaturizer(Featurizer):
		"""

		def featurize(self,
		structures: Iterable[Union[Dict[str, Any], PymatgenStructure]],
		log_every_n: int = 1000) -> np.ndarray:
		datapoints: Optional[Iterable[Union[Dict[str, Any],
		PymatgenStructure]]] = None,
		log_every_n: int = 1000,
		**kwargs) -> np.ndarray:
		"""Calculate features for crystal structures.

		Parameters
		----------
		structures: Iterable[Union[Dict, pymatgen.core.Structure]]
		datapoints: Iterable[Union[Dict, pymatgen.core.Structure]]
		Iterable sequence of pymatgen structure dictionaries
		or pymatgen.core.Structure. Please confirm the dictionary representations
		of pymatgen.core.Structure from https://pymatgen.org/pymatgen.core.structure.html.
		@@ -330,22 +348,31 @@ class MaterialStructureFeaturizer(Featurizer):
		-------
		features: np.ndarray
		A numpy array containing a featurized representation of
		`structures`.
		`datapoints`.
		"""
		try:
		from pymatgen.core import Structure
		except ModuleNotFoundError:
		raise ImportError("This class requires pymatgen to be installed.")

		structures = list(structures)
		if 'structures' in kwargs:
		datapoints = kwargs.get("structures")
		raise DeprecationWarning(
		'Structures is being phased out as a parameter, please pass "datapoints" instead.'
		)

		if not isinstance(datapoints, Iterable):
		datapoints = [cast(Union[Dict[str, Any], PymatgenStructure], datapoints)]

		datapoints = list(datapoints)
		features = []
		for idx, structure in enumerate(structures):
		for idx, structure in enumerate(datapoints):
		if idx % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % idx)
		try:
		if isinstance(structure, Dict):
		structure = Structure.from_dict(structure)
		features.append(self._featurize(structure))
		features.append(self._featurize(structure, **kwargs))
		except:
		logger.warning(
		"Failed to featurize datapoint %i. Appending empty array" % idx)
		@@ -377,13 +404,15 @@ class MaterialCompositionFeaturizer(Featurizer):
		installed.
		"""

		def featurize(self, compositions: Iterable[str],
		log_every_n: int = 1000) -> np.ndarray:
		def featurize(self,
		datapoints: Optional[Iterable[str]] = None,
		log_every_n: int = 1000,
		**kwargs) -> np.ndarray:
		"""Calculate features for crystal compositions.

		Parameters
		----------
		compositions: Iterable[str]
		datapoints: Iterable[str]
		Iterable sequence of composition strings, e.g. "MoS2".
		log_every_n: int, default 1000
		Logging messages reported every `log_every_n` samples.
		@@ -399,14 +428,23 @@ class MaterialCompositionFeaturizer(Featurizer):
		except ModuleNotFoundError:
		raise ImportError("This class requires pymatgen to be installed.")

		compositions = list(compositions)
		if 'compositions' in kwargs and datapoints is None:
		datapoints = kwargs.get("compositions")
		raise DeprecationWarning(
		'Compositions is being phased out as a parameter, please pass "datapoints" instead.'
		)

		if not isinstance(datapoints, Iterable):
		datapoints = [cast(str, datapoints)]

		datapoints = list(datapoints)
		features = []
		for idx, composition in enumerate(compositions):
		for idx, composition in enumerate(datapoints):
		if idx % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % idx)
		try:
		c = Composition(composition)
		features.append(self._featurize(c))
		features.append(self._featurize(c, **kwargs))
		except:
		logger.warning(
		"Failed to featurize datapoint %i. Appending empty array" % idx)
		@@ -442,8 +480,10 @@ class DummyFeaturizer(Featurizer):
		'FCC(c1cccc(Br)n1)N1CCOCC1']], dtype='<U55')
		"""

		def featurize(self, datapoints: Iterable[Any],
		log_every_n: int = 1000) -> np.ndarray:
		def featurize(self,
		datapoints: Iterable[Any],
		log_every_n: int = 1000,
		**kwargs) -> np.ndarray:
		"""Passes through dataset, and returns the datapoint.

		Parameters

deepchem/feat/complex_featurizers/complex_atomic_coordinates.py

+9 −5

Original line number	Diff line number	Diff line
		@@ -12,8 +12,6 @@ from deepchem.utils.data_utils import pad_array
		from deepchem.utils.rdkit_utils import MoleculeLoadException, get_xyz_from_mol, \
		load_molecule, merge_molecules_xyz, merge_molecules

		from typing import Tuple


		def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors,
		periodic_box_size):
		@@ -118,16 +116,22 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
		# Type of data created by this featurizer
		self.dtype = object

		def _featurize(self, complex: Tuple[str, str]):
		def _featurize(self, datapoint, **kwargs):
		"""
		Compute neighbor list for complex.

		Parameters
		----------
		complex: Tuple[str, str]
		datapoint: Tuple[str, str]
		Filenames for molecule and protein.
		"""
		mol_pdb_file, protein_pdb_file = complex
		if 'complex' in kwargs:
		datapoint = kwargs.get("complex")
		raise DeprecationWarning(
		'Complex is being phased out as a parameter, please pass "datapoint" instead.'
		)

		mol_pdb_file, protein_pdb_file = datapoint
		mol_coords, ob_mol = load_molecule(mol_pdb_file)
		protein_coords, protein_mol = load_molecule(protein_pdb_file)
		system_coords = merge_molecules_xyz([mol_coords, protein_coords])

Admin message