Merge pull request #1 from deepchem/master (e422b9b8) · Commits · 钟慕尧 / deepchem

.github/workflows/main.yml

+10 −3

Original line number	Diff line number	Diff line
		@@ -117,9 +117,11 @@ jobs:
		python-version: ${{ matrix.python-version }}
		environment-file: env.yml
		- name: Install DeepChem
		id: install
		shell: bash -l {0}
		run: pip install -e .
		- name: Yapf (version 0.22.0)
		id: yapf
		# yapf raises a strange error on Windows, so this step only runs on Linux and macOS
		if: runner.os == 'Linux' \|\| runner.os == 'macOS'
		shell: bash -l {0}
		@@ -129,18 +131,23 @@ jobs:
		yapf -d $CHANGED_FILES
		fi
		- name: Flake8
		if: ${{ (success() \|\| failure()) && (steps.install.outcome == 'failure' \|\| steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: source scripts/flake8_for_ci.sh
		- name: Mypy
		if: ${{ (success() \|\| failure()) && (steps.install.outcome == 'failure' \|\| steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: mypy -p deepchem
		- name: Doctest
		if: ${{ (success() \|\| failure()) && (steps.install.outcome == 'failure' \|\| steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: DGLBACKEND=pytorch pytest -v --ignore-glob='deepchem/*/test.py' --doctest-modules deepchem
		- name: PyTest
		if: ${{ (success() \|\| failure()) && (steps.install.outcome == 'failure' \|\| steps.install.outcome == 'success') }}
		shell: bash -l {0}
		run: pytest -v -m "not slow" --cov=deepchem --cov-report=xml deepchem
		- name: Upload coverage to Codecov
		if: ${{ (success() \|\| failure()) && (steps.install.outcome == 'failure' \|\| steps.install.outcome == 'success') }}
		uses: codecov/codecov-action@v1
		with:
		file: ./coverage.xml

README.md

+3 −3

Original line number	Diff line number	Diff line
		@@ -52,10 +52,10 @@ Please check [the document](https://deepchem.readthedocs.io/en/latest/requiremen

		### Stable version

		Please install tensorflow v2.3.* before installing deepchem.
		Please install tensorflow ~2.4 before installing deepchem.

		```bash
		pip install tensorflow==2.3.*
		pip install tensorflow~=2.4
		```

		Then, install deepchem via pip or conda.
		@@ -80,7 +80,7 @@ conda install -y -c conda-forge rdkit
		The nightly version is built from the HEAD of DeepChem.

		```bash
		pip install tensorflow==2.3.*
		pip install tensorflow~=2.4
		pip install --pre deepchem
		```

deepchem/__init__.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@ Imports all submodules
		"""

		# If you push the tag, please remove `.dev`
		__version__ = '2.5.0.dev'
		__version__ = '2.6.0.dev'

		import deepchem.data
		import deepchem.feat

deepchem/data/data_loader.py

+77 −0

Original line number	Diff line number	Diff line
		@@ -738,6 +738,83 @@ class SDFLoader(DataLoader):
		self.id_field = "smiles"
		self.log_every_n = log_every_n

		def create_dataset(self,
		                   inputs: OneOrMany[Any],
		                   data_dir: Optional[str] = None,
		                   shard_size: Optional[int] = 8192) -> Dataset:
		    """Creates and returns a `Dataset` object by featurizing provided sdf files.

		    Parameters
		    ----------
		    inputs: List
		      List of input files to process. A single (non-list) input is wrapped
		      into a one-element list. Each entry must be a path ending in `.sdf`,
		      or a `.zip` archive; only the `.sdf` members of an archive are used.
		      The extension check is case-insensitive.
		    data_dir: str, optional (default None)
		      Directory to store featurized dataset.
		    shard_size: int, optional (default 8192)
		      Number of examples stored in each shard.

		    Returns
		    -------
		    DiskDataset
		      A `DiskDataset` object containing a featurized representation of data
		      from `inputs`.

		    Raises
		    ------
		    ValueError
		      If an entry of `inputs` is neither a `.sdf` file nor a `.zip` archive.
		    """
		    logger.info("Loading raw samples now.")
		    logger.info("shard_size: %s", shard_size)

		    # Special case handling of single input
		    if not isinstance(inputs, list):
		        inputs = [inputs]

		    processed_files = []
		    for input_file in inputs:
		        extension = os.path.splitext(input_file)[1].lower()
		        if extension == ".sdf":
		            processed_files.append(input_file)
		        elif extension == ".zip":
		            # NOTE(review): the extraction directory is never removed here;
		            # the extracted files are read later by the shard generator, so
		            # any cleanup would have to happen after the dataset is built.
		            zip_dir = tempfile.mkdtemp()
		            # The context manager closes the archive even if extraction
		            # fails, and the member list is read while it is still open.
		            with zipfile.ZipFile(input_file, 'r') as zip_ref:
		                zip_ref.extractall(path=zip_dir)
		                member_names = zip_ref.namelist()
		            for member_name in member_names:
		                member_path = os.path.join(zip_dir, member_name)
		                member_extension = os.path.splitext(member_path)[1].lower()
		                # Non-sdf members (including directory entries) are skipped.
		                if member_extension == ".sdf":
		                    processed_files.append(member_path)
		        else:
		            raise ValueError("Unsupported file format")

		    inputs = processed_files

		    def shard_generator():
		        """Yields one featurized `(X, y, w, ids)` tuple per shard."""
		        for shard_num, shard in enumerate(
		                self._get_shards(inputs, shard_size)):
		            time1 = time.time()
		            X, valid_inds = self._featurize_shard(shard)
		            # Keep only the ids of examples selected by `valid_inds`.
		            ids = shard[self.id_field].values
		            ids = ids[valid_inds]
		            if len(self.tasks) > 0:
		                # Featurize task results iff they exist.
		                y, w = _convert_df_to_numpy(shard, self.tasks)
		                # Filter out examples where featurization failed.
		                y, w = (y[valid_inds], w[valid_inds])
		                assert len(X) == len(ids) == len(y) == len(w)
		            else:
		                # For prospective data where results are unknown, it
		                # makes no sense to have y values or weights.
		                y, w = (None, None)
		                assert len(X) == len(ids)

		            time2 = time.time()
		            logger.info("TIMING: featurizing shard %d took %0.3f s",
		                        shard_num, time2 - time1)
		            yield X, y, w, ids

		    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		def _get_shards(self, input_files: List[str],
		shard_size: Optional[int]) -> Iterator[pd.DataFrame]:
		"""Defines a generator which returns data for each shard

deepchem/data/tests/multiple_sdf.zip

0 → 100644

+3.01 KiB

File added.

No diff preview for this file type.

View file

Admin message