Merge pull request #2351 from deepchem/sdf_fix (5bde6973) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/qm7_datasets.py

+13 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,8 @@ class _QM7Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB7_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb7.tar.gz"), self.data_dir)
		loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		loader = dc.data.SDFLoader(
		tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -81,6 +82,17 @@ def load_qm7(
		save_dir: str
		a directory to save the dataset in

		Note
		----
		DeepChem 2.4.0 has turned on sanitization for this dataset by
		default. For the QM7 dataset, this means that calling this
		function will return 6838 compounds instead of 7160 in the source
		dataset file. This appears to be due to valence specification
		mismatches in the dataset that weren't caught in earlier, more lax
		versions of RDKit. Note that this may subtly affect benchmarking
		results on this dataset.

		References
		----------
		.. [1] Rupp, Matthias, et al. "Fast and accurate modeling of molecular

deepchem/molnet/load_function/qm8_datasets.py

+12 −1

Original line number	Diff line number	Diff line
		@@ -24,7 +24,8 @@ class _QM8Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB8_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb8.tar.gz"), self.data_dir)
		loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		loader = dc.data.SDFLoader(
		tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -90,6 +91,16 @@ def load_qm8(
		save_dir: str
		a directory to save the dataset in

		Note
		----
		DeepChem 2.4.0 has turned on sanitization for this dataset by
		default. For the QM8 dataset, this means that calling this
		function will return 21747 compounds instead of 21786 in the source
		dataset file. This appears to be due to valence specification
		mismatches in the dataset that weren't caught in earlier, more lax
		versions of RDKit. Note that this may subtly affect benchmarking
		results on this dataset.

		References
		----------
		.. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike

deepchem/molnet/load_function/qm9_datasets.py

+16 −5

Original line number	Diff line number	Diff line
		@@ -23,7 +23,8 @@ class _QM9Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB9_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb9.tar.gz"), self.data_dir)
		loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		loader = dc.data.SDFLoader(
		tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -39,10 +40,10 @@ def load_qm9(
		"""Load QM9 dataset

		QM9 is a comprehensive dataset that provides geometric, energetic,
		electronic and thermodynamic properties for a subset of GDB-17 database,
		comprising 134 thousand stable organic molecules with up to 9 heavy atoms.
		All molecules are modeled using density functional theory
		(B3LYP/6-31G(2df,p) based DFT).
		electronic and thermodynamic properties for a subset of GDB-17
		database, comprising 134 thousand stable organic molecules with up
		to 9 heavy atoms. All molecules are modeled using density
		functional theory (B3LYP/6-31G(2df,p) based DFT).

		Random splitting is recommended for this dataset.

		@@ -99,6 +100,16 @@ def load_qm9(
		save_dir: str
		a directory to save the dataset in

		Note
		----
		DeepChem 2.4.0 has turned on sanitization for this dataset by
		default. For the QM9 dataset, this means that calling this
		function will return 132480 compounds instead of 133885 in the
		source dataset file. This appears to be due to valence
		specification mismatches in the dataset that weren't caught in
		earlier, more lax versions of RDKit. Note that this may subtly
		affect benchmarking results on this dataset.

		References
		----------
		.. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small

Admin message