More targeted sanitization (6aaaee3e) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+1 −1

+12 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,8 @@ class _QM7Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB7_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb7.tar.gz"), self.data_dir)
		- loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		+ loader = dc.data.SDFLoader(
		+ tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -81,6 +82,16 @@ def load_qm7(
		save_dir: str
		a directory to save the dataset in

		+ Note
		+ ----
		+ DeepChem 2.4.0 has turned on sanitization for SDF files by default.
		+ For the QM7 dataset, this means that calling this function will
		+ return 6838 compounds instead of 7160 in the source dataset file.
		+ This appears to be due to valence specification mismatches in the
		+ dataset that weren't caught in earlier more lax versions of RDKit.
		+ Note that this may subtly affect benchmarking results on this
		+ dataset.

		References
		----------
		.. [1] Rupp, Matthias, et al. "Fast and accurate modeling of molecular

+12 −1

Original line number	Diff line number	Diff line
		@@ -24,7 +24,8 @@ class _QM8Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB8_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb8.tar.gz"), self.data_dir)
		- loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		+ loader = dc.data.SDFLoader(
		+ tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -90,6 +91,16 @@ def load_qm8(
		save_dir: str
		a directory to save the dataset in

		+ Note
		+ ----
		+ DeepChem 2.4.0 has turned on sanitization for SDF files by default.
		+ For the QM8 dataset, this means that calling this function will
		+ return 21747 compounds instead of 21786 in the source dataset file.
		+ This appears to be due to valence specification mismatches in the
		+ dataset that weren't caught in earlier more lax versions of RDKit.
		+ Note that this may subtly affect benchmarking results on this
		+ dataset.

		References
		----------
		.. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike

+7 −6

Original line number	Diff line number	Diff line
		@@ -23,7 +23,8 @@ class _QM9Loader(_MolnetLoader):
		dc.utils.data_utils.download_url(url=GDB9_URL, dest_dir=self.data_dir)
		dc.utils.data_utils.untargz_file(
		os.path.join(self.data_dir, "gdb9.tar.gz"), self.data_dir)
		- loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
		+ loader = dc.data.SDFLoader(
		+ tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
		return loader.create_dataset(dataset_file, shard_size=8192)


		@@ -103,11 +104,11 @@ def load_qm9(
		----
		DeepChem 2.4.0 has turned on sanitization for SDF files by default.
		For the QM9 dataset, this means that calling this function will
		- return a list of 132480 compounds instead of 133885 in the source
		- dataset file. This appears to be due to valence specification
		- mismatches in the dataset that weren't caught in earlier more lax
		- versions of RDKit. Note that this may subtly affect benchmarking
		- results on this dataset.
		+ return 132480 compounds instead of 133885 in the source dataset
		+ file. This appears to be due to valence specification mismatches in
		+ the dataset that weren't caught in earlier more lax versions of
		+ RDKit. Note that this may subtly affect benchmarking results on
		+ this dataset.

		References
		----------