Unverified Commit 5bde6973 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2351 from deepchem/sdf_fix

Turning on SDF sanitization for load_qm7/8/9
parents efa029c9 946c2e1e
Loading
Loading
Loading
Loading
+13 −1
Original line number Diff line number Diff line
@@ -22,7 +22,8 @@ class _QM7Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB7_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb7.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -81,6 +82,17 @@ def load_qm7(
  save_dir: str
    a directory to save the dataset in

  Note
  ----
  DeepChem 2.4.0 has turned on sanitization for this dataset by
  default.  For the QM7 dataset, this means that calling this
  function will return 6838 compounds instead of 7160 in the source
  dataset file.  This appears to be due to valence specification
  mismatches in the dataset that weren't caught in earlier, more lax
  versions of RDKit.  Note that this may subtly affect benchmarking
  results on this dataset.

  References
  ----------
  .. [1] Rupp, Matthias, et al. "Fast and accurate modeling of molecular
+12 −1
Original line number Diff line number Diff line
@@ -24,7 +24,8 @@ class _QM8Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB8_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb8.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -90,6 +91,16 @@ def load_qm8(
  save_dir: str
    a directory to save the dataset in

  Note
  ----
  DeepChem 2.4.0 has turned on sanitization for this dataset by
  default.  For the QM8 dataset, this means that calling this
  function will return 21747 compounds instead of 21786 in the source
  dataset file.  This appears to be due to valence specification
  mismatches in the dataset that weren't caught in earlier, more lax
  versions of RDKit.  Note that this may subtly affect benchmarking
  results on this dataset.

  References
  ----------
  .. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike
+16 −5
Original line number Diff line number Diff line
@@ -23,7 +23,8 @@ class _QM9Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB9_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb9.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -39,10 +40,10 @@ def load_qm9(
  """Load QM9 dataset

  QM9 is a comprehensive dataset that provides geometric, energetic,
  electronic and thermodynamic properties for a subset of GDB-17 database,
  comprising 134 thousand stable organic molecules with up to 9 heavy atoms.
  All molecules are modeled using density functional theory
  (B3LYP/6-31G(2df,p) based DFT).
  electronic and thermodynamic properties for a subset of the GDB-17
  database, comprising 134 thousand stable organic molecules with up
  to 9 heavy atoms.  All molecules are modeled using density
  functional theory (B3LYP/6-31G(2df,p) based DFT).

  Random splitting is recommended for this dataset.

@@ -99,6 +100,16 @@ def load_qm9(
  save_dir: str
    a directory to save the dataset in

  Note
  ----
  DeepChem 2.4.0 has turned on sanitization for this dataset by
  default.  For the QM9 dataset, this means that calling this
  function will return 132480 compounds instead of 133885 in the
  source dataset file. This appears to be due to valence
  specification mismatches in the dataset that weren't caught in
  earlier, more lax versions of RDKit. Note that this may subtly
  affect benchmarking results on this dataset.

  References
  ----------
  .. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small