Commit 6aaaee3e authored by Bharath Ramsundar
Browse files

More targeted sanitization

parent b53e9cab
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -714,7 +714,7 @@ class SDFLoader(DataLoader):
  def __init__(self,
               tasks: List[str],
               featurizer: Featurizer,
               sanitize: bool = True,
               sanitize: bool = False,
               log_every_n: int = 1000):
    """Initialize SDF Loader

+12 −1
Original line number Diff line number Diff line
@@ -22,7 +22,8 @@ class _QM7Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB7_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb7.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -81,6 +82,16 @@ def load_qm7(
  save_dir: str
    a directory to save the dataset in

  Note
  ----
  As of DeepChem 2.4.0, this loader sanitizes the molecules it reads
  from the SDF file. For the QM7 dataset, this means that calling this
  function will return 6838 compounds instead of the 7160 in the source
  dataset file. This appears to be due to valence specification
  mismatches in the dataset that weren't caught in earlier, more lax
  versions of RDKit. Note that this may subtly affect benchmarking
  results on this dataset.

  References
  ----------
  .. [1] Rupp, Matthias, et al. "Fast and accurate modeling of molecular
+12 −1
Original line number Diff line number Diff line
@@ -24,7 +24,8 @@ class _QM8Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB8_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb8.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -90,6 +91,16 @@ def load_qm8(
  save_dir: str
    a directory to save the dataset in

  Note
  ----
  As of DeepChem 2.4.0, this loader sanitizes the molecules it reads
  from the SDF file. For the QM8 dataset, this means that calling this
  function will return 21747 compounds instead of the 21786 in the
  source dataset file. This appears to be due to valence specification
  mismatches in the dataset that weren't caught in earlier, more lax
  versions of RDKit. Note that this may subtly affect benchmarking
  results on this dataset.

  References
  ----------
  .. [1] Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike
+7 −6
Original line number Diff line number Diff line
@@ -23,7 +23,8 @@ class _QM9Loader(_MolnetLoader):
      dc.utils.data_utils.download_url(url=GDB9_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(
          os.path.join(self.data_dir, "gdb9.tar.gz"), self.data_dir)
    loader = dc.data.SDFLoader(tasks=self.tasks, featurizer=self.featurizer)
    loader = dc.data.SDFLoader(
        tasks=self.tasks, featurizer=self.featurizer, sanitize=True)
    return loader.create_dataset(dataset_file, shard_size=8192)


@@ -103,11 +104,11 @@ def load_qm9(
  ----
  As of DeepChem 2.4.0, this loader sanitizes the molecules it reads
  from the SDF file.
  For the QM9 dataset, this means that calling this function will
  return a list of 132480 compounds instead of 133885 in the source
  dataset file. This appears to be due to valence specification
  mismatches in the dataset that weren't caught in earlier more lax
  versions of RDKit. Note that this may subtly affect benchmarking
  results on this dataset.
  return 132480 compounds instead of the 133885 in the source dataset
  file. This appears to be due to valence specification mismatches in
  the dataset that weren't caught in earlier, more lax versions of
  RDKit. Note that this may subtly affect benchmarking results on
  this dataset.

  References
  ----------