Unverified commit ceaadc1b, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #2068 from deepchem/sdf_yield

Changing sdf loader to yield batches at a time
parents e61c7df7 584bb183
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -707,7 +707,6 @@ class SDFLoader(DataLoader):
  >>> featurizer = dc.feat.CircularFingerprint(size=16)
  >>> loader = dc.data.SDFLoader(["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
  >>> dataset = loader.create_dataset(os.path.join(current_dir, "tests", "membrane_permeability.sdf")) # doctest:+ELLIPSIS
  Reading ...
  >>> len(dataset)
  2
  """
@@ -743,7 +742,11 @@ class SDFLoader(DataLoader):

  def _get_shards(self, input_files, shard_size):
    """Defines a generator which returns data for each shard

    Parameters
    ----------
    input_files
      Input SDF file(s); forwarded unchanged to `load_sdf_files`.
    shard_size
      Forwarded to `load_sdf_files` as `shard_size` (presumably the number
      of records per shard -- confirm against `load_sdf_files`).

    Returns
    -------
    Whatever `load_sdf_files` returns for these arguments.
    """
    # Exactly one return: `shard_size` has to reach `load_sdf_files`,
    # otherwise the caller's requested shard size is silently ignored.
    return load_sdf_files(
        input_files=input_files,
        clean_mols=self.sanitize,
        tasks=self.tasks,
        shard_size=shard_size)

  def _featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
+38 −0
Original line number Diff line number Diff line
@@ -19,3 +19,41 @@ def test_singleton_sdf_load():
      ["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
  dataset = loader.create_dataset(os.path.join(current_dir, "singleton.sdf"))
  assert len(dataset) == 1


def test_sharded_sdf_load():
  """Load one SDF file with shard_size=1; expect 2 shards and 2 samples."""
  here = os.path.dirname(os.path.realpath(__file__))
  sdf_path = os.path.join(here, "membrane_permeability.sdf")
  fingerprint = dc.feat.CircularFingerprint(size=16)
  sdf_loader = dc.data.SDFLoader(
      ["LogP(RRCK)"], featurizer=fingerprint, sanitize=True)
  dataset = sdf_loader.create_dataset(sdf_path, shard_size=1)
  assert dataset.get_number_shards() == 2
  assert len(dataset) == 2


def test_sharded_multi_file_sdf_load():
  """Load two SDF files with shard_size=1; expect 3 shards and 3 samples."""
  here = os.path.dirname(os.path.realpath(__file__))
  sdf_paths = [
      os.path.join(here, file_name)
      for file_name in ("membrane_permeability.sdf", "singleton.sdf")
  ]
  fingerprint = dc.feat.CircularFingerprint(size=16)
  sdf_loader = dc.data.SDFLoader(
      ["LogP(RRCK)"], featurizer=fingerprint, sanitize=True)
  dataset = sdf_loader.create_dataset(sdf_paths, shard_size=1)
  assert dataset.get_number_shards() == 3
  assert len(dataset) == 3


def test_sdf_load_with_csv():
  """Check loading an SDF file whose labels live in an associated csv file.

  With shard_size=1 the test expects 10 samples spread over 10 shards and
  a single task named "atomization_energy".
  """
  here = os.path.dirname(os.path.realpath(__file__))
  sdf_path = os.path.join(here, "water.sdf")
  fingerprint = dc.feat.CircularFingerprint(size=16)
  sdf_loader = dc.data.SDFLoader(
      ["atomization_energy"], featurizer=fingerprint, sanitize=True)
  dataset = sdf_loader.create_dataset(sdf_path, shard_size=1)
  assert len(dataset) == 10
  assert dataset.get_number_shards() == 10
  assert dataset.get_task_names() == ["atomization_energy"]
+160 −0
Original line number Diff line number Diff line
Generated by ForceBalance from calcs/cluster-02/VLE/250K/00/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.3522   -0.0789   -1.1805 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.1361   -1.0054   -1.2859 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.2308    0.3743   -1.7895 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.2501    0.1228    1.2735 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.1609   -0.0681    0.3398 H   0  0  0  0  0  0  0  0  0  0  0  0
   -1.1927    0.0889    1.4364 H   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  6  1  0  0  0  0
  5  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/01/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
   -0.6833    0.9705    0.3745 O   0  0  0  0  0  0  0  0  0  0  0  0
   -1.4703    0.7389   -0.1187 H   0  0  0  0  0  0  0  0  0  0  0  0
   -1.0013    1.1666    1.2558 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.8119   -0.9589   -0.4393 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.4470   -1.7720   -0.0901 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.2255   -0.2747   -0.1166 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  3  1  0  0  0  0
  2  1  1  0  0  0  0
  4  6  1  0  0  0  0
  4  5  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/02/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.2250   -1.2400    0.1519 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0488   -1.9014    0.8211 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0675   -1.6423   -0.6659 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.2524    1.3362   -0.1080 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0379    0.4038   -0.0769 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.4224    1.7191   -0.6687 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
  3  1  1  0  0  0  0
  4  5  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/03/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.5442   -1.2553    0.0884 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.2717   -0.3382    0.0601 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.2009   -1.2892    0.7840 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.5724    1.1365   -0.1524 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.9053    1.4770    0.6779 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.1396    1.8852   -0.5627 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  3  1  0  0  0  0
  2  1  1  0  0  0  0
  4  5  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/04/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.5716   -0.9660    0.8167 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0990   -0.2491    0.3937 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.4503   -0.9302    0.4386 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.5869    0.8748   -0.7201 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.4764    0.7662   -1.6647 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.8321    1.7936   -0.6112 H   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  6  1  0  0  0  0
  5  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/05/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
   -1.2519    0.2981    0.1241 O   0  0  0  0  0  0  0  0  0  0  0  0
   -1.7775   -0.3172    0.6354 H   0  0  0  0  0  0  0  0  0  0  0  0
   -1.6386    1.1542    0.3076 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.3413   -0.2661   -0.1294 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.5930   -0.9980   -0.6925 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.3851   -0.3004   -0.1042 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  3  1  0  0  0  0
  1  2  1  0  0  0  0
  4  6  1  0  0  0  0
  5  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/06/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.9408   -0.0129    1.0150 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.8624    0.0338    0.7608 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.4636   -0.0294    0.1854 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.9270   -0.0427   -0.9440 O   0  0  0  0  0  0  0  0  0  0  0  0
   -1.7394    0.2078   -0.5042 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.7702    0.6586   -1.5762 H   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  5  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/07/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
   -0.2170    0.6223   -1.1153 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.9903    0.6099   -1.6793 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.2030    1.4618   -1.3022 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.2096   -0.6802    1.2222 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.1250   -0.2173    0.4540 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.0515   -1.0371    0.9394 H   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  5  4  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/08/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.4738    0.9229   -0.7887 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.2878    1.1716   -1.3126 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.2208    1.2882   -1.2627 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.4689   -1.0248    0.8355 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.8800   -0.5469    1.5558 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.1112   -0.3421    0.2678 H   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  5  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
Generated by ForceBalance from calcs/cluster-02/VLE/250K/09/qchem.out: Frame 1 of 1
 OpenBabel03241615583D

  6  4  0  0  0  0  0  0  0  0999 V2000
    0.3431    0.5546    1.2527 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.9254   -0.1303    1.5813 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0195    0.1937    0.4437 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3857   -0.4769   -1.1651 O   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0194   -1.2992   -1.4903 H   0  0  0  0  0  0  0  0  0  0  0  0
   -0.2338    0.1499   -1.8724 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
  3  1  1  0  0  0  0
  5  4  1  0  0  0  0
  6  4  1  0  0  0  0
M  END
$$$$
+11 −0
Original line number Diff line number Diff line
atomization_energy
447.082359
448.859851
450.466600
450.851977
450.894234
450.743387
451.436905
451.559751
451.326782
451.400550
+3 −4
Original line number Diff line number Diff line
@@ -77,8 +77,8 @@ class CoulombMatrix(MolecularFeaturizer):
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.
  >>> dataset = loader.create_dataset(input_file)


  References
  ----------
@@ -261,8 +261,7 @@ class CoulombMatrixEig(CoulombMatrix):
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.
  >>> dataset = loader.create_dataset(input_file)

  References
  ----------
Loading