Commit 714661e1 authored by Atreya Majumdar's avatar Atreya Majumdar
Browse files

Added docstring and rst entry

parent af13b69f
Loading
Loading
Loading
Loading
+57 −9
Original line number Diff line number Diff line
@@ -10,15 +10,18 @@ from typing import List, Optional, Tuple, Union
# Download location of the gzipped FreeSolv CSV file.
FREESOLV_URL = ('https://deepchemdata.s3.us-west-1.amazonaws.com'
                '/datasets/freesolv.csv.gz')
# Task names handed to the loader; the dataset has a single task, 'y'.
FREESOLV_TASKS = ['y']


class _FreesolvLoader(_MolnetLoader):
  """Loader that fetches the FreeSolv CSV when missing and featurizes it."""

  def create_dataset(self) -> Dataset:
    """Create the FreeSolv dataset from the raw CSV file.

    The gzipped CSV is downloaded from ``FREESOLV_URL`` into
    ``self.data_dir`` only when it is not already present there. The
    dataset is then built from the local file in either case.

    Returns
    -------
    Dataset
      The dataset produced by ``dc.data.CSVLoader.create_dataset`` using
      ``self.tasks`` and ``self.featurizer`` on the 'smiles' field.
    """
    dataset_file = os.path.join(self.data_dir, 'freesolv.csv.gz')
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(url=FREESOLV_URL, dest_dir=self.data_dir)
    # Build the loader outside the download guard: an already-cached file
    # must still be loaded, otherwise the method would return None.
    loader = dc.data.CSVLoader(
        tasks=self.tasks, feature_field='smiles', featurizer=self.featurizer)
    return loader.create_dataset(dataset_file)


def load_freesolv(
    featurizer: Union[dc.feat.Featurizer, str] = dc.feat.MATFeaturizer(),
    splitter: Union[dc.splits.Splitter, str, None] = None,
@@ -28,6 +31,51 @@ def load_freesolv(
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load Freesolv dataset

  The FreeSolv dataset is a collection of experimental and calculated hydration
  free energies for small molecules in water, along with their experimental values.
  Here, we are using a modified version of the dataset with the molecule SMILES string
  and the corresponding experimental hydration free energies.
  

  Random splitting is recommended for this dataset.

  The raw data csv file contains columns below:

  - "mol" - SMILES representation of the molecular structure
  - "y" - Experimental hydration free energy

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  References
  ----------
  .. [1] Łukasz Maziarka, et al. "Molecule Attention Transformer." NeurIPS 2019
     arXiv:2002.08264v1 [cs.LG].
  .. [2] Mobley DL, Guthrie JP. FreeSolv: 
     a database of experimental and calculated hydration free energies, with input files. 
     J Comput Aided Mol Des. 2014;28(7):711-720. doi:10.1007/s10822-014-9747-x
  """
  loader = _FreesolvLoader(featurizer, splitter, transformers, FREESOLV_TASKS,
                           data_dir, save_dir, **kwargs)
  return loader.load_dataset('freesolv', reload)
+5 −0
Original line number Diff line number Diff line
@@ -54,6 +54,11 @@ Cell Counting Datasets

.. autofunction:: deepchem.molnet.load_cell_counting

Freesolv Dataset
----------------

.. autofunction:: deepchem.molnet.load_freesolv

Chembl Datasets
---------------