Unverified Commit d6b17fa2 authored by Suzukazole's avatar Suzukazole
Browse files

Add loader doc

parent 95f43b86
Loading
Loading
Loading
Loading
+60 −3
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
USPTO_MIT_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_MIT.csv"
USPTO_STEREO_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_STEREO.csv"
USPTO_50K_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_50K.csv"
USPTO_FULL_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_FULL.csv"


class _USPTOLoader(_MolnetLoader):
@@ -30,7 +31,7 @@ class _USPTOLoader(_MolnetLoader):
    self.name = 'USPTO_' + subset

  def create_dataset(self) -> Dataset:
    if self.subset not in ['MIT', 'STEREO', '50K']:
    if self.subset not in ['MIT', 'STEREO', '50K', 'FULL']:
      raise ValueError("Valid Subset names are MIT, STEREO and 50K.")

    if self.subset == 'MIT':
@@ -42,6 +43,9 @@ class _USPTOLoader(_MolnetLoader):
    if self.subset == '50K':
      dataset_url = USPTO_50K_URL

    if self.subset == 'FULL':
      dataset_url = USPTO_FULL_URL

    dataset_file = os.path.join(self.data_dir, self.name + '.csv')

    if not os.path.exists(dataset_file):
@@ -56,7 +60,7 @@ class _USPTOLoader(_MolnetLoader):

def load_uspto(
    featurizer=None,  # should I remove this?
    splitter=None,
    splitter: Union[dc.splits.Splitter, str, None] = 'SpecifiedSplitter',
    transformers=None,
    reload: bool = True,
    data_dir: Optional[str] = None,
@@ -65,7 +69,60 @@ def load_uspto(
    sep_reagent: bool = True,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:

  """Load USPTO Datasets.

  USPTO is a dataset of over 1.8 million organic chemical reactions extracted from
  US patents and patent applications. The dataset is stored in the form of src and
  tgt. The src contains the SMILES for the reactants and reagents in the form
  reactant>reagent; the tgt contains the SMILES for the product.

  Molnet provides the ability to load subsets of USPTO such as MIT, STEREO and 50K.
  The MIT dataset contains around 480k reactions.
  The STEREO dataset contains around 1 million reactions.
  The 50K dataset contains 50,000 reactions with an additional label indicating
  the class of reaction to which each reaction belongs.
  The loader uses the specified splitter to reproduce the same splits as used by
  Schwaller and Coley. Custom splitters can also be used.
  There is also a toggle to load the dataset with the reagents separated or mixed.

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in
  subset : str (default 'MIT')
    Subset of dataset to download. 'FULL', 'MIT', 'STEREO', and '50K' are supported.

  Returns
  -------
  tasks, datasets, transformers : tuple
    tasks : list
      Column names corresponding to machine learning target variables.
    datasets : tuple
      train, validation, test splits of data as
      ``deepchem.data.datasets.Dataset`` instances.
    transformers : list
      ``deepchem.trans.transformers.Transformer`` instances applied
      to dataset.

  References
  ----------
  .. [1] 
  """
  #get test and valid lists if subset is MIT, 50K, STEREO and splitter = specified.
  #if subset is Full use splitter passed by the user.
  #splitter = dc.splits.SpecifiedSplitter(valid_indices=,test_indices=)
  featurizer = dc.feat.UserDefinedFeaturizer([])
  loader = _USPTOLoader(
      featurizer,