Add loader doc (d6b17fa2) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/uspto_datasets.py

+60 −3

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@ DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
		USPTO_MIT_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_MIT.csv"
		USPTO_STEREO_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_STEREO.csv"
		USPTO_50K_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_50K.csv"
		USPTO_FULL_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_FULL.csv"


		class _USPTOLoader(_MolnetLoader):
		@@ -30,7 +31,7 @@ class _USPTOLoader(_MolnetLoader):
		self.name = 'USPTO_' + subset

		def create_dataset(self) -> Dataset:
		if self.subset not in ['MIT', 'STEREO', '50K']:
		if self.subset not in ['MIT', 'STEREO', '50K', 'FULL']:
		raise ValueError("Valid Subset names are MIT, STEREO and 50K.")

		if self.subset == 'MIT':
		@@ -42,6 +43,9 @@ class _USPTOLoader(_MolnetLoader):
		if self.subset == '50K':
		dataset_url = USPTO_50K_URL

		if self.subset == 'FULL':
		dataset_url = USPTO_FULL_URL

		dataset_file = os.path.join(self.data_dir, self.name + '.csv')

		if not os.path.exists(dataset_file):
		@@ -56,7 +60,7 @@ class _USPTOLoader(_MolnetLoader):

		def load_uspto(
		featurizer=None, # should I remove this?
		splitter=None,
		splitter: Union[dc.splits.Splitter, str, None] = 'SpecifiedSplitter',
		transformers=None,
		reload: bool = True,
		data_dir: Optional[str] = None,
		@@ -65,7 +69,60 @@ def load_uspto(
		sep_reagent: bool = True,
		**kwargs
		) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:

		"""Load USPTO Datasets.

		USPTO is a dataset of over 1.8 million organic chemical reactions extracted from
		US patents and patent applications. The dataset is stored in the form of src and
		tgt. The src contains the SMILES for the reactants and reagents in the form
		reactant>reagent; the tgt contains the SMILES for the product.

		Molnet provides the ability to load subsets of USPTO such as MIT, STEREO and 50K.
		The MIT dataset contains around 480k reactions.
		The STEREO dataset contains around 1 million reactions.
		The 50K dataset contains 50,000 reactions with an additional label indicating the class of reaction to which it belongs.
		The loader uses the specified splitter to reproduce the same splits as used by Schwaller and Coley; custom splitters can also be used.
		There is also a toggle to load the dataset with the reagents separated or mixed.

		Parameters
		----------
		featurizer: Featurizer or str
		the featurizer to use for processing the data. Alternatively you can pass
		one of the names from dc.molnet.featurizers as a shortcut.
		splitter: Splitter or str
		the splitter to use for splitting the data into training, validation, and
		test sets. Alternatively you can pass one of the names from
		dc.molnet.splitters as a shortcut. If this is None, all the data
		will be included in a single dataset.
		transformers: list of TransformerGenerators or strings
		the Transformers to apply to the data. Each one is specified by a
		TransformerGenerator or, as a shortcut, one of the names from
		dc.molnet.transformers.
		reload: bool
		if True, the first call for a particular featurizer and splitter will cache
		the datasets to disk, and subsequent calls will reload the cached datasets.
		data_dir: str
		a directory to save the raw data in
		save_dir: str
		a directory to save the dataset in
		subset : str (default 'MIT')
		Subset of dataset to download. 'FULL', 'MIT', 'STEREO', and '50K' are supported.
		Returns
		-------
		tasks, datasets, transformers : tuple
		tasks : list
		Column names corresponding to machine learning target variables.
		datasets : tuple
		train, validation, test splits of data as
		``deepchem.data.datasets.Dataset`` instances.
		transformers : list
		``deepchem.trans.transformers.Transformer`` instances applied
		to dataset.
		References
		----------
		.. [1]
		"""
		#get test and valid lists if subset is MIT, 50K, STEREO and splitter = specified.
		#if subset is Full use splitter passed by the user.
		#splitter = dc.splits.SpecifiedSplitter(valid_indices=,test_indices=)
		featurizer = dc.feat.UserDefinedFeaturizer([])
		loader = _USPTOLoader(
		featurizer,

Admin message