added loader (a3608ddd) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/uspto_datasets.py

+72 −17

Original line number	Diff line number	Diff line
		@@ -9,40 +9,94 @@ import logging
		import deepchem
		import numpy as np
		from deepchem.data import DiskDataset
		from deepchem.molnet.load_function.molnet_loader import _MolnetLoader
		from typing import List, Optional, Tuple, Union
		import deepchem as dc

		# Module-level logger named after this module.
		logger = logging.getLogger(__name__)

		# Default directory for data files, as reported by deepchem's data utilities.
		DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
		# Zip archive of the filtered 2008-2011 USPTO reaction SMILES.
		USPTO_URL = "https://bitbucket.org/dan2097/patent-reaction-extraction/downloads/2008-2011_USPTO_reactionSmiles_filtered.zip"

		# CSV files for the USPTO_MIT subset (train / test / validation), hosted in
		# the deepchemdata S3 bucket.
		USPTO_MIT_TRAIN = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_MIT_train.csv"
		USPTO_MIT_TEST = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_MIT_test.csv"
		USPTO_MIT_VALID = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_MIT_val.csv"

		def load_uspto(featurizer="plain",
		# CSV files for the USPTO_STEREO subset (train / test / validation), hosted
		# in the deepchemdata S3 bucket.
		USPTO_STEREO_TRAIN = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_STEREO_train.csv"
		USPTO_STEREO_TEST = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_STEREO_test.csv"
		USPTO_STEREO_VALID = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/USPTO_STEREO_val.csv"


		class _USPTOLoader(_MolnetLoader):
		    """Loader that builds a dataset from the USPTO_MIT test CSV file.

		    NOTE(review): only the USPTO_MIT test file is read here; the other
		    USPTO_MIT / USPTO_STEREO URL constants in this module are not used by
		    this class yet -- confirm whether that is intended.
		    """

		    def create_dataset(self) -> DiskDataset:
		        """Download the CSV if it is missing locally, then featurize it.

		        The file is looked up as ``USPTO_MIT_test.csv`` inside
		        ``self.data_dir``; when absent it is downloaded from
		        ``USPTO_MIT_TEST`` into that directory.

		        Returns
		        -------
		        DiskDataset
		            The result of ``CSVLoader.create_dataset`` on the CSV file,
		            requested with a shard size of 8192.
		        """
		        csv_path = os.path.join(self.data_dir, "USPTO_MIT_test.csv")

		        already_cached = os.path.exists(csv_path)
		        if not already_cached:
		            dc.utils.data_utils.download_url(
		                url=USPTO_MIT_TEST, dest_dir=self.data_dir)

		        # The "smiles" column of the CSV is handed to the featurizer.
		        csv_loader = dc.data.CSVLoader(
		            tasks=self.tasks,
		            feature_field="smiles",
		            featurizer=self.featurizer,
		        )
		        dataset = csv_loader.create_dataset(csv_path, shard_size=8192)
		        return dataset


		def load_uspto(
		featurizer: Union[dc.feat.Featurizer, str] = None,
		splitter: Union[dc.splits.Splitter, str, None] = None,
		transformers: List[Union[TransformerGenerator, str]] = None,
		reload: bool = True,
		data_dir: Optional[str] = None,
		save_dir: Optional[str] = None,
		**kwargs
		) -> Tuple[List[str], Tuple[DiskDataset, ...], List[dc.trans.Transformer]]:














		"""
		def load_uspto(featurizer="plain", #what does the plain featurizer mean? what are the other featurizers?
		split=None,
		num_to_load=10000,
		reload=True,
		verbose=False,
		data_dir=None,
		num_to_load=10000, # load the whole thing!
		reload=True, #what is reload?
		verbose=False, #what is verbose?
		data_dir=None, #ig this is okay
		save_dir=None,
		**kwargs):
		**kwargs): ##have to give option to load a particular subset. and option to separate reagents
		"""
		"""Load USPTO dataset.

		For now, only loads the subset of data for 2008-2011 reactions.
		See https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
		The USPTO Dataset consists of over a million reactions from United States
		patent applications (2001-2013) and the same again from patent grants
		(1976-2013). The loader can load the entire dataset or subsets of it.
		The subsets are USPTO_STEREO, USPTO_MIT and USPTO_50k. The STEREO dataset
		contains around a million reactions with stereochemical information,
		the 50k dataset consists of 50k reactions classified into 10 classes and
		the MIT dataset consists of around 470k reactions.

		https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
		for more details. The full dataset contains some 400K reactions. This causes
		an out-of-memory error on development laptop if full dataset is featurized.
		For now, return a truncated subset of dataset.
		Reloading is not entirely supported for this dataset.
		"""
		"""
		if data_dir is None:
		data_dir = DEFAULT_DIR
		if save_dir is None:
		save_dir = DEFAULT_DIR

		# Most reaction dataset ML tasks train the prediction of products from
		# ractants. Both of these are contained in the rxn object that is output,
		# reactants. Both of these are contained in the rxn object that is output,
		# so there is no "tasks" field.
		uspto_tasks = []
		if split is not None:
		uspto_tasks = [] #check what is tasks, but im pretty sure there are none!
		if split is not None: ##have to change this, we have train/test/valid ready.
		raise ValueError("Train/valid/test not yet supported.")
		# Download USPTO dataset
		if reload:
		@@ -59,10 +113,10 @@ def load_uspto(featurizer="plain",

		dataset_file = os.path.join(data_dir,
		"2008-2011_USPTO_reactionSmiles_filtered.zip")
		if not os.path.exists(dataset_file):
		if not os.path.exists(dataset_file): # no need to download? since its on AWS
		deepchem.utils.data_utils.download_url(url=USPTO_URL, dest_dir=data_dir)

		# Unzip
		# Unzip  # NOTE: no need to unzip, since the data is already in .csv format
		unzip_dir = os.path.join(data_dir, "2008-2011_USPTO_reactionSmiles_filtered")
		if not os.path.exists(unzip_dir):
		deepchem.utils.data_utils.unzip_file(dataset_file, dest_dir=unzip_dir)
		@@ -81,7 +135,7 @@ def load_uspto(featurizer="plain",
		# The first element in the row is the reaction smarts
		smarts = row[0]
		# Sometimes smarts have extraneous information at end of form "
		# \|f:0" that causes parsing to fail. Not sure what this information
		# \|f:0" that causes parsing to fail. Not sure what this information (TODO: figure out what the "\|f" suffix means)
		# is, but just ignoring for now.
		smarts = smarts.split(" ")[0]
		rxn = rdChemReactions.ReactionFromSmarts(smarts)
		@@ -92,5 +146,6 @@ def load_uspto(featurizer="plain",
		y = np.ones(len(rxn_array))
		# TODO: This dataset isn't saved to disk so reload doesn't happen.
		rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
		transformers = []
		return uspto_tasks, (rxn_dataset, None, None), transformers
		transformers = [] #what are these transformers?
		return uspto_tasks, (rxn_dataset, None, None), transformers #it returns a diskdataset.
		"""
		No newline at end of file

Admin message