Merge pull request #2546 from Suzukazole/loaduspto (78ce3586) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/uspto_datasets.py

+149 −84

Original line number	Diff line number	Diff line
		"""
		Loads synthetic reaction datasets from USPTO.

		This file contains loaders for synthetic reaction datasets from the US Patenent Office. http://nextmovesoftware.com/blog/2014/02/27/unleashing-over-a-million-reactions-into-the-wild/.
		This file contains loaders for synthetic reaction datasets from the US Patent Office. http://nextmovesoftware.com/blog/2014/02/27/unleashing-over-a-million-reactions-into-the-wild/.
		"""
		import os
		import csv
		import logging
		import deepchem
		import numpy as np
		from deepchem.data import DiskDataset
		from deepchem.data import Dataset
		from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
		from typing import List, Optional, Tuple, Union
		import deepchem as dc

		logger = logging.getLogger(__name__)

		DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
		USPTO_URL = "https://bitbucket.org/dan2097/patent-reaction-extraction/downloads/2008-2011_USPTO_reactionSmiles_filtered.zip"


		def load_uspto(featurizer="plain",
		split=None,
		num_to_load=10000,
		reload=True,
		verbose=False,
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load USPTO dataset.

		For now, only loads the subset of data for 2008-2011 reactions.
		See https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
		for more details. The full dataset contains some 400K reactions. This causes
		an out-of-memory error on development laptop if full dataset is featurized.
		For now, return a truncated subset of dataset.
		Reloading is not entirely supported for this dataset.
		"""
		if data_dir is None:
		data_dir = DEFAULT_DIR
		if save_dir is None:
		save_dir = DEFAULT_DIR

		# Most reaction dataset ML tasks train the prediction of products from
		# ractants. Both of these are contained in the rxn object that is output,
		# so there is no "tasks" field.
		uspto_tasks = []
		if split is not None:
		raise ValueError("Train/valid/test not yet supported.")
		# Download USPTO dataset
		if reload:
		save_folder = os.path.join(save_dir, "uspto-featurized", str(featurizer))
		if featurizer == "smiles2img":
		img_spec = kwargs.get("img_spec", "std")
		save_folder = os.path.join(save_folder, img_spec)
		save_folder = os.path.join(save_folder, str(split))

		loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
		save_folder)
		if loaded:
		return uspto_tasks, all_dataset, transformers

		dataset_file = os.path.join(data_dir,
		"2008-2011_USPTO_reactionSmiles_filtered.zip")

		USPTO_MIT_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_MIT.csv"
		USPTO_STEREO_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_STEREO.csv"
		USPTO_50K_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_50K.csv"
		USPTO_FULL_URL = "https://deepchemdata.s3.us-west-1.amazonaws.com/datasets/USPTO_FULL.csv"

		USPTO_TASK: List[str] = []


		class _USPTOLoader(_MolnetLoader):

		def __init__(self, args, subset: str, sep_reagent: bool, *kwargs):
		super(_USPTOLoader, self).__init__(args, *kwargs)
		self.subset = subset
		self.sep_reagent = sep_reagent
		self.name = 'USPTO_' + subset

		def create_dataset(self) -> Dataset:
		if self.subset not in ['MIT', 'STEREO', '50K', 'FULL']:
		raise ValueError("Valid Subset names are MIT, STEREO and 50K.")

		if self.subset == 'MIT':
		dataset_url = USPTO_MIT_URL

		if self.subset == 'STEREO':
		dataset_url = USPTO_STEREO_URL

		if self.subset == '50K':
		dataset_url = USPTO_50K_URL

		if self.subset == 'FULL':
		dataset_url = USPTO_FULL_URL
		if self.splitter == 'SpecifiedSplitter':
		raise ValueError(
		"There is no pre computed split for the full dataset, use a custom split instead!"
		)

		dataset_file = os.path.join(self.data_dir, self.name + '.csv')

		if not os.path.exists(dataset_file):
		deepchem.utils.data_utils.download_url(url=USPTO_URL, dest_dir=data_dir)

		# Unzip
		unzip_dir = os.path.join(data_dir, "2008-2011_USPTO_reactionSmiles_filtered")
		if not os.path.exists(unzip_dir):
		deepchem.utils.data_utils.unzip_file(dataset_file, dest_dir=unzip_dir)
		# Unzipped file is a tap seperated values file (despite the .txt)
		filename = os.path.join(unzip_dir,
		"2008-2011_USPTO_reactionSmiles_filtered.txt")
		rxns = []
		from rdkit.Chem import rdChemReactions
		with open(filename) as tsvfile:
		reader = csv.reader(tsvfile, delimiter="\t")
		for ind, row in enumerate(reader):
		if ind > num_to_load:
		break
		if verbose:
		print("Loading reaction %d" % ind)
		# The first element in the row is the reaction smarts
		smarts = row[0]
		# Sometimes smarts have extraneous information at end of form "
		# \|f:0" that causes parsing to fail. Not sure what this information
		# is, but just ignoring for now.
		smarts = smarts.split(" ")[0]
		rxn = rdChemReactions.ReactionFromSmarts(smarts)
		rxns.append(rxn)
		rxn_array = np.array(rxns)
		# Make up dummy labels since DiskDataset.from_numpy doesn't allow
		# creation from just features for now.
		y = np.ones(len(rxn_array))
		# TODO: This dataset isn't saved to disk so reload doesn't happen.
		rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
		transformers = []
		return uspto_tasks, (rxn_dataset, None, None), transformers
		logger.info("Downloading dataset...")
		dc.utils.data_utils.download_url(url=dataset_url, dest_dir=self.data_dir)
		logger.info("Dataset download complete.")

		loader = dc.data.CSVLoader(
		tasks=self.tasks, feature_field="reactions", featurizer=self.featurizer)

		return loader.create_dataset(dataset_file, shard_size=8192)


		def load_uspto(
		featurizer: Union[dc.feat.Featurizer, str] = dc.feat.DummyFeaturizer(),
		splitter: Union[dc.splits.Splitter, str, None] = None,
		transformers: List[Union[TransformerGenerator, str]] = [],
		reload: bool = True,
		data_dir: Optional[str] = None,
		save_dir: Optional[str] = None,
		subset: str = "MIT",
		sep_reagent: bool = True, # functionality to be added!
		**kwargs
		) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
		"""Load USPTO Datasets.

		USPTO is a dataset of over 1.8 Million organic chemical reactions extracted
		from US patents and patent applications. The dataset contains the reactions
		in the form of reaction SMILES, which have the general format:
		reactant>reagent>product.

		Molnet provides ability to load subsets of the USPTO dataset namely MIT,
		STEREO and 50K. The MIT dataset contains around 479K reactions, curated by
		jin et al. The STEREO dataset contains around 1 Million Reactions, it does
		not have duplicates and the reactions include stereochemical information.
		The 50K dataset contatins 50,000 reactions and is the benchmark for
		retrosynthesis predictions. The reactions are additionally classified into 10
		reaction classes. The canonicalized version of the dataset used by the loader
		is the same as that used by somnath et. al.

		The loader uses the SpecifiedSplitter to use the same splits as specified
		by Schwaller and Coley. Custom splitters could also be used. There is also a
		toggle to load the dataset with the reagents separated or mixed. This alters
		the entries in src by replacing the '>' with '.', effectively loading them as
		a unified SMILES string.

		Parameters
		----------
		featurizer: Featurizer or str
		the featurizer to use for processing the data. Alternatively you can pass
		one of the names from dc.molnet.featurizers as a shortcut.
		splitter: Splitter or str
		the splitter to use for splitting the data into training, validation, and
		test sets. Alternatively you can pass one of the names from
		dc.molnet.splitters as a shortcut. If this is None, all the data
		will be included in a single dataset.
		transformers: list of TransformerGenerators or strings
		the Transformers to apply to the data. Each one is specified by a
		TransformerGenerator or, as a shortcut, one of the names from
		dc.molnet.transformers.
		reload: bool
		if True, the first call for a particular featurizer and splitter will cache
		the datasets to disk, and subsequent calls will reload the cached datasets.
		data_dir: str
		a directory to save the raw data in
		save_dir: str
		a directory to save the dataset in
		subset : str (default 'MIT')
		Subset of dataset to download. 'FULL', 'MIT', 'STEREO', and '50K' are supported.
		sep_reagent : bool (default True)
		Toggle to load dataset with reactants and reado I call it
		Returns
		-------
		tasks, datasets, transformers : tuple
		tasks : list
		Column names corresponding to machine learning target variables.
		datasets : tuple
		train, validation, test splits of data as
		``deepchem.data.datasets.Dataset`` instances.
		transformers : list
		``deepchem.trans.transformers.Transformer`` instances applied
		to dataset.
		----------
		.. [1] Lowe, D.. (2017). Chemical reactions from US patents (1976-Sep2016)
		(Version 1). figshare. https://doi.org/10.6084/m9.figshare.5104873.v1
		.. [2] Schwaller, P., Laino, T., Gaudin, T., Bolgar, P., Hunter, C. A., Bekas,
		C., & Lee, A. A. (2019). Molecular transformer: a model for
		uncertainty-calibrated chemical reaction prediction.
		ACS central science, 5(9), 1572-1583.
		.. [3] Somnath, V. R., Bunne, C., Coley, C. W., Krause, A., & Barzilay, R.
		(2020). Learning Graph Models for Retrosynthesis Prediction.
		arXiv preprint arXiv:2006.07038.
		.. [4] Dai, H., Li, C., Coley, C. W., Dai, B., & Song, L. (2020).
		Retrosynthesis prediction with conditional graph logic network.
		arXiv preprint arXiv:2001.01408.
		"""

		loader = _USPTOLoader(
		featurizer,
		splitter,
		transformers,
		USPTO_TASK,
		data_dir,
		save_dir,
		subset=subset,
		sep_reagent=sep_reagent,
		**kwargs)
		return loader.load_dataset(loader.name, reload)

Admin message