Change args (d52ed986) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/zinc15_datasets.py

+40 −21

Original line number	Diff line number	Diff line
		@@ -3,13 +3,14 @@ ZINC15 commercially-available compounds for virtual screening.
		"""
		import os
		import logging
		import numpy as np
		import deepchem
		from deepchem.feat import Featurizer
		from deepchem.trans import Transformer
		from deepchem.splits.splitters import Splitter
		from deepchem.molnet.defaults import get_defaults

		from typing import List, Tuple, Dict, Optional
		from typing import List, Tuple, Dict, Optional, Union

		logger = logging.getLogger(__name__)

		@@ -51,10 +52,9 @@ def load_zinc15(
		'transform_X': True
		}
		},
		zinc15_kwargs: Dict[str, str] = {
		'dataset_size': '250K',
		'dataset_dimension': '2D'
		},
		dataset_size: str = '250K',
		dataset_dimension: str = '2D',
		test_run: bool = False,
		**kwargs) -> Tuple[List, Optional[Tuple], List]:
		"""Load zinc15.

		@@ -64,11 +64,13 @@ def load_zinc15(
		in 2D (SMILES string) format.

		MolNet provides subsets of 250K, 1M, and 10M "lead-like" compounds
		from ZINC15. Compounds in ZINC15 are labeled by their molecular weight
		from ZINC15. The full dataset of 270M "goldilocks" compounds is also
		available. Compounds in ZINC15 are labeled by their molecular weight
		and LogP (solubility) values. Each compound also has information about how
		readily available (purchasable) it is and its reactivity. Lead-like
		compounds have molecular weight between 300 and 350 Daltons and LogP
		between -1 and 3.5.
		between -1 and 3.5. Goldilocks compounds are lead-like compounds with
		LogP values further restricted to between 2 and 3.

		If `reload = True` and `data_dir` (`save_dir`) is specified, the loader
		will attempt to load the raw dataset (featurized dataset) from disk.
		@@ -103,9 +105,12 @@ def load_zinc15(
		transformer_kwargs : dict
		Maps transformer names to constructor arguments, e.g.
		{"BalancingTransformer": {"transform_x":True, "transform_y":False}}
		zinc15_kwargs : dict
		Specify parameters for the ZINC15 dataset. Accepted keywords are
		'dataset_size' and 'dataset_dimension'.
		dataset_size : str (default '250K')
		Number of compounds to download; '250K', '1M', '10M', or '270M'.
		dataset_dimension : str (default '2D')
		SMILES strings (2D) or 3D SDF files; '2D' or '3D'. Currently, only
		'2D' is supported; any other value raises a ValueError.
		test_run : bool (default False)
		Flag to indicate tests, if True dataset is not downloaded.
		**kwargs : additional optional arguments.

		Returns
		@@ -124,6 +129,7 @@ def load_zinc15(
		-----
		The total ZINC dataset with SMILES strings contains hundreds of millions
		of compounds and is over 100GB! ZINC250K is recommended for experimentation.
		The full set of 270M goldilocks compounds is 23GB.

		References
		----------
		@@ -131,12 +137,12 @@ def load_zinc15(

		Examples
		--------
		>> import deepchem as dc
		>> tasks, datasets, transformers = dc.molnet.load_zinc15(reload=False)
		>> train_dataset, val_dataset, test_dataset = datasets
		>> n_tasks = len(tasks)
		>> n_features = train_dataset.get_data_shape()[0]
		>> model = dc.models.MultitaskRegressor(n_tasks, n_features)
		>>> import deepchem as dc
		>>> tasks, datasets, transformers = dc.molnet.load_zinc15(test_run=True)
		>>> train_dataset, val_dataset, test_dataset = datasets
		>>> n_tasks = len(tasks)
		>>> n_features = train_dataset.X.shape[1]
		>>> model = dc.models.MultitaskRegressor(n_tasks, n_features)

		"""

		@@ -144,12 +150,12 @@ def load_zinc15(
		logger.info("About to featurize zinc15.")
		my_tasks = ['mwt', 'logp', 'reactive'] # machine learning targets

		# Get params specific to ZINC15
		dataset_size = zinc15_kwargs.get('dataset_size', '250K')
		dataset_dimension = zinc15_kwargs.get('dataset_dimension', '2D')
		if test_run:
		ds = deepchem.data.NumpyDataset(np.zeros((10, 1)))
		return my_tasks, (ds, ds, ds), []

		# Raise warnings and list other available options
		if dataset_size not in ['250K', '1M', '10M']:
		if dataset_size not in ['250K', '1M', '10M', '270M']:
		raise ValueError("""
		Only '250K', '1M', and '10M' are supported for dataset_size.
		""")
		@@ -157,6 +163,14 @@ def load_zinc15(
		raise ValueError("""
		Currently, only '2D' is supported for dataset_dimension.
		""")
		if dataset_size == '270M':
		answer = ''
		while answer not in ['y', 'n']:
		answer = input("""You're about to download 270M SMILES strings.
		This dataset is 23GB. Are you sure you want to continue? (Y/N)"""
		).lower()
		if answer == 'n':
		raise ValueError('Choose a smaller dataset_size.')

		dataset_filename = 'zinc15_' + dataset_size + '_' + dataset_dimension + '.tar.gz'

		@@ -208,7 +222,7 @@ def load_zinc15(
		featurizer=featurizer)

		# Featurize dataset
		dataset = loader.create_dataset(dataset_file)
		dataset = loader.create_dataset(os.path.join(data_dir, dataset_file))

		train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
		dataset, **splitter_kwargs)
		@@ -231,3 +245,8 @@ def load_zinc15(
		save_folder, train_dataset, valid_dataset, test_dataset, transformers)

		return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers


		# Script entry point: when this file is executed directly (rather than
		# imported), run the doctest examples found in this module's docstrings.
		if __name__ == "__main__":
		# Imported inside the guard so doctest is only loaded for direct execution.
		import doctest
		# With no arguments, testmod() examines the __main__ module.
		doctest.testmod()

Admin message