Merge pull request #2048 from miaecle/master (f34c9ce3) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/bace_datasets.py

+27 −2

Original line number	Diff line number	Diff line
		@@ -19,7 +19,29 @@ def load_bace_regression(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load bace datasets."""
		""" Load BACE dataset, regression labels

		The BACE dataset provides quantitative IC50 and qualitative (binary label)
		binding results for a set of inhibitors of human beta-secretase 1 (BACE-1).

		All data are experimental values reported in scientific literature over the
		past decade, some with detailed crystal structures available. A collection
		of 1522 compounds is provided, along with the regression labels of IC50.

		Scaffold splitting is recommended for this dataset.

		The raw data csv file contains columns below:

		- "mol" - SMILES representation of the molecular structure
		- "pIC50" - Negative log of the IC50 binding affinity
		- "class" - Binary labels for inhibitor

		References
		----------
		.. [1] Subramanian, Govindan, et al. "Computational modeling of β-secretase 1
		(BACE-1) inhibitors using ligand based approaches." Journal of chemical
		information and modeling 56.10 (2016): 1936-1949.
		"""
		# Featurize bace dataset
		logger.info("About to featurize bace dataset.")
		if data_dir is None:
		@@ -125,7 +147,10 @@ def load_bace_classification(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load bace datasets."""
		""" Load BACE dataset, classification labels

		BACE dataset with classification labels ("class").
		"""
		# Featurize bace dataset
		logger.info("About to featurize bace dataset.")
		if data_dir is None:

deepchem/molnet/load_function/bbbp_datasets.py

+27 −1

Original line number	Diff line number	Diff line
		@@ -17,7 +17,33 @@ def load_bbbp(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load blood-brain barrier penetration datasets """
		"""Load BBBP dataset

		The blood-brain barrier penetration (BBBP) dataset is designed for the
		modeling and prediction of barrier permeability. As a membrane separating
		circulating blood and brain extracellular fluid, the blood-brain barrier
		blocks most drugs, hormones and neurotransmitters. Thus penetration of the
		barrier forms a long-standing issue in the development of drugs targeting
		the central nervous system.

		This dataset includes binary labels for over 2000 compounds on their
		permeability properties.

		Scaffold splitting is recommended for this dataset.

		The raw data csv file contains columns below:

		- "name" - Name of the compound
		- "smiles" - SMILES representation of the molecular structure
		- "p_np" - Binary labels for penetration/non-penetration

		References
		----------
		.. [1] Martins, Ines Filipa, et al. "A Bayesian approach to in silico
		blood-brain barrier penetration modeling." Journal of chemical
		information and modeling 52.6 (2012): 1686-1697.
		"""

		# Featurize bbb dataset
		logger.info("About to featurize bbbp dataset.")
		if data_dir is None:

deepchem/molnet/load_function/clintox_datasets.py

+20 −12

Original line number	Diff line number	Diff line
		@@ -18,33 +18,41 @@ def load_clintox(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load clintox datasets.
		"""Load ClinTox dataset

		The ClinTox dataset compares drugs approved by the FDA and
		drugs that have failed clinical trials for toxicity reasons.
		The dataset includes two classification tasks for 1491 drug
		compounds with known chemical structures: (1) clinical trial
		toxicity (or absence of toxicity) and (2) FDA approval status.
		compounds with known chemical structures:

		#. clinical trial toxicity (or absence of toxicity)
		#. FDA approval status.

		The list of FDA-approved drugs is compiled from the SWEETLEAD
		database, and the list of drugs that failed clinical trials for
		toxicity reasons is compiled from the Aggregate Analysis of
		ClinicalTrials.gov (AACT) database.

		The data file contains a csv table, in which columns below are used:
		"smiles" - SMILES representation of the molecular structure
		"FDA_APPROVED" - FDA approval status
		"CT_TOX" - Clinical trial results
		Random splitting is recommended for this dataset.

		The raw data csv file contains columns below:

		- "smiles" - SMILES representation of the molecular structure
		- "FDA_APPROVED" - FDA approval status
		- "CT_TOX" - Clinical trial results

		References
		----------
		.. [1] Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento.
		"A data-driven approach to predicting successes and failures of clinical trials."
		"A data-driven approach to predicting successes and failures of clinical
		trials."
		Cell chemical biology 23.10 (2016): 1294-1301.
		.. [2] Artemov, Artem V., et al. "Integrated deep learned transcriptomic and
		structure-based predictor of clinical trials outcomes." bioRxiv (2016): 095653.
		.. [3] Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs,
		regulated chemicals, and herbal isolates for computer-aided drug discovery."
		PloS one 8.11 (2013): e79568.
		structure-based predictor of clinical trials outcomes." bioRxiv (2016):
		095653.
		.. [3] Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved
		drugs, regulated chemicals, and herbal isolates for computer-aided drug
		discovery." PloS one 8.11 (2013): e79568.
		.. [4] Aggregate Analysis of ClinicalTrials.gov (AACT) Database.
		https://www.ctti-clinicaltrials.org/aact-database
		"""

deepchem/molnet/load_function/delaney_datasets.py

+18 −6

Original line number	Diff line number	Diff line
		@@ -18,15 +18,27 @@ def load_delaney(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load delaney datasets.
		"""Load Delaney dataset

		The Delaney datasets are extracted from the following paper
		The Delaney (ESOL) dataset is a regression dataset containing structures and
		water solubility data for 1128 compounds. The dataset is widely used to
		validate machine learning models on estimating solubility directly from
		molecular structures (as encoded in SMILES strings).

		Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.
		Random splitting is recommended for this dataset.

		This dataset contains 2874 measured aqueous solubility
		values. The source dataset is available in the supplemental
		material of the original paper.
		The raw data csv file contains columns below:

		- "Compound ID" - Name of the compound
		- "smiles" - SMILES representation of the molecular structure
		- "measured log solubility in mols per litre" - Log-scale water solubility
		of the compound, used as label

		References
		----------
		.. [1] Delaney, John S. "ESOL: estimating aqueous solubility directly from
		molecular structure." Journal of chemical information and computer
		sciences 44.3 (2004): 1000-1005.
		"""
		# Featurize Delaney dataset
		logger.info("About to featurize Delaney dataset.")

deepchem/molnet/load_function/hiv_datasets.py

+7 −3

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@ def load_hiv(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load hiv datasets. Does not do train/test split
		"""Load HIV dataset

		The HIV dataset was introduced by the Drug Therapeutics
		Program (DTP) AIDS Antiviral Screen, which tested the ability
		@@ -28,14 +28,18 @@ def load_hiv(featurizer='ECFP',
		latter two labels, making it a classification task between
		inactive (CI) and active (CA and CM).

		The data file contains a csv table, in which columns below are used:
		Scaffold splitting is recommended for this dataset.

		The raw data csv file contains columns below:

		- "smiles": SMILES representation of the molecular structure
		- "activity": Three-class labels for screening results: CI/CM/CA
		- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)

		References
		----------
		.. [1] AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
		.. [1] AIDS Antiviral Screen Data.
		https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
		"""
		# Featurize hiv dataset
		logger.info("About to featurize hiv dataset.")

Admin message