Merge pull request #1867 from deepchem/molnet_docs (513abf9e) · Commits · 钟慕尧 / deepchem

deepchem/molnet/load_function/clintox_datasets.py

+27 −1

Original line number	Diff line number	Diff line
		@@ -18,7 +18,33 @@ def load_clintox(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load clintox datasets."""
		"""Load clintox datasets.

		The ClinTox dataset compares drugs approved by the FDA and
		drugs that have failed clinical trials for toxicity reasons.
		The dataset includes two classification tasks for 1491 drug
		compounds with known chemical structures: (1) clinical trial
		toxicity (or absence of toxicity) and (2) FDA approval status.
		The list of FDA-approved drugs is compiled from the SWEETLEAD
		database, and the list of drugs that failed clinical trials for
		toxicity reasons is compiled from the Aggregate Analysis of
		ClinicalTrials.gov (AACT) database.

		The data file contains a csv table, in which columns below are
		used:
		"smiles" - SMILES representation of the molecular structure
		"FDA_APPROVED" - FDA approval status
		"CT_TOX" - Clinical trial results

		References:
		Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento. "A data-driven approach to predicting successes and failures of clinical trials." Cell chemical biology 23.10 (2016): 1294-1301.

		Artemov, Artem V., et al. "Integrated deep learned transcriptomic and structure-based predictor of clinical trials outcomes." bioRxiv (2016): 095653.

		Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PloS one 8.11 (2013): e79568.

		Aggregate Analysis of ClinicalTrials.gov (AACT) Database. https://www.ctti-clinicaltrials.org/aact-database
		"""
		if data_dir is None:
		data_dir = DEFAULT_DIR
		if save_dir is None:

deepchem/molnet/load_function/delaney_datasets.py

+12 −2

Original line number	Diff line number	Diff line
		@@ -18,7 +18,16 @@ def load_delaney(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load delaney datasets."""
		"""Load delaney datasets.

		The Delaney datasets are extracted from the following paper:

		Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

		This dataset contains 2874 measured aqueous solubility
		values. The source dataset is available in the supplemental
		material of the original paper.
		"""
		# Featurize Delaney dataset
		logger.info("About to featurize Delaney dataset.")
		if data_dir is None:
		@@ -60,8 +69,9 @@ def load_delaney(featurizer='ECFP',
		elif featurizer == "smiles2img":
		img_spec = kwargs.get("img_spec", "std")
		img_size = kwargs.get("img_size", 80)
		res = kwargs.get("res", 0.5)
		featurizer = deepchem.feat.SmilesToImage(
		img_size=img_size, img_spec=img_spec)
		img_size=img_size, img_spec=img_spec, res=res)

		loader = deepchem.data.CSVLoader(
		tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)

deepchem/molnet/load_function/factors_datasets.py

+32 −1

Original line number	Diff line number	Diff line
		@@ -135,7 +135,38 @@ def gen_factors(FACTORS_tasks,


		def load_factors(shard_size=2000, featurizer=None, split=None, reload=True):
		"""Loads FACTOR dataset; does not do train/test split"""
		"""Loads FACTOR dataset; does not do train/test split

		The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:

		Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

		It contains 1500 Merck in-house compounds that were measured
		for IC50 of inhibition on 12 serine proteases. Unlike most of
		the other datasets featured in MoleculeNet, the Factors
		collection does not have structures for the compounds tested
		since they were proprietary Merck compounds. However, the
		collection does feature pre-computed descriptors for these
		compounds.

		Note that the original train/valid/test split from the source
		data was preserved here, so this function doesn't allow for
		alternate modes of splitting. Similarly, since the source data
		came pre-featurized, it is not possible to apply alternative
		featurizations.

		Parameters
		----------
		shard_size: int, optional
		Size of the DiskDataset shards to write on disk
		featurizer: optional
		Ignored since featurization pre-computed
		split: optional
		Ignored since split pre-computed
		reload: bool, optional
		Whether to automatically re-load from disk

		"""

		FACTORS_tasks = [
		'T_00001', 'T_00002', 'T_00003', 'T_00004', 'T_00005', 'T_00006',

deepchem/molnet/load_function/hiv_datasets.py

+20 −1

Original line number	Diff line number	Diff line
		@@ -17,7 +17,26 @@ def load_hiv(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load hiv datasets. Does not do train/test split"""
		"""Load hiv datasets. Does not do train/test split

		The HIV dataset was introduced by the Drug Therapeutics
		Program (DTP) AIDS Antiviral Screen, which tested the ability
		to inhibit HIV replication for over 40,000 compounds.
		Screening results were evaluated and placed into three
		categories: confirmed inactive (CI), confirmed active (CA) and
		confirmed moderately active (CM). We further combine the
		latter two labels, making it a classification task between
		inactive (CI) and active (CA and CM).

		The data file contains a csv table, in which columns below
		are used:
		- "smiles": SMILES representation of the molecular structure
		- "activity": Three-class labels for screening results: CI/CM/CA
		- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)

		References:
		AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
		"""
		# Featurize hiv dataset
		logger.info("About to featurize hiv dataset.")
		if data_dir is None:

deepchem/molnet/load_function/hopv_datasets.py

+14 −1

Original line number	Diff line number	Diff line
		@@ -17,7 +17,20 @@ def load_hopv(featurizer='ECFP',
		data_dir=None,
		save_dir=None,
		**kwargs):
		"""Load HOPV datasets. Does not do train/test split"""
		"""Load HOPV datasets. Does not do train/test split

		The HOPV datasets consist of the "Harvard Organic
		Photovoltaic Dataset". This dataset includes 350 small
		molecules and polymers that were utilized as p-type materials
		in OPVs. Experimental properties include: HOMO [a.u.], LUMO
		[a.u.], Electrochemical gap [a.u.], Optical gap [a.u.], Power
		conversion efficiency [%], Open circuit potential [V], Short
		circuit current density [mA/cm^2], and fill factor [%].
		Theoretical calculations in the original dataset have been
		removed (for now).

		Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.
		"""
		# Featurize HOPV dataset
		logger.info("About to featurize HOPV dataset.")
		if data_dir is None:

Admin message