Unverified Commit 513abf9e authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1867 from deepchem/molnet_docs

Improve MoleculeNet Docstrings
parents dcd8272f 843bbe3a
Loading
Loading
Loading
Loading
+27 −1
Original line number Diff line number Diff line
@@ -18,7 +18,33 @@ def load_clintox(featurizer='ECFP',
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load clintox datasets."""
  """Load clintox datasets.

  The ClinTox dataset compares drugs approved by the FDA and
  drugs that have failed clinical trials for toxicity reasons.
  The dataset includes two classification tasks for 1491 drug
  compounds with known chemical structures: (1) clinical trial
  toxicity (or absence of toxicity) and (2) FDA approval status.
  The list of FDA-approved drugs is compiled from the SWEETLEAD
  database, and the list of drugs that failed clinical trials for
  toxicity reasons is compiled from the Aggregate Analysis of
  ClinicalTrials.gov (AACT) database.

  The data file contains a csv table, in which the columns below
  are used:
     "smiles" - SMILES representation of the molecular structure
     "FDA_APPROVED" - FDA approval status
     "CT_TOX" - Clinical trial results

  References:
  Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento. "A data-driven approach to predicting successes and failures of clinical trials." Cell chemical biology 23.10 (2016): 1294-1301.

  Artemov, Artem V., et al. "Integrated deep learned transcriptomic and structure-based predictor of clinical trials outcomes." bioRxiv (2016): 095653.

  Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PloS one 8.11 (2013): e79568.

  Aggregate Analysis of ClinicalTrials.gov (AACT) Database. https://www.ctti-clinicaltrials.org/aact-database
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
+12 −2
Original line number Diff line number Diff line
@@ -18,7 +18,16 @@ def load_delaney(featurizer='ECFP',
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load delaney datasets."""
  """Load delaney datasets.

  The Delaney datasets are extracted from the following paper:

  Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

  This dataset contains 2874 measured aqueous solubility
  values. The source dataset is available in the supplemental
  material of the original paper.
  """
  # Featurize Delaney dataset
  logger.info("About to featurize Delaney dataset.")
  if data_dir is None:
@@ -60,8 +69,9 @@ def load_delaney(featurizer='ECFP',
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    res = kwargs.get("res", 0.5)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
        img_size=img_size, img_spec=img_spec, res=res)

  loader = deepchem.data.CSVLoader(
      tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
+32 −1
Original line number Diff line number Diff line
@@ -135,7 +135,38 @@ def gen_factors(FACTORS_tasks,


def load_factors(shard_size=2000, featurizer=None, split=None, reload=True):
  """Loads FACTOR dataset; does not do train/test split"""
  """Loads FACTOR dataset; does not do train/test split

  The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:

  Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

  It contains 1500 Merck in-house compounds that were measured
  for IC50 of inhibition on 12 serine proteases. Unlike most of
  the other datasets featured in MoleculeNet, the Factors 
  collection does not have structures for the compounds tested
  since they were proprietary Merck compounds. However, the
  collection does feature pre-computed descriptors for these
  compounds.

  Note that the original train/valid/test split from the source
  data was preserved here, so this function doesn't allow for
  alternate modes of splitting. Similarly, since the source data
  came pre-featurized, it is not possible to apply alternative
  featurizations.

  Parameters
  ----------
  shard_size: int, optional
    Size of the DiskDataset shards to write on disk
  featurizer: optional
    Ignored since featurization pre-computed
  split: optional
    Ignored since split pre-computed
  reload: bool, optional
    Whether to automatically re-load from disk

  """

  FACTORS_tasks = [
      'T_00001', 'T_00002', 'T_00003', 'T_00004', 'T_00005', 'T_00006',
+20 −1
Original line number Diff line number Diff line
@@ -17,7 +17,26 @@ def load_hiv(featurizer='ECFP',
             data_dir=None,
             save_dir=None,
             **kwargs):
  """Load hiv datasets. Does not do train/test split"""
  """Load hiv datasets. Does not do train/test split

  The HIV dataset was introduced by the Drug Therapeutics
  Program (DTP) AIDS Antiviral Screen, which tested the ability
  to inhibit HIV replication for over 40,000 compounds.
  Screening results were evaluated and placed into three
  categories: confirmed inactive (CI), confirmed active (CA) and
  confirmed moderately active (CM). We further combine the
  latter two labels, making it a classification task between
  inactive (CI) and active (CA and CM).

  The data file contains a csv table, in which the columns below
  are used:
     - "smiles": SMILES representation of the molecular structure
     - "activity": Three-class labels for screening results: CI/CM/CA
     - "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)

  References:
  AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
  """
  # Featurize hiv dataset
  logger.info("About to featurize hiv dataset.")
  if data_dir is None:
+14 −1
Original line number Diff line number Diff line
@@ -17,7 +17,20 @@ def load_hopv(featurizer='ECFP',
              data_dir=None,
              save_dir=None,
              **kwargs):
  """Load HOPV datasets. Does not do train/test split"""
  """Load HOPV datasets. Does not do train/test split

  The HOPV datasets consist of the "Harvard Organic
  Photovoltaic Dataset". This dataset includes 350 small
  molecules and polymers that were utilized as p-type materials
  in OPVs. Experimental properties include: HOMO [a.u.], LUMO
  [a.u.], Electrochemical gap [a.u.], Optical gap [a.u.], Power
  conversion efficiency [%], Open circuit potential [V], Short
  circuit current density [mA/cm^2], and fill factor [%].
  Theoretical calculations in the original dataset have been
  removed (for now).

  Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.
  """
  # Featurize HOPV dataset
  logger.info("About to featurize HOPV dataset.")
  if data_dir is None:
Loading