Unverified Commit a207b77f authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1596 from VIGS25/load-thermosol-hppb

Loading thermosol and hppb datasets
parents 5b1ae085 d79ad64e
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -31,6 +31,8 @@ from deepchem.molnet.load_function.uspto_datasets import load_uspto
from deepchem.molnet.load_function.uv_datasets import load_uv
from deepchem.molnet.load_function.factors_datasets import load_factors
from deepchem.molnet.load_function.kinase_datasets import load_kinase
from deepchem.molnet.load_function.thermosol_datasets import load_thermosol
from deepchem.molnet.load_function.hppb_datasets import load_hppb

from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting
+117 −0
Original line number Diff line number Diff line
"""
HPPB Dataset Loader.
"""

from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem
import numpy as np

logger = logging.getLogger(__name__)

HPPB_URL = "http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hppb.csv"
DEFAULT_DATA_DIR = deepchem.utils.get_data_dir()


def remove_missing_entries(dataset):
  """Remove missing entries.

  Some of the datasets have missing entries that sneak in as zero'd out
  feature vectors. Get rid of them.
  """
  for i, (X, y, w, ids) in enumerate(dataset.itershards()):
    available_rows = X.any(axis=1)
    logger.info("Shard %d has %d missing entries." %
                (i, np.count_nonzero(~available_rows)))
    X = X[available_rows]
    y = y[available_rows]
    w = w[available_rows]
    ids = ids[available_rows]
    dataset.set_shard(i, X, y, w, ids)


def load_hppb(featurizer="ECFP",
              data_dir=None,
              save_dir=None,
              split=None,
              split_seed=None,
              reload=True):
  """Loads the thermodynamic solubility datasets."""
  # Featurizer hppb dataset
  logger.info("About to featurize hppb dataset...")
  hppb_tasks = ["target"]  #Task is solubility in pH 7.4 buffer

  if data_dir is None:
    data_dir = DEFAULT_DATA_DIR
  if save_dir is None:
    save_dir = os.path.join(DEFAULT_DATA_DIR, "hppb", featurizer, str(split))

  if reload:
    if not os.path.exists(save_dir):
      logger.warning("{} does not exist. Creating one.".format(save_dir))
    else:
      logger.info("{} exists. Loading featurized datasets.".format(save_dir))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return hppb_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "hppb.csv")
  if not os.path.exists(dataset_file):
    logger.info("{} does not exist. Downloading it.".format(dataset_file))
    deepchem.utils.download_url(url=hppb_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'AdjacencyConv':
    featurizer = deepchem.feat.AdjacencyFingerprint(
        max_n_atoms=150, max_valence=6)

  logger.info("Featurizing datasets.")
  loader = deepchem.data.CSVLoader(
      tasks=hppb_tasks, smiles_field='smile', featurizer=featurizer)
  dataset = loader.featurize(input_files=[dataset_file], shard_size=2000)

  logger.info("Removing missing entries...")
  remove_missing_entries(dataset)

  if split == None:
    logger.info("About to transform the data...")
    for transformer in transformers:
      logger.info("Transforming the dataset with transformer ",
                  transformer.__class__.__name__)
      dataset = transformer.transform(dataset)
    return hppb_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter(),
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset, seed=split_seed)
  transformers = []

  logger.info("About to transform the data...")
  for transformer in transformers:
    logger.info("Transforming the data with transformer ",
                transformer.__class__.__name__)
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    logger.info("Saving file to {}.".format(save_dir))
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return hppb_tasks, (train, valid, test), transformers
+118 −0
Original line number Diff line number Diff line
"""
Thermodynamic Solubility Dataset Loader
"""

from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem
import numpy as np

logger = logging.getLogger(__name__)

THERMOSOL_URL = "http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/thermosol.csv"
DEFAULT_DATA_DIR = deepchem.utils.get_data_dir()


def remove_missing_entries(dataset):
  """Remove missing entries.

  Some of the datasets have missing entries that sneak in as zero'd out
  feature vectors. Get rid of them.
  """
  for i, (X, y, w, ids) in enumerate(dataset.itershards()):
    available_rows = X.any(axis=1)
    logger.info("Shard %d has %d missing entries." %
                (i, np.count_nonzero(~available_rows)))
    X = X[available_rows]
    y = y[available_rows]
    w = w[available_rows]
    ids = ids[available_rows]
    dataset.set_shard(i, X, y, w, ids)


def load_thermosol(featurizer="ECFP",
                   data_dir=None,
                   save_dir=None,
                   split=None,
                   split_seed=None,
                   reload=True):
  """Loads the thermodynamic solubility datasets."""
  # Featurizer thermosol dataset
  logger.info("About to featurize thermosol dataset...")
  thermosol_tasks = ["target"]  #Task is solubility in pH 7.4 buffer

  if data_dir is None:
    data_dir = DEFAULT_DATA_DIR
  if save_dir is None:
    save_dir = os.path.join(DEFAULT_DATA_DIR, "thermosol", featurizer,
                            str(split))

  if reload:
    if not os.path.exists(save_dir):
      logger.warning("{} does not exist. Creating one.".format(save_dir))
    else:
      logger.info("{} exists. Loading featurized datasets.".format(save_dir))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return thermosol_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "thermosol.csv")
  if not os.path.exists(dataset_file):
    logger.info("{} does not exist. Downloading it.".format(dataset_file))
    deepchem.utils.download_url(url=THERMOSOL_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'AdjacencyConv':
    featurizer = deepchem.feat.AdjacencyFingerprint(
        max_n_atoms=150, max_valence=6)

  logger.info("Featurizing datasets.")
  loader = deepchem.data.CSVLoader(
      tasks=thermosol_tasks, smiles_field='smile', featurizer=featurizer)
  dataset = loader.featurize(input_files=[dataset_file], shard_size=2000)

  logger.info("Removing missing entries...")
  remove_missing_entries(dataset)

  if split == None:
    logger.info("About to transform the data...")
    for transformer in transformers:
      logger.info("Transforming the dataset with transformer ",
                  transformer.__class__.__name__)
      dataset = transformer.transform(dataset)
    return thermosol_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter(),
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset, seed=split_seed)
  transformers = []

  logger.info("About to transform the data...")
  for transformer in transformers:
    logger.info("Transforming the data with transformer ",
                transformer.__class__.__name__)
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    logger.info("Saving file to {}.".format(save_dir))
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return thermosol_tasks, (train, valid, test), transformers
+6 −1
Original line number Diff line number Diff line
@@ -90,7 +90,8 @@ def run_benchmark(datasets,
        ]
    elif dataset in [
        'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo',
        'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
        'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl',
        'thermosol'
    ]:
      mode = 'regression'
      if metric == None:
@@ -123,6 +124,7 @@ def run_benchmark(datasets,
        'factors': deepchem.molnet.load_factors,
        'hiv': deepchem.molnet.load_hiv,
        'hopv': deepchem.molnet.load_hopv,
        'hppb': deepchem.molnet.load_hppb,
        'kaggle': deepchem.molnet.load_kaggle,
        'kinase': deepchem.molnet.load_kinase,
        'lipo': deepchem.molnet.load_lipo,
@@ -139,6 +141,7 @@ def run_benchmark(datasets,
        'qm9': deepchem.molnet.load_qm9,
        'sampl': deepchem.molnet.load_sampl,
        'sider': deepchem.molnet.load_sider,
        'thermosol': deepchem.molnet.load_thermosol,
        'tox21': deepchem.molnet.load_tox21,
        'toxcast': deepchem.molnet.load_toxcast,
        'uv': deepchem.molnet.load_uv,
@@ -272,6 +275,7 @@ def load_dataset(dataset, featurizer, split='random'):
      'factors': deepchem.molnet.load_factors,
      'hiv': deepchem.molnet.load_hiv,
      'hopv': deepchem.molnet.load_hopv,
      'hppb': deepchem.molnet.load_hppb,
      'kaggle': deepchem.molnet.load_kaggle,
      'kinase': deepchem.molnet.load_kinase,
      'lipo': deepchem.molnet.load_lipo,
@@ -289,6 +293,7 @@ def load_dataset(dataset, featurizer, split='random'):
      'qm9': deepchem.molnet.load_qm9,
      'sampl': deepchem.molnet.load_sampl,
      'sider': deepchem.molnet.load_sider,
      'thermosol': deepchem.molnet.load_thermosol,
      'tox21': deepchem.molnet.load_tox21,
      'toxcast': deepchem.molnet.load_toxcast,
      'uv': deepchem.molnet.load_uv