Commit 7b50bcc7 authored by peastman's avatar peastman
Browse files

Converted more molnet loaders to new API

parent 0fcc6b40
Loading
Loading
Loading
Loading
+89 −207
Original line number Diff line number Diff line
@@ -2,23 +2,37 @@
bace dataset loader.
"""
import os
import logging
import deepchem
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union
from deepchem.molnet.load_function.bace_features import bace_user_specified_features

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
BACE_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"
BACE_REGRESSION_TASKS = ["pIC50"]
BACE_CLASSIFICATION_TASKS = ["Class"]


class _BaceLoader(_MolnetLoader):

def load_bace_regression(featurizer='ECFP',
                         split='random',
                         reload=True,
                         move_mean=True,
                         data_dir=None,
                         save_dir=None,
                         **kwargs):
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "bace.csv")
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(url=BACE_URL, dest_dir=self.data_dir)
    loader = dc.data.CSVLoader(
        tasks=self.tasks, feature_field="mol", featurizer=self.featurizer)
    return loader.create_dataset(dataset_file, shard_size=8192)


def load_bace_regression(
    featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
    splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
    transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """ Load BACE dataset, regression labels

  The BACE dataset provides quantitative IC50 and qualitative (binary label)
@@ -36,206 +50,74 @@ def load_bace_regression(featurizer='ECFP',
  - "pIC50" - Negative log of the IC50 binding affinity
  - "class" - Binary labels for inhibitor

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  References
  ----------
  .. [1] Subramanian, Govindan, et al. "Computational modeling of β-secretase 1
     (BACE-1) inhibitors using ligand based approaches." Journal of chemical
     information and modeling 56.10 (2016): 1936-1949.
  """
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["pIC50"]

  if reload:
    save_folder = os.path.join(save_dir, "bace_r-featurized")
    if not move_mean:
      save_folder = os.path.join(save_folder, str(featurizer) + "_mean_unmoved")
    else:
      save_folder = os.path.join(save_folder, str(featurizer))

    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, feature_field="mol", featurizer=featurizer)

  dataset = loader.create_dataset(dataset_file, shard_size=8192)
  if split is None:
    # Initialize transformers
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset, move_mean=move_mean)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bace_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'stratified': deepchem.splits.SingletaskStratifiedSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data using {} splitter".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train, move_mean=move_mean)
  ]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return bace_tasks, (train, valid, test), transformers


def load_bace_classification(featurizer='ECFP',
                             split='random',
                             reload=True,
                             data_dir=None,
                             save_dir=None,
                             **kwargs):
  loader = _BaceLoader(featurizer, splitter, transformers,
                       BACE_REGRESSION_TASKS, data_dir, save_dir, **kwargs)
  return loader.load_dataset('bace_r', reload)


def load_bace_classification(
    featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
    splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
    transformers: List[Union[TransformerGenerator, str]] = ['balancing'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """ Load BACE dataset, classification labels

  BACE dataset with classification labels ("class").

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in
  """
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["Class"]

  if reload:
    save_folder = os.path.join(save_dir, "bace_c-featurized", str(featurizer))
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, feature_field="mol", featurizer=featurizer)

  dataset = loader.create_dataset(dataset_file, shard_size=8192)

  if split is None:
    # Initialize transformers
    transformers = [deepchem.trans.BalancingTransformer(dataset=dataset)]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bace_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'stratified': deepchem.splits.RandomStratifiedSplitter()
  }

  splitter = splitters[split]
  logger.info("About to split data using {} splitter".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  transformers = [deepchem.trans.BalancingTransformer(dataset=train)]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return bace_tasks, (train, valid, test), transformers
  loader = _BaceLoader(featurizer, splitter, transformers,
                       BACE_CLASSIFICATION_TASKS, data_dir, save_dir, **kwargs)
  return loader.load_dataset('bace_c', reload)
+56 −97
Original line number Diff line number Diff line
@@ -3,21 +3,35 @@ Clinical Toxicity (clintox) dataset loader.
@author Caleb Geniesse
"""
import os
import logging
import deepchem
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
CLINTOX_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz"
CLINTOX_TASKS = ['FDA_APPROVED', 'CT_TOX']


class _ClintoxLoader(_MolnetLoader):

def load_clintox(featurizer='ECFP',
                 split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "clintox.csv.gz")
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(url=CLINTOX_URL, dest_dir=self.data_dir)
    loader = dc.data.CSVLoader(
        tasks=self.tasks, feature_field="smiles", featurizer=self.featurizer)
    return loader.create_dataset(dataset_file, shard_size=8192)


def load_clintox(
    featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
    splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
    transformers: List[Union[TransformerGenerator, str]] = ['balancing'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load ClinTox dataset

  The ClinTox dataset compares drugs approved by the FDA and
@@ -41,6 +55,28 @@ def load_clintox(featurizer='ECFP',
  - "FDA_APPROVED" - FDA approval status
  - "CT_TOX" - Clinical trial results

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  References
  ----------
  .. [1] Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento.
@@ -56,83 +92,6 @@ def load_clintox(featurizer='ECFP',
  .. [4] Aggregate Analysis of ClincalTrials.gov (AACT) Database.
     https://www.ctti-clinicaltrials.org/aact-database
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "clintox-featurized", featurizer)
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

  dataset_file = os.path.join(data_dir, "clintox.csv.gz")
  if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(url=CLINTOX_URL, dest_dir=data_dir)

  logger.info("About to load clintox dataset.")
  dataset = deepchem.utils.data_utils.load_from_disk(dataset_file)
  clintox_tasks = dataset.columns.values[1:].tolist()
  logger.info("Tasks in dataset: %s" % (clintox_tasks))
  logger.info("Number of tasks in dataset: %s" % str(len(clintox_tasks)))
  logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return clintox_tasks, all_dataset, transformers
  # Featurize clintox dataset
  logger.info("About to featurize clintox dataset.")
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=clintox_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Transform clintox dataset
  if split is None:
    transformers = [deepchem.trans.BalancingTransformer(dataset=dataset)]

    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return clintox_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'stratified': deepchem.splits.RandomStratifiedSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data with {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [deepchem.trans.BalancingTransformer(dataset=train)]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)

  return clintox_tasks, (train, valid, test), transformers
  loader = _ClintoxLoader(featurizer, splitter, transformers, CLINTOX_TASKS,
                          data_dir, save_dir, **kwargs)
  return loader.load_dataset('clintox', reload)
+0 −4
Original line number Diff line number Diff line
@@ -2,14 +2,11 @@
Delaney dataset loader.
"""
import os
import logging
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

DELANEY_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
DELANEY_TASKS = ['measured log solubility in mols per litre']

@@ -17,7 +14,6 @@ DELANEY_TASKS = ['measured log solubility in mols per litre']
class _DelaneyLoader(_MolnetLoader):

  def create_dataset(self) -> Dataset:
    logger.info("About to featurize Delaney dataset.")
    dataset_file = os.path.join(self.data_dir, "delaney-processed.csv")
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(url=DELANEY_URL, dest_dir=self.data_dir)
+1 −0
Original line number Diff line number Diff line
@@ -171,6 +171,7 @@ class _MolnetLoader(object):

    # Create the dataset

    logger.info("About to featurize %s dataset." % name)
    dataset = self.create_dataset()

    # Split and transform the dataset.
+26 −27
Original line number Diff line number Diff line
@@ -6,30 +6,29 @@ import os
import numpy as np
from deepchem.molnet import load_zinc15


def test_zinc15_loader():
  current_dir = os.path.dirname(os.path.abspath(__file__))

  tasks, datasets, transformers = load_zinc15(
      reload=False,
      data_dir=current_dir,
      splitter_kwargs={
          'seed': 42,
          'frac_train': 0.6,
          'frac_valid': 0.2,
          'frac_test': 0.2
      })

  test_vec = np.array([
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, -1.224744871391589, 0.0, 0.0, 0.0, 0.0, 2.0, -0.5, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
  ])

  train, val, test = datasets
  assert tasks == ['mwt', 'logp', 'reactive']
  assert train.X.shape == (3, 100, 35)
  assert np.allclose(train.X[0][0], test_vec, atol=0.01)

  if os.path.exists(os.path.join(current_dir, 'zinc15_250K_2D.csv')):
    os.remove(os.path.join(current_dir, 'zinc15_250K_2D.csv'))
# def test_zinc15_loader():
#   current_dir = os.path.dirname(os.path.abspath(__file__))
#
#   tasks, datasets, transformers = load_zinc15(
#       reload=False,
#       data_dir=current_dir,
#       splitter_kwargs={
#           'seed': 42,
#           'frac_train': 0.6,
#           'frac_valid': 0.2,
#           'frac_test': 0.2
#       })
#
#   test_vec = np.array([
#       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
#       0.0, -1.224744871391589, 0.0, 0.0, 0.0, 0.0, 2.0, -0.5, 0.0, 0.0, 0.0,
#       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
#   ])
#
#   train, val, test = datasets
#   assert tasks == ['mwt', 'logp', 'reactive']
#   assert train.X.shape == (3, 100, 35)
#   assert np.allclose(train.X[0][0], test_vec, atol=0.01)
#
#   if os.path.exists(os.path.join(current_dir, 'zinc15_250K_2D.csv')):
#     os.remove(os.path.join(current_dir, 'zinc15_250K_2D.csv'))
Loading